outliertree 0.1.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +104 -105
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +10 -8
- data/vendor/outliertree/src/Makevars +0 -3
| @@ -134,6 +134,7 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 134 134 | 
             
                                     size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
         | 
| 135 135 | 
             
                                     double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
         | 
| 136 136 | 
             
            {
         | 
| 137 | 
            +
                SignalSwitcher ss = SignalSwitcher();
         | 
| 137 138 |  | 
| 138 139 | 
             
                /* put parameters and data into structs to avoid passing too many function arguments each time */
         | 
| 139 140 | 
             
                double z_tail = z_outlier - z_norm;
         | 
| @@ -188,10 +189,10 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 188 189 | 
             
                /* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
         | 
| 189 190 | 
             
                model_outputs.start_ix_cat_counts[0] = 0;
         | 
| 190 191 | 
             
                if (tot_cols > ncols_numeric) {
         | 
| 191 | 
            -
                    input_data.max_categ = calculate_category_indices( | 
| 192 | 
            -
                                                                      (bool*)  | 
| 193 | 
            -
                    input_data.max_categ = calculate_category_indices( | 
| 194 | 
            -
                                                                      (bool*)  | 
| 192 | 
            +
                    input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
         | 
| 193 | 
            +
                                                                      (bool*) input_data.skip_col.data() + ncols_numeric);
         | 
| 194 | 
            +
                    input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
         | 
| 195 | 
            +
                                                                      (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
         | 
| 195 196 | 
             
                } else {
         | 
| 196 197 | 
             
                    input_data.max_categ = 0;
         | 
| 197 198 | 
             
                }
         | 
| @@ -201,6 +202,11 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 201 202 | 
             
                model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
         | 
| 202 203 | 
             
                model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
         | 
| 203 204 |  | 
| 205 | 
            +
                check_interrupt_switch(ss);
         | 
| 206 | 
            +
                #if defined(DONT_THROW_ON_INTERRUPT)
         | 
| 207 | 
            +
                if (interrupt_switch) return false;
         | 
| 208 | 
            +
                #endif
         | 
| 209 | 
            +
             | 
| 204 210 | 
             
                /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
         | 
| 205 211 | 
             
                if (tot_cols > ncols_numeric) {
         | 
| 206 212 | 
             
                    #pragma omp parallel
         | 
| @@ -211,14 +217,14 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 211 217 | 
             
                            #pragma omp section
         | 
| 212 218 | 
             
                            {
         | 
| 213 219 | 
             
                                if (ncols_categ > 0) {
         | 
| 214 | 
            -
                                    calculate_all_cat_counts( | 
| 220 | 
            +
                                    calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
         | 
| 215 221 | 
             
                                                             input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
         | 
| 216 | 
            -
                                                             (bool*)  | 
| 222 | 
            +
                                                             (bool*) input_data.has_NA.data() + ncols_numeric, (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
         | 
| 217 223 | 
             
                                                             std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
         | 
| 218 224 |  | 
| 219 | 
            -
                                    check_cat_col_unsplittable( | 
| 225 | 
            +
                                    check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
         | 
| 220 226 | 
             
                                                               input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
         | 
| 221 | 
            -
                                                               (bool*)  | 
| 227 | 
            +
                                                               (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
         | 
| 222 228 | 
             
                                                               std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
         | 
| 223 229 | 
             
                                }
         | 
| 224 230 |  | 
| @@ -228,15 +234,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 228 234 | 
             
                            #pragma omp section
         | 
| 229 235 | 
             
                            {
         | 
| 230 236 | 
             
                                if (ncols_ord > 0) {
         | 
| 231 | 
            -
                                    calculate_all_cat_counts( | 
| 237 | 
            +
                                    calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
         | 
| 232 238 | 
             
                                                             input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
         | 
| 233 | 
            -
                                                             (bool*)  | 
| 234 | 
            -
                                                             (bool*)  | 
| 239 | 
            +
                                                             (bool*) input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
         | 
| 240 | 
            +
                                                             (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
         | 
| 235 241 | 
             
                                                             std::max((int)1, nthreads - (int)input_data.ncols_categ) );
         | 
| 236 242 |  | 
| 237 | 
            -
                                    check_cat_col_unsplittable( | 
| 243 | 
            +
                                    check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
         | 
| 238 244 | 
             
                                                               ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
         | 
| 239 | 
            -
                                                               (bool*)  | 
| 245 | 
            +
                                                               (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
         | 
| 240 246 | 
             
                                                               std::max((int)1, nthreads - (int)input_data.ncols_categ));
         | 
| 241 247 | 
             
                                }
         | 
| 242 248 | 
             
                            }
         | 
| @@ -246,15 +252,15 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 246 252 |  | 
| 247 253 |  | 
| 248 254 | 
             
                    /* calculate proportion limit and CI for each category of each column */
         | 
| 249 | 
            -
                    calculate_lowerlim_proportion( | 
| 250 | 
            -
                                                   | 
| 251 | 
            -
                    calculate_lowerlim_proportion( | 
| 252 | 
            -
                                                   | 
| 255 | 
            +
                    calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data(),
         | 
| 256 | 
            +
                                                  input_data.cat_counts.data(), input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
         | 
| 257 | 
            +
                    calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ,
         | 
| 258 | 
            +
                                                  input_data.cat_counts.data(), input_data.ncols_ord,  input_data.nrows, model_params.z_norm, model_params.z_tail);
         | 
| 253 259 | 
             
                }
         | 
| 254 260 |  | 
| 255 261 | 
             
                /* for numerical columns, check if they have NAs or if total variance is too small */
         | 
| 256 262 | 
             
                check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
         | 
| 257 | 
            -
                                          (bool*)  | 
| 263 | 
            +
                                          (bool*) input_data.has_NA.data(), (bool*) input_data.skip_col.data(),
         | 
| 258 264 | 
             
                                          model_outputs.min_decimals_col.data(), nthreads);
         | 
| 259 265 |  | 
| 260 266 | 
             
                /* determine an approximate size for the output clusters, and reserve memory right away */
         | 
| @@ -269,133 +275,159 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 269 275 | 
             
                    /* this is not exact as categoricals and ordinals can also be split multiple times */
         | 
| 270 276 | 
             
                }
         | 
| 271 277 |  | 
| 278 | 
            +
                check_interrupt_switch(ss);
         | 
| 279 | 
            +
                #if defined(DONT_THROW_ON_INTERRUPT)
         | 
| 280 | 
            +
                if (interrupt_switch) return false;
         | 
| 281 | 
            +
                #endif
         | 
| 272 282 |  | 
| 273 283 | 
             
                /* now run the procedure on each column separately */
         | 
| 274 284 | 
             
                int tid;
         | 
| 285 | 
            +
                bool threw_exception = false;
         | 
| 286 | 
            +
                std::exception_ptr ex = NULL;
         | 
| 275 287 | 
             
                nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
         | 
| 276 288 | 
             
                #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
         | 
| 277 289 | 
             
                for (size_t_for col = 0; col < tot_cols; col++) {
         | 
| 278 290 |  | 
| 291 | 
            +
                    if (interrupt_switch || threw_exception) continue;
         | 
| 292 | 
            +
             | 
| 279 293 | 
             
                    if (cols_ignore != NULL && cols_ignore[col]) continue;
         | 
| 280 294 | 
             
                    if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
         | 
| 281 295 | 
             
                    tid = omp_get_thread_num();
         | 
| 282 296 |  | 
| 283 | 
            -
                     | 
| 284 | 
            -
             | 
| 285 | 
            -
                         | 
| 286 | 
            -
             | 
| 287 | 
            -
             | 
| 288 | 
            -
             | 
| 289 | 
            -
                         | 
| 290 | 
            -
             | 
| 291 | 
            -
             | 
| 292 | 
            -
             | 
| 293 | 
            -
             | 
| 294 | 
            -
             | 
| 295 | 
            -
             | 
| 296 | 
            -
             | 
| 297 | 
            +
                    try {
         | 
| 298 | 
            +
                        /* re-use thread-private memory if possible */
         | 
| 299 | 
            +
                        if (!check_workspace_is_allocated(workspace[tid]))
         | 
| 300 | 
            +
                            allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
         | 
| 301 | 
            +
                            
         | 
| 302 | 
            +
                        /* numerical column */
         | 
| 303 | 
            +
                        if (col < input_data.ncols_numeric) {
         | 
| 304 | 
            +
                            process_numeric_col(model_outputs.all_clusters[col],
         | 
| 305 | 
            +
                                                model_outputs.all_trees[col],
         | 
| 306 | 
            +
                                                col,
         | 
| 307 | 
            +
                                                workspace[tid],
         | 
| 308 | 
            +
                                                input_data,
         | 
| 309 | 
            +
                                                model_params, model_outputs);
         | 
| 310 | 
            +
                            calculate_cluster_minimums(model_outputs, col);
         | 
| 311 | 
            +
                        }
         | 
| 297 312 |  | 
| 298 | 
            -
             | 
| 299 | 
            -
             | 
| 300 | 
            -
             | 
| 301 | 
            -
             | 
| 302 | 
            -
             | 
| 303 | 
            -
             | 
| 304 | 
            -
             | 
| 305 | 
            -
             | 
| 306 | 
            -
             | 
| 307 | 
            -
             | 
| 313 | 
            +
                        /* categorical column */
         | 
| 314 | 
            +
                        else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
         | 
| 315 | 
            +
                            process_categ_col(model_outputs.all_clusters[col],
         | 
| 316 | 
            +
                                              model_outputs.all_trees[col],
         | 
| 317 | 
            +
                                              col, false,
         | 
| 318 | 
            +
                                              workspace[tid],
         | 
| 319 | 
            +
                                              input_data,
         | 
| 320 | 
            +
                                              model_params, model_outputs);
         | 
| 321 | 
            +
                            calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
         | 
| 322 | 
            +
                        }
         | 
| 308 323 |  | 
| 309 | 
            -
             | 
| 310 | 
            -
             | 
| 311 | 
            -
             | 
| 312 | 
            -
             | 
| 313 | 
            -
             | 
| 314 | 
            -
             | 
| 315 | 
            -
             | 
| 316 | 
            -
             | 
| 317 | 
            -
             | 
| 318 | 
            -
             | 
| 324 | 
            +
                        /* ordinal column */
         | 
| 325 | 
            +
                        else {
         | 
| 326 | 
            +
                            process_categ_col(model_outputs.all_clusters[col],
         | 
| 327 | 
            +
                                              model_outputs.all_trees[col],
         | 
| 328 | 
            +
                                              col, true,
         | 
| 329 | 
            +
                                              workspace[tid],
         | 
| 330 | 
            +
                                              input_data,
         | 
| 331 | 
            +
                                              model_params, model_outputs);
         | 
| 332 | 
            +
                            calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
         | 
| 333 | 
            +
                        }
         | 
| 319 334 |  | 
| 320 | 
            -
             | 
| 321 | 
            -
             | 
| 322 | 
            -
             | 
| 323 | 
            -
             | 
| 324 | 
            -
             | 
| 325 | 
            -
             | 
| 326 | 
            -
             | 
| 327 | 
            -
             | 
| 328 | 
            -
             | 
| 329 | 
            -
             | 
| 330 | 
            -
             | 
| 331 | 
            -
             | 
| 332 | 
            -
             | 
| 333 | 
            -
             | 
| 334 | 
            -
             | 
| 335 | 
            -
             | 
| 336 | 
            -
             | 
| 337 | 
            -
             | 
| 338 | 
            -
             | 
| 335 | 
            +
                        /* shrink the dynamic vectors to what ended up used only */
         | 
| 336 | 
            +
                        #ifdef TEST_MODE_DEFINE
         | 
| 337 | 
            +
                        prune_unused_trees(model_outputs.all_trees[col]);
         | 
| 338 | 
            +
                        #endif
         | 
| 339 | 
            +
                        if (
         | 
| 340 | 
            +
                            model_outputs.all_clusters[col].size() == 0 ||
         | 
| 341 | 
            +
                            model_outputs.all_trees[col].size() == 0 ||
         | 
| 342 | 
            +
                            check_tree_is_not_needed(model_outputs.all_trees[col][0])
         | 
| 343 | 
            +
                        )
         | 
| 344 | 
            +
                        {
         | 
| 345 | 
            +
                            model_outputs.all_trees[col].clear();
         | 
| 346 | 
            +
                            model_outputs.all_clusters[col].clear();
         | 
| 347 | 
            +
                        }
         | 
| 348 | 
            +
                        model_outputs.all_trees[col].shrink_to_fit();
         | 
| 349 | 
            +
                        model_outputs.all_clusters[col].shrink_to_fit();
         | 
| 350 | 
            +
                        
         | 
| 351 | 
            +
                        /* simplify single-elements in subset to 'equals' or 'not equals' */
         | 
| 352 | 
            +
                        simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
         | 
| 353 | 
            +
                        simplify_when_equal_cond(model_outputs.all_trees[col],    ncat_ord);
         | 
| 339 354 |  | 
| 340 | 
            -
             | 
| 341 | 
            -
             | 
| 342 | 
            -
             | 
| 355 | 
            +
                        /* remember only the best (rarest) value for each row */
         | 
| 356 | 
            +
                        #pragma omp critical
         | 
| 357 | 
            +
                        if (workspace[tid].col_has_outliers) {
         | 
| 343 358 |  | 
| 344 | 
            -
             | 
| 345 | 
            -
             | 
| 359 | 
            +
                            found_outliers = true;
         | 
| 360 | 
            +
                            for (size_t row = 0; row < input_data.nrows; row++) {
         | 
| 346 361 |  | 
| 347 | 
            -
             | 
| 362 | 
            +
                                if (workspace[tid].outlier_scores[row] < 1.0) {
         | 
| 348 363 |  | 
| 349 | 
            -
             | 
| 350 | 
            -
             | 
| 351 | 
            -
                                    (
         | 
| 352 | 
            -
                                        workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
         | 
| 353 | 
            -
                                        (
         | 
| 354 | 
            -
                                            !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
         | 
| 355 | 
            -
                                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
         | 
| 356 | 
            -
                                        )
         | 
| 357 | 
            -
                                    ) ||
         | 
| 364 | 
            +
                                    if (
         | 
| 365 | 
            +
                                        model_outputs.outlier_scores_final[row] >= 1.0 ||
         | 
| 358 366 | 
             
                                        (
         | 
| 359 | 
            -
             | 
| 360 | 
            -
             | 
| 367 | 
            +
                                            workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
         | 
| 368 | 
            +
                                            (
         | 
| 369 | 
            +
                                                !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
         | 
| 370 | 
            +
                                                model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
         | 
| 371 | 
            +
                                            )
         | 
| 361 372 | 
             
                                        ) ||
         | 
| 362 | 
            -
             | 
| 363 | 
            -
             | 
| 364 | 
            -
                                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
         | 
| 365 | 
            -
             | 
| 366 | 
            -
                                             | 
| 367 | 
            -
                                            &&
         | 
| 368 | 
            -
                                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
         | 
| 369 | 
            -
                                                <
         | 
| 370 | 
            -
                                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
         | 
| 371 | 
            -
                                        ) ||
         | 
| 372 | 
            -
                                        (
         | 
| 373 | 
            +
                                            (
         | 
| 374 | 
            +
                                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
         | 
| 375 | 
            +
                                            !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
         | 
| 376 | 
            +
                                            ) ||
         | 
| 377 | 
            +
                                            (
         | 
| 373 378 | 
             
                                            workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
         | 
| 374 | 
            -
             | 
| 375 | 
            -
             | 
| 376 | 
            -
             | 
| 377 | 
            -
             | 
| 378 | 
            -
             | 
| 379 | 
            -
             | 
| 380 | 
            -
             | 
| 381 | 
            -
                                             | 
| 382 | 
            -
                                             | 
| 383 | 
            -
             | 
| 384 | 
            -
             | 
| 385 | 
            -
             | 
| 386 | 
            -
             | 
| 387 | 
            -
             | 
| 388 | 
            -
             | 
| 389 | 
            -
             | 
| 390 | 
            -
             | 
| 379 | 
            +
                                                model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
         | 
| 380 | 
            +
                                                    ==
         | 
| 381 | 
            +
                                                model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
         | 
| 382 | 
            +
                                                &&
         | 
| 383 | 
            +
                                                model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
         | 
| 384 | 
            +
                                                    <
         | 
| 385 | 
            +
                                                model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
         | 
| 386 | 
            +
                                            ) ||
         | 
| 387 | 
            +
                                            (
         | 
| 388 | 
            +
                                                workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
         | 
| 389 | 
            +
                                                model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
         | 
| 390 | 
            +
                                                    ==
         | 
| 391 | 
            +
                                                model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
         | 
| 392 | 
            +
                                                &&
         | 
| 393 | 
            +
                                                model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
         | 
| 394 | 
            +
                                                    ==
         | 
| 395 | 
            +
                                                model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
         | 
| 396 | 
            +
                                                &&
         | 
| 397 | 
            +
                                                workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
         | 
| 398 | 
            +
                                            )
         | 
| 399 | 
            +
                                    )
         | 
| 400 | 
            +
                                    {
         | 
| 401 | 
            +
                                        model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
         | 
| 402 | 
            +
                                        model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
         | 
| 403 | 
            +
                                        model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
         | 
| 404 | 
            +
                                        model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
         | 
| 405 | 
            +
                                        model_outputs.outlier_columns_final[row] = col;
         | 
| 406 | 
            +
                                    }
         | 
| 391 407 | 
             
                                }
         | 
| 408 | 
            +
             | 
| 392 409 | 
             
                            }
         | 
| 410 | 
            +
                        }
         | 
| 411 | 
            +
                    }
         | 
| 393 412 |  | 
| 413 | 
            +
                    catch(...) {
         | 
| 414 | 
            +
                        #pragma omp critical
         | 
| 415 | 
            +
                        {
         | 
| 416 | 
            +
                            if (!threw_exception) {
         | 
| 417 | 
            +
                                threw_exception = true;
         | 
| 418 | 
            +
                                ex = std::current_exception();
         | 
| 419 | 
            +
                            }
         | 
| 394 420 | 
             
                        }
         | 
| 395 421 | 
             
                    }
         | 
| 422 | 
            +
                }
         | 
| 396 423 |  | 
| 424 | 
            +
                check_interrupt_switch(ss);
         | 
| 425 | 
            +
                #if defined(DONT_THROW_ON_INTERRUPT)
         | 
| 426 | 
            +
                if (interrupt_switch) return false;
         | 
| 427 | 
            +
                #endif
         | 
| 397 428 |  | 
| 398 | 
            -
                 | 
| 429 | 
            +
                if (threw_exception)
         | 
| 430 | 
            +
                    std::rethrow_exception(ex);
         | 
| 399 431 |  | 
| 400 432 | 
             
                /* once finished, determine how many decimals to report for numerical outliers */
         | 
| 401 433 | 
             
                if (found_outliers)
         | 
| @@ -443,21 +475,23 @@ void process_numeric_col(std::vector<Cluster> &cluster_root, | |
| 443 475 | 
             
                                     ModelParams &model_params,
         | 
| 444 476 | 
             
                                     ModelOutputs &model_outputs)
         | 
| 445 477 | 
             
            {
         | 
| 478 | 
            +
                if (interrupt_switch) return;
         | 
| 479 | 
            +
             | 
| 446 480 | 
             
                /* discard NAs and infinites */
         | 
| 447 481 | 
             
                workspace.target_col_num = target_col_num;
         | 
| 448 482 | 
             
                workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
         | 
| 449 483 | 
             
                workspace.orig_target_col = workspace.target_numeric_col;
         | 
| 450 484 | 
             
                workspace.end = input_data.nrows - 1;
         | 
| 451 | 
            -
                workspace.st = move_NAs_to_front( | 
| 485 | 
            +
                workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_numeric_col, 0, workspace.end, true);
         | 
| 452 486 | 
             
                workspace.col_has_outliers = false;
         | 
| 453 487 |  | 
| 454 488 | 
             
                /* check for problematic distributions - need to sort data first */
         | 
| 455 | 
            -
                std::sort( | 
| 489 | 
            +
                std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
         | 
| 456 490 | 
             
                          [&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
         | 
| 457 491 |  | 
| 458 492 | 
             
                long double running_mean = 0;
         | 
| 459 | 
            -
                long double mean_prev    = 0;
         | 
| 460 493 | 
             
                long double running_ssq  = 0;
         | 
| 494 | 
            +
                long double mean_prev    = workspace.target_numeric_col[workspace.ix_arr[workspace.st]];
         | 
| 461 495 | 
             
                double xval;
         | 
| 462 496 | 
             
                for (size_t row = workspace.st; row <= workspace.end; row++) {
         | 
| 463 497 | 
             
                    xval = workspace.target_numeric_col[workspace.ix_arr[row]];
         | 
| @@ -466,10 +500,10 @@ void process_numeric_col(std::vector<Cluster> &cluster_root, | |
| 466 500 | 
             
                    mean_prev     = running_mean;
         | 
| 467 501 | 
             
                }
         | 
| 468 502 |  | 
| 469 | 
            -
                check_for_tails( | 
| 503 | 
            +
                check_for_tails(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.target_numeric_col,
         | 
| 470 504 | 
             
                                model_params.z_norm, model_params.max_perc_outliers,
         | 
| 471 | 
            -
                                 | 
| 472 | 
            -
                                (double) | 
| 505 | 
            +
                                workspace.buffer_transf_y.data(), (double)running_mean,
         | 
| 506 | 
            +
                                (double)std::sqrt(running_ssq / (long double)(workspace.end - workspace.st)),
         | 
| 473 507 | 
             
                                &workspace.left_tail, &workspace.right_tail,
         | 
| 474 508 | 
             
                                &workspace.exp_transf, &workspace.log_transf);
         | 
| 475 509 |  | 
| @@ -480,11 +514,11 @@ void process_numeric_col(std::vector<Cluster> &cluster_root, | |
| 480 514 | 
             
                if (workspace.exp_transf) {
         | 
| 481 515 |  | 
| 482 516 | 
             
                    workspace.orig_mean = (double) running_mean;
         | 
| 483 | 
            -
                    workspace.orig_sd   = (double)  | 
| 517 | 
            +
                    workspace.orig_sd   = (double) std::sqrt(running_ssq / (long double)(workspace.end - workspace.st));
         | 
| 484 518 | 
             
                    for (size_t row = workspace.st; row <= workspace.end; row++) {
         | 
| 485 519 | 
             
                        workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
         | 
| 486 520 | 
             
                    }
         | 
| 487 | 
            -
                    workspace.target_numeric_col =  | 
| 521 | 
            +
                    workspace.target_numeric_col = workspace.buffer_transf_y.data();
         | 
| 488 522 | 
             
                    model_outputs.col_transf[workspace.target_col_num] = Exp;
         | 
| 489 523 | 
             
                    model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
         | 
| 490 524 | 
             
                    model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
         | 
| @@ -501,7 +535,7 @@ void process_numeric_col(std::vector<Cluster> &cluster_root, | |
| 501 535 | 
             
                    for (size_t row = workspace.st; row <= workspace.end; row++) {
         | 
| 502 536 | 
             
                        workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
         | 
| 503 537 | 
             
                    }
         | 
| 504 | 
            -
                    workspace.target_numeric_col =  | 
| 538 | 
            +
                    workspace.target_numeric_col = workspace.buffer_transf_y.data();
         | 
| 505 539 | 
             
                    model_outputs.col_transf[workspace.target_col_num] = Log;
         | 
| 506 540 | 
             
                    model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
         | 
| 507 541 |  | 
| @@ -514,9 +548,9 @@ void process_numeric_col(std::vector<Cluster> &cluster_root, | |
| 514 548 | 
             
                workspace.tree->emplace_back(0, Root);
         | 
| 515 549 |  | 
| 516 550 | 
             
                workspace.clusters->emplace_back(NoType, Root);
         | 
| 517 | 
            -
                workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 518 | 
            -
                                                                      workspace.end,  | 
| 519 | 
            -
                                                                       | 
| 551 | 
            +
                workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
         | 
| 552 | 
            +
                                                                      workspace.end, workspace.outlier_scores.data(),
         | 
| 553 | 
            +
                                                                      workspace.outlier_clusters.data(), workspace.outlier_trees.data(), workspace.outlier_depth.data(),
         | 
| 520 554 | 
             
                                                                      workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
         | 
| 521 555 | 
             
                                                                      workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 522 556 | 
             
                                                                      workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -526,16 +560,19 @@ void process_numeric_col(std::vector<Cluster> &cluster_root, | |
| 526 560 |  | 
| 527 561 | 
             
                /* remove outliers if any were found */
         | 
| 528 562 | 
             
                if (workspace.has_outliers)
         | 
| 529 | 
            -
                    workspace.st = move_outliers_to_front( | 
| 563 | 
            +
                    workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
         | 
| 530 564 |  | 
| 531 565 | 
             
                /* update statistics if they've changed */
         | 
| 532 566 | 
             
                if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
         | 
| 533 | 
            -
                    workspace.sd_y = calc_sd( | 
| 567 | 
            +
                    workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
         | 
| 534 568 | 
             
                                             workspace.st, workspace.end, &workspace.mean_y);
         | 
| 535 569 | 
             
                else
         | 
| 536 570 | 
             
                    workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
         | 
| 537 571 |  | 
| 538 | 
            -
                if (model_params.max_depth > 0 | 
| 572 | 
            +
                if (model_params.max_depth > 0 && workspace.sd_y > 0) {
         | 
| 573 | 
            +
                    workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
         | 
| 574 | 
            +
                    recursive_split_numeric(workspace, input_data, model_params, 0, false);
         | 
| 575 | 
            +
                }
         | 
| 539 576 | 
             
            }
         | 
| 540 577 |  | 
| 541 578 | 
             
            void recursive_split_numeric(Workspace &workspace,
         | 
| @@ -543,19 +580,27 @@ void recursive_split_numeric(Workspace &workspace, | |
| 543 580 | 
             
                                         ModelParams &model_params,
         | 
| 544 581 | 
             
                                         size_t curr_depth, bool is_NA_branch)
         | 
| 545 582 | 
             
            {
         | 
| 583 | 
            +
                if (interrupt_switch) return;
         | 
| 584 | 
            +
             | 
| 546 585 | 
             
                workspace.best_gain = -HUGE_VAL;
         | 
| 547 586 | 
             
                workspace.column_type_best = NoType;
         | 
| 548 587 | 
             
                workspace.lev_has_outliers = false;
         | 
| 549 | 
            -
                if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
         | 
| 550 | 
            -
                                                             workspace.st, workspace.end, &workspace.mean_y);
         | 
| 551 588 |  | 
| 552 589 | 
             
                /* these are used to keep track of where to continue after calling a further recursion */
         | 
| 553 590 | 
             
                size_t ix1, ix2, ix3;
         | 
| 554 591 | 
             
                SplitType spl1, spl2;
         | 
| 555 592 | 
             
                size_t tree_from = workspace.tree->size() - 1;
         | 
| 593 | 
            +
                std::unique_ptr<RecursionState> state_backup;
         | 
| 594 | 
            +
                ExhaustedColumnsLevel level_col_tracker;
         | 
| 595 | 
            +
             | 
| 596 | 
            +
                if (curr_depth > 0) {
         | 
| 597 | 
            +
                    workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
         | 
| 598 | 
            +
                                             workspace.st, workspace.end, &workspace.mean_y);
         | 
| 599 | 
            +
                    if (workspace.sd_y <= 0) goto abandon_column;
         | 
| 600 | 
            +
                }
         | 
| 601 | 
            +
                level_col_tracker.initialize(&workspace.exhausted_col_tracker);
         | 
| 556 602 |  | 
| 557 603 | 
             
                /* when using 'follow_all' need to keep track of a lot more things */
         | 
| 558 | 
            -
                std::unique_ptr<RecursionState> state_backup;
         | 
| 559 604 | 
             
                if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
         | 
| 560 605 |  | 
| 561 606 |  | 
| @@ -566,10 +611,16 @@ void recursive_split_numeric(Workspace &workspace, | |
| 566 611 |  | 
| 567 612 | 
             
                    if (col == workspace.target_col_num) continue;
         | 
| 568 613 | 
             
                    if (input_data.skip_col[col]) continue;
         | 
| 569 | 
            -
                     | 
| 614 | 
            +
                    if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
         | 
| 615 | 
            +
                    split_numericx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
         | 
| 570 616 | 
             
                                            workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
         | 
| 571 | 
            -
                                            model_params.take_mid,  | 
| 572 | 
            -
                                            &(workspace.this_split_ix), &(workspace.this_split_NA));
         | 
| 617 | 
            +
                                            model_params.take_mid, workspace.buffer_sd.data(), &(workspace.this_gain), &(workspace.this_split_point),
         | 
| 618 | 
            +
                                            &(workspace.this_split_ix), &(workspace.this_split_NA), &workspace.has_zero_variance);
         | 
| 619 | 
            +
                    if (workspace.has_zero_variance) {
         | 
| 620 | 
            +
                        workspace.has_zero_variance = false;
         | 
| 621 | 
            +
                        workspace.exhausted_col_tracker.push_col(col);
         | 
| 622 | 
            +
                        continue;
         | 
| 623 | 
            +
                    }
         | 
| 573 624 | 
             
                    if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
         | 
| 574 625 |  | 
| 575 626 | 
             
                    /* if the gain is not insignificant, check clusters created by this split */
         | 
| @@ -581,9 +632,9 @@ void recursive_split_numeric(Workspace &workspace, | |
| 581 632 |  | 
| 582 633 | 
             
                            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 583 634 | 
             
                            workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
         | 
| 584 | 
            -
                            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 585 | 
            -
                                                                              workspace.this_split_NA - 1,  | 
| 586 | 
            -
                                                                               | 
| 635 | 
            +
                            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
         | 
| 636 | 
            +
                                                                              workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 637 | 
            +
                                                                              workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 587 638 | 
             
                                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 588 639 | 
             
                                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 589 640 | 
             
                                                                              workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -596,6 +647,9 @@ void recursive_split_numeric(Workspace &workspace, | |
| 596 647 | 
             
                                workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
         | 
| 597 648 | 
             
                                backup_recursion_state(workspace, *state_backup);
         | 
| 598 649 | 
             
                                workspace.end = workspace.this_split_NA - 1;
         | 
| 650 | 
            +
                                ExhaustedColumnsLevel level_col_tracker2;
         | 
| 651 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 652 | 
            +
                                workspace.exhausted_col_tracker.push_col(col);
         | 
| 599 653 | 
             
                                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
         | 
| 600 654 | 
             
                                restore_recursion_state(workspace, *state_backup);
         | 
| 601 655 | 
             
                            }
         | 
| @@ -605,9 +659,9 @@ void recursive_split_numeric(Workspace &workspace, | |
| 605 659 | 
             
                        /* left branch */
         | 
| 606 660 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 607 661 | 
             
                        workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
         | 
| 608 | 
            -
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 609 | 
            -
                                                                          workspace.this_split_ix,  | 
| 610 | 
            -
                                                                           | 
| 662 | 
            +
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
         | 
| 663 | 
            +
                                                                          workspace.this_split_ix, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 664 | 
            +
                                                                          workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 611 665 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 612 666 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 613 667 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -629,9 +683,9 @@ void recursive_split_numeric(Workspace &workspace, | |
| 629 683 | 
             
                        /* right branch */
         | 
| 630 684 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 631 685 | 
             
                        workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
         | 
| 632 | 
            -
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 633 | 
            -
                                                                          workspace.end,  | 
| 634 | 
            -
                                                                           | 
| 686 | 
            +
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix + 1,
         | 
| 687 | 
            +
                                                                          workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 688 | 
            +
                                                                          workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 635 689 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 636 690 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 637 691 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -665,29 +719,36 @@ void recursive_split_numeric(Workspace &workspace, | |
| 665 719 | 
             
                for (size_t col = 0; col < input_data.ncols_categ; col++) {
         | 
| 666 720 |  | 
| 667 721 | 
             
                    if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
         | 
| 722 | 
            +
                    if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
         | 
| 668 723 |  | 
| 669 | 
            -
                    split_categx_numericy( | 
| 670 | 
            -
                                          workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col],  | 
| 671 | 
            -
                                           | 
| 724 | 
            +
                    split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
         | 
| 725 | 
            +
                                          workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], workspace.buffer_cat_cnt.data(),
         | 
| 726 | 
            +
                                          workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
         | 
| 672 727 | 
             
                                          (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
         | 
| 673 | 
            -
                                          &(workspace.this_gain),  | 
| 728 | 
            +
                                          &(workspace.this_gain), workspace.buffer_subset_categ.data(), NULL,
         | 
| 729 | 
            +
                                          &workspace.has_zero_variance, &workspace.is_binary_split);
         | 
| 730 | 
            +
                    if (workspace.has_zero_variance) {
         | 
| 731 | 
            +
                        workspace.has_zero_variance = false;
         | 
| 732 | 
            +
                        workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 733 | 
            +
                        continue;
         | 
| 734 | 
            +
                    }
         | 
| 674 735 | 
             
                    if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
         | 
| 675 736 |  | 
| 676 737 | 
             
                    if (workspace.this_gain >= model_params.min_gain) {
         | 
| 677 738 |  | 
| 678 739 | 
             
                        /* data is not arranged inside the splitting function, need to now assign to the branches as determined */
         | 
| 679 | 
            -
                        divide_subset_split( | 
| 680 | 
            -
                                             | 
| 740 | 
            +
                        divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
         | 
| 741 | 
            +
                                            workspace.buffer_subset_categ.data(), input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
         | 
| 681 742 | 
             
                                            &(workspace.this_split_NA), &(workspace.this_split_ix));
         | 
| 682 743 |  | 
| 683 744 | 
             
                        /* NA branch */
         | 
| 684 745 | 
             
                        if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
         | 
| 685 746 |  | 
| 686 747 | 
             
                            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 687 | 
            -
                            workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
         | 
| 688 | 
            -
                            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 689 | 
            -
                                                                              workspace.this_split_NA - 1,  | 
| 690 | 
            -
                                                                               | 
| 748 | 
            +
                            workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
         | 
| 749 | 
            +
                            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
         | 
| 750 | 
            +
                                                                              workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 751 | 
            +
                                                                              workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 691 752 | 
             
                                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 692 753 | 
             
                                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 693 754 | 
             
                                                                              workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -697,9 +758,12 @@ void recursive_split_numeric(Workspace &workspace, | |
| 697 758 |  | 
| 698 759 | 
             
                            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| 699 760 | 
             
                                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
         | 
| 700 | 
            -
                                workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
         | 
| 761 | 
            +
                                workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
         | 
| 701 762 | 
             
                                backup_recursion_state(workspace, *state_backup);
         | 
| 702 763 | 
             
                                workspace.end = workspace.this_split_NA - 1;
         | 
| 764 | 
            +
                                ExhaustedColumnsLevel level_col_tracker2;
         | 
| 765 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 766 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 703 767 | 
             
                                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
         | 
| 704 768 | 
             
                                restore_recursion_state(workspace, *state_backup);
         | 
| 705 769 | 
             
                            }
         | 
| @@ -708,10 +772,10 @@ void recursive_split_numeric(Workspace &workspace, | |
| 708 772 |  | 
| 709 773 | 
             
                        /* left branch */
         | 
| 710 774 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 711 | 
            -
                        workspace.clusters->emplace_back(Categorical, col, InSubset,  | 
| 712 | 
            -
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 713 | 
            -
                                                                          workspace.this_split_ix - 1,  | 
| 714 | 
            -
                                                                           | 
| 775 | 
            +
                        workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
         | 
| 776 | 
            +
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
         | 
| 777 | 
            +
                                                                          workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 778 | 
            +
                                                                          workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 715 779 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 716 780 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 717 781 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -721,20 +785,25 @@ void recursive_split_numeric(Workspace &workspace, | |
| 721 785 |  | 
| 722 786 | 
             
                        if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| 723 787 | 
             
                            (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
         | 
| 724 | 
            -
                            workspace.tree->emplace_back(tree_from, col, InSubset,  | 
| 788 | 
            +
                            workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
         | 
| 725 789 | 
             
                            backup_recursion_state(workspace, *state_backup);
         | 
| 726 790 | 
             
                            workspace.st = workspace.this_split_NA;
         | 
| 727 791 | 
             
                            workspace.end = workspace.this_split_ix - 1;
         | 
| 792 | 
            +
                            ExhaustedColumnsLevel level_col_tracker2;
         | 
| 793 | 
            +
                            if (workspace.is_binary_split) {
         | 
| 794 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 795 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 796 | 
            +
                            }
         | 
| 728 797 | 
             
                            recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 729 798 | 
             
                            restore_recursion_state(workspace, *state_backup);
         | 
| 730 799 | 
             
                        }
         | 
| 731 800 |  | 
| 732 801 | 
             
                        /* right branch */
         | 
| 733 802 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 734 | 
            -
                        workspace.clusters->emplace_back(Categorical, col, NotInSubset,  | 
| 735 | 
            -
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 736 | 
            -
                                                                          workspace.end,  | 
| 737 | 
            -
                                                                           | 
| 803 | 
            +
                        workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
         | 
| 804 | 
            +
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
         | 
| 805 | 
            +
                                                                          workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 806 | 
            +
                                                                          workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 738 807 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 739 808 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 740 809 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -744,9 +813,14 @@ void recursive_split_numeric(Workspace &workspace, | |
| 744 813 |  | 
| 745 814 | 
             
                        if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| 746 815 | 
             
                            (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
         | 
| 747 | 
            -
                            workspace.tree->emplace_back(tree_from, col, NotInSubset,  | 
| 816 | 
            +
                            workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
         | 
| 748 817 | 
             
                            backup_recursion_state(workspace, *state_backup);
         | 
| 749 818 | 
             
                            workspace.st = workspace.this_split_ix;
         | 
| 819 | 
            +
                            ExhaustedColumnsLevel level_col_tracker2;
         | 
| 820 | 
            +
                            if (workspace.is_binary_split) {
         | 
| 821 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 822 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 823 | 
            +
                            }
         | 
| 750 824 | 
             
                            recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 751 825 | 
             
                            restore_recursion_state(workspace, *state_backup);
         | 
| 752 826 | 
             
                        }
         | 
| @@ -755,7 +829,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 755 829 | 
             
                            workspace.best_gain = workspace.this_gain;
         | 
| 756 830 | 
             
                            workspace.column_type_best = Categorical;
         | 
| 757 831 | 
             
                            workspace.col_best = col;
         | 
| 758 | 
            -
                             | 
| 832 | 
            +
                            workspace.best_cat_split_is_binary = workspace.is_binary_split;
         | 
| 833 | 
            +
                            memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
         | 
| 759 834 | 
             
                        }
         | 
| 760 835 |  | 
| 761 836 | 
             
                    }
         | 
| @@ -766,18 +841,25 @@ void recursive_split_numeric(Workspace &workspace, | |
| 766 841 | 
             
                for (size_t col = 0; col < input_data.ncols_ord; col++) {
         | 
| 767 842 |  | 
| 768 843 | 
             
                    if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
         | 
| 844 | 
            +
                    if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
         | 
| 769 845 |  | 
| 770 846 | 
             
                    /* same code as for categorical, but this time with split level as int instead of boolean array as subset */
         | 
| 771 | 
            -
                    split_categx_numericy( | 
| 772 | 
            -
                                          workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col],  | 
| 773 | 
            -
                                           | 
| 847 | 
            +
                    split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
         | 
| 848 | 
            +
                                          workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], workspace.buffer_cat_cnt.data(),
         | 
| 849 | 
            +
                                          workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
         | 
| 774 850 | 
             
                                          (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
         | 
| 775 | 
            -
                                          &(workspace.this_gain),  | 
| 851 | 
            +
                                          &(workspace.this_gain), workspace.buffer_subset_categ.data(), &(workspace.this_split_lev),
         | 
| 852 | 
            +
                                          &workspace.has_zero_variance, &workspace.is_binary_split);
         | 
| 853 | 
            +
                    if (workspace.has_zero_variance) {
         | 
| 854 | 
            +
                        workspace.has_zero_variance = false;
         | 
| 855 | 
            +
                        workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
         | 
| 856 | 
            +
                        continue;
         | 
| 857 | 
            +
                    }
         | 
| 776 858 | 
             
                    if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
         | 
| 777 859 |  | 
| 778 860 | 
             
                    if (workspace.this_gain >= model_params.min_gain) {
         | 
| 779 861 |  | 
| 780 | 
            -
                        divide_subset_split( | 
| 862 | 
            +
                        divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
         | 
| 781 863 | 
             
                                            workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
         | 
| 782 864 | 
             
                                            &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| 783 865 |  | 
| @@ -785,9 +867,9 @@ void recursive_split_numeric(Workspace &workspace, | |
| 785 867 |  | 
| 786 868 | 
             
                            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 787 869 | 
             
                            workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
         | 
| 788 | 
            -
                            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 789 | 
            -
                                                                              workspace.this_split_NA - 1,  | 
| 790 | 
            -
                                                                               | 
| 870 | 
            +
                            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
         | 
| 871 | 
            +
                                                                              workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 872 | 
            +
                                                                              workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 791 873 | 
             
                                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 792 874 | 
             
                                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 793 875 | 
             
                                                                              workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -800,6 +882,9 @@ void recursive_split_numeric(Workspace &workspace, | |
| 800 882 | 
             
                                workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
         | 
| 801 883 | 
             
                                backup_recursion_state(workspace, *state_backup);
         | 
| 802 884 | 
             
                                workspace.end = workspace.this_split_NA - 1;
         | 
| 885 | 
            +
                                ExhaustedColumnsLevel level_col_tracker2;
         | 
| 886 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 887 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
         | 
| 803 888 | 
             
                                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
         | 
| 804 889 | 
             
                                restore_recursion_state(workspace, *state_backup);
         | 
| 805 890 | 
             
                            }
         | 
| @@ -809,9 +894,9 @@ void recursive_split_numeric(Workspace &workspace, | |
| 809 894 | 
             
                        /* left branch */
         | 
| 810 895 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 811 896 | 
             
                        workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
         | 
| 812 | 
            -
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 813 | 
            -
                                                                          workspace.this_split_ix - 1,  | 
| 814 | 
            -
                                                                           | 
| 897 | 
            +
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
         | 
| 898 | 
            +
                                                                          workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 899 | 
            +
                                                                          workspace.outlier_trees.data(), &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
         | 
| 815 900 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 816 901 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 817 902 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -825,6 +910,11 @@ void recursive_split_numeric(Workspace &workspace, | |
| 825 910 | 
             
                            backup_recursion_state(workspace, *state_backup);
         | 
| 826 911 | 
             
                            workspace.st = workspace.this_split_NA;
         | 
| 827 912 | 
             
                            workspace.end = workspace.this_split_ix - 1;
         | 
| 913 | 
            +
                            ExhaustedColumnsLevel level_col_tracker2;
         | 
| 914 | 
            +
                            if (workspace.is_binary_split) {
         | 
| 915 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 916 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
         | 
| 917 | 
            +
                            }
         | 
| 828 918 | 
             
                            recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 829 919 | 
             
                            restore_recursion_state(workspace, *state_backup);
         | 
| 830 920 | 
             
                        }
         | 
| @@ -834,9 +924,9 @@ void recursive_split_numeric(Workspace &workspace, | |
| 834 924 | 
             
                        /* right branch */
         | 
| 835 925 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 836 926 | 
             
                        workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
         | 
| 837 | 
            -
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,  | 
| 838 | 
            -
                                                                          workspace.end,  | 
| 839 | 
            -
                                                                           | 
| 927 | 
            +
                        workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
         | 
| 928 | 
            +
                                                                          workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
         | 
| 929 | 
            +
                                                                          workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 840 930 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 841 931 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 842 932 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| @@ -849,6 +939,11 @@ void recursive_split_numeric(Workspace &workspace, | |
| 849 939 | 
             
                            workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
         | 
| 850 940 | 
             
                            backup_recursion_state(workspace, *state_backup);
         | 
| 851 941 | 
             
                            workspace.st = workspace.this_split_ix;
         | 
| 942 | 
            +
                            ExhaustedColumnsLevel level_col_tracker2;
         | 
| 943 | 
            +
                            if (workspace.is_binary_split) {
         | 
| 944 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 945 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
         | 
| 946 | 
            +
                            }
         | 
| 852 947 | 
             
                            recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 853 948 | 
             
                            restore_recursion_state(workspace, *state_backup);
         | 
| 854 949 | 
             
                        }
         | 
| @@ -858,6 +953,7 @@ void recursive_split_numeric(Workspace &workspace, | |
| 858 953 | 
             
                            workspace.column_type_best = Ordinal;
         | 
| 859 954 | 
             
                            workspace.col_best = col;
         | 
| 860 955 | 
             
                            workspace.split_lev_best = workspace.this_split_lev;
         | 
| 956 | 
            +
                            workspace.best_cat_split_is_binary = workspace.is_binary_split;
         | 
| 861 957 | 
             
                        }
         | 
| 862 958 |  | 
| 863 959 | 
             
                    }
         | 
| @@ -867,7 +963,7 @@ void recursive_split_numeric(Workspace &workspace, | |
| 867 963 | 
             
                /* avoid unnecessary memory usage */
         | 
| 868 964 | 
             
                workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
         | 
| 869 965 | 
             
                (*workspace.tree)[tree_from].clusters.shrink_to_fit();
         | 
| 870 | 
            -
                if ((*workspace.tree)[tree_from].all_branches. | 
| 966 | 
            +
                if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
         | 
| 871 967 |  | 
| 872 968 |  | 
| 873 969 | 
             
                /* continue splitting further if meeting threshold criteria */
         | 
| @@ -879,13 +975,13 @@ void recursive_split_numeric(Workspace &workspace, | |
| 879 975 |  | 
| 880 976 | 
             
                    /* discard outliers if any */
         | 
| 881 977 | 
             
                    if (workspace.lev_has_outliers)
         | 
| 882 | 
            -
                        workspace.st = move_outliers_to_front( | 
| 978 | 
            +
                        workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
         | 
| 883 979 |  | 
| 884 980 | 
             
                    /* assign rows to their corresponding branch */
         | 
| 885 981 | 
             
                    switch(workspace.column_type_best) {
         | 
| 886 982 | 
             
                        case Numeric:
         | 
| 887 983 | 
             
                        {
         | 
| 888 | 
            -
                            divide_subset_split( | 
| 984 | 
            +
                            divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
         | 
| 889 985 | 
             
                                                workspace.st, workspace.end, workspace.split_point_best,
         | 
| 890 986 | 
             
                                                (bool)(input_data.has_NA[workspace.col_best]),
         | 
| 891 987 | 
             
                                                &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| @@ -896,28 +992,35 @@ void recursive_split_numeric(Workspace &workspace, | |
| 896 992 |  | 
| 897 993 | 
             
                        case Categorical:
         | 
| 898 994 | 
             
                        {
         | 
| 899 | 
            -
                            divide_subset_split( | 
| 900 | 
            -
                                                workspace.st, workspace.end,  | 
| 995 | 
            +
                            divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
         | 
| 996 | 
            +
                                                workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
         | 
| 901 997 | 
             
                                                (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
         | 
| 902 998 | 
             
                                                &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| 903 999 | 
             
                            spl1 = InSubset; spl2 = NotInSubset;
         | 
| 904 1000 | 
             
                            set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
         | 
| 905 | 
            -
                                                     | 
| 1001 | 
            +
                                                    workspace.buffer_subset_categ_best.data(), workspace.col_best);
         | 
| 1002 | 
            +
                            if (input_data.ncat[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
         | 
| 1003 | 
            +
                                workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
         | 
| 906 1004 | 
             
                            break;
         | 
| 907 1005 | 
             
                        }
         | 
| 908 1006 |  | 
| 909 1007 | 
             
                        case Ordinal:
         | 
| 910 1008 | 
             
                        {
         | 
| 911 | 
            -
                            divide_subset_split( | 
| 1009 | 
            +
                            divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
         | 
| 912 1010 | 
             
                                                workspace.st, workspace.end, workspace.split_lev_best,
         | 
| 913 1011 | 
             
                                                (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
         | 
| 914 1012 | 
             
                                                &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| 915 1013 | 
             
                            spl1 = LessOrEqual; spl2 = Greater;
         | 
| 916 1014 | 
             
                            set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
         | 
| 1015 | 
            +
                            if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
         | 
| 1016 | 
            +
                                workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
         | 
| 917 1017 | 
             
                            break;
         | 
| 918 1018 | 
             
                        }
         | 
| 919 1019 |  | 
| 920 | 
            -
             | 
| 1020 | 
            +
                        default:
         | 
| 1021 | 
            +
                        {
         | 
| 1022 | 
            +
                            unexpected_error();
         | 
| 1023 | 
            +
                        }
         | 
| 921 1024 | 
             
                    }
         | 
| 922 1025 |  | 
| 923 1026 | 
             
                    /* continue splitting recursively - need to remember from where */
         | 
| @@ -956,11 +1059,12 @@ void recursive_split_numeric(Workspace &workspace, | |
| 956 1059 | 
             
                }
         | 
| 957 1060 |  | 
| 958 1061 | 
             
                /* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
         | 
| 1062 | 
            +
                abandon_column:
         | 
| 959 1063 | 
             
                if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
         | 
| 960 1064 |  | 
| 961 1065 | 
             
                    if (tree_from == 0) {
         | 
| 962 1066 | 
             
                        workspace.tree->clear();
         | 
| 963 | 
            -
                    } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches. | 
| 1067 | 
            +
                    } else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
         | 
| 964 1068 | 
             
                        (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
         | 
| 965 1069 | 
             
                        workspace.tree->pop_back();
         | 
| 966 1070 | 
             
                    } else {
         | 
| @@ -995,6 +1099,11 @@ void recursive_split_numeric(Workspace &workspace, | |
| 995 1099 | 
             
                                (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
         | 
| 996 1100 | 
             
                                break;
         | 
| 997 1101 | 
             
                            }
         | 
| 1102 | 
            +
             | 
| 1103 | 
            +
                            default:
         | 
| 1104 | 
            +
                            {
         | 
| 1105 | 
            +
                                unexpected_error();
         | 
| 1106 | 
            +
                            }
         | 
| 998 1107 | 
             
                        }
         | 
| 999 1108 | 
             
                        workspace.tree->pop_back();
         | 
| 1000 1109 | 
             
                    }
         | 
| @@ -1010,6 +1119,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root, | |
| 1010 1119 | 
             
                                   ModelParams &model_params,
         | 
| 1011 1120 | 
             
                                   ModelOutputs &model_outputs)
         | 
| 1012 1121 | 
             
            {
         | 
| 1122 | 
            +
                if (interrupt_switch) return;
         | 
| 1123 | 
            +
             | 
| 1013 1124 | 
             
                if (model_params.max_depth <= 0) return;
         | 
| 1014 1125 |  | 
| 1015 1126 | 
             
                /* extract necesary info from column and discard NAs */
         | 
| @@ -1024,10 +1135,10 @@ void process_categ_col(std::vector<Cluster> &cluster_root, | |
| 1024 1135 | 
             
                }
         | 
| 1025 1136 | 
             
                workspace.untransf_target_col = workspace.target_categ_col;
         | 
| 1026 1137 | 
             
                workspace.end = input_data.nrows - 1;
         | 
| 1027 | 
            -
                workspace.st = move_NAs_to_front( | 
| 1138 | 
            +
                workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_categ_col, 0, workspace.end);
         | 
| 1028 1139 | 
             
                workspace.col_has_outliers = false;
         | 
| 1029 1140 | 
             
                workspace.col_is_bin = workspace.ncat_this <= 2;
         | 
| 1030 | 
            -
                workspace.prop_small_this =  | 
| 1141 | 
            +
                workspace.prop_small_this = model_params.prop_small.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num];
         | 
| 1031 1142 | 
             
                workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
         | 
| 1032 1143 |  | 
| 1033 1144 | 
             
                /* create cluster root and reset outlier scores for this column */
         | 
| @@ -1038,20 +1149,20 @@ void process_categ_col(std::vector<Cluster> &cluster_root, | |
| 1038 1149 |  | 
| 1039 1150 |  | 
| 1040 1151 | 
             
                /* at first, see if there's a category with 1-2 observations among only categories with large counts */
         | 
| 1041 | 
            -
                workspace.col_has_outliers = find_outlier_categories_no_cond( | 
| 1152 | 
            +
                workspace.col_has_outliers = find_outlier_categories_no_cond(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
         | 
| 1042 1153 | 
             
                                                                             workspace.ncat_this, workspace.end - workspace.st + 1,
         | 
| 1043 | 
            -
                                                                              | 
| 1154 | 
            +
                                                                             workspace.buffer_subset_categ.data(), &(workspace.orig_mean));
         | 
| 1044 1155 |  | 
| 1045 1156 | 
             
                /* if there is any such case, create a cluster for them */
         | 
| 1046 1157 | 
             
                if (workspace.col_has_outliers) {
         | 
| 1047 1158 | 
             
                    workspace.tree->back().clusters.push_back(0);
         | 
| 1048 1159 | 
             
                    workspace.clusters->emplace_back(NoType, Root);
         | 
| 1049 | 
            -
                    define_categ_cluster_no_cond(workspace.untransf_target_col,  | 
| 1050 | 
            -
                                                  | 
| 1051 | 
            -
                                                  | 
| 1052 | 
            -
                                                  | 
| 1053 | 
            -
                                                  | 
| 1054 | 
            -
                    workspace.st = move_outliers_to_front( | 
| 1160 | 
            +
                    define_categ_cluster_no_cond(workspace.untransf_target_col, workspace.ix_arr.data(), workspace.st, workspace.end, workspace.ncat_this,
         | 
| 1161 | 
            +
                                                 workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1162 | 
            +
                                                 workspace.outlier_depth.data(), workspace.clusters->back(),
         | 
| 1163 | 
            +
                                                 input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
         | 
| 1164 | 
            +
                                                 workspace.buffer_subset_categ.data(), workspace.orig_mean);
         | 
| 1165 | 
            +
                    workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
         | 
| 1055 1166 | 
             
                }
         | 
| 1056 1167 |  | 
| 1057 1168 | 
             
                /* if no conditional outliers are required, stop there */
         | 
| @@ -1069,6 +1180,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root, | |
| 1069 1180 | 
             
                }
         | 
| 1070 1181 | 
             
                if (should_skip) return;
         | 
| 1071 1182 |  | 
| 1183 | 
            +
                workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
         | 
| 1184 | 
            +
             | 
| 1072 1185 |  | 
| 1073 1186 | 
             
                /* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
         | 
| 1074 1187 | 
             
                if (
         | 
| @@ -1080,7 +1193,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root, | |
| 1080 1193 | 
             
                {
         | 
| 1081 1194 |  | 
| 1082 1195 | 
             
                    /* calculate base information */
         | 
| 1083 | 
            -
                    workspace.base_info = total_info( | 
| 1196 | 
            +
                    workspace.base_info = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
         | 
| 1084 1197 | 
             
                                                     workspace.ncat_this, workspace.end - workspace.st + 1);
         | 
| 1085 1198 | 
             
                    workspace.base_info_orig = workspace.base_info;
         | 
| 1086 1199 |  | 
| @@ -1096,7 +1209,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root, | |
| 1096 1209 | 
             
                    size_t cat_counts_bin[2];
         | 
| 1097 1210 | 
             
                    workspace.col_is_bin = true;
         | 
| 1098 1211 | 
             
                    workspace.already_split_main = false;
         | 
| 1099 | 
            -
                    workspace.base_info_orig = total_info( | 
| 1212 | 
            +
                    workspace.base_info_orig = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
         | 
| 1100 1213 | 
             
                                                          workspace.ncat_this, workspace.end - workspace.st + 1);
         | 
| 1101 1214 | 
             
                    workspace.tree->back().column_type = NoType;
         | 
| 1102 1215 |  | 
| @@ -1130,7 +1243,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root, | |
| 1130 1243 | 
             
                        }
         | 
| 1131 1244 |  | 
| 1132 1245 | 
             
                        if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
         | 
| 1133 | 
            -
                            workspace.target_categ_col =  | 
| 1246 | 
            +
                            workspace.target_categ_col = workspace.buffer_bin_y.data();
         | 
| 1134 1247 | 
             
                            workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
         | 
| 1135 1248 | 
             
                            (*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
         | 
| 1136 1249 | 
             
                            workspace.tree->emplace_back(0, SubTrees);
         | 
| @@ -1150,6 +1263,8 @@ void recursive_split_categ(Workspace &workspace, | |
| 1150 1263 | 
             
                                       ModelParams &model_params,
         | 
| 1151 1264 | 
             
                                       size_t curr_depth, bool is_NA_branch)
         | 
| 1152 1265 | 
             
            {
         | 
| 1266 | 
            +
                if (interrupt_switch) return;
         | 
| 1267 | 
            +
                
         | 
| 1153 1268 | 
             
                /*    idea is the same as its numeric counterpart, only splitting by another categorical
         | 
| 1154 1269 | 
             
                    is less clear how to do and offers different options */
         | 
| 1155 1270 | 
             
                workspace.best_gain = -HUGE_VAL;
         | 
| @@ -1158,14 +1273,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1158 1273 | 
             
                size_t ix1, ix2, ix3;
         | 
| 1159 1274 | 
             
                SplitType spl1, spl2;
         | 
| 1160 1275 | 
             
                size_t tree_from = workspace.tree->size() - 1;
         | 
| 1276 | 
            +
                ExhaustedColumnsLevel level_col_tracker;
         | 
| 1161 1277 |  | 
| 1162 1278 | 
             
                /* when using 'follow_all' need to keep track of a lot more things */
         | 
| 1163 1279 | 
             
                std::unique_ptr<RecursionState> state_backup;
         | 
| 1164 1280 | 
             
                if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
         | 
| 1165 1281 |  | 
| 1166 1282 | 
             
                if (curr_depth > 0) {
         | 
| 1167 | 
            -
                    workspace.base_info_orig = total_info( | 
| 1168 | 
            -
                                                          workspace.ncat_this,  | 
| 1283 | 
            +
                    workspace.base_info_orig = total_info(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st, workspace.end,
         | 
| 1284 | 
            +
                                                          workspace.ncat_this, workspace.buffer_cat_cnt.data());
         | 
| 1169 1285 |  | 
| 1170 1286 | 
             
                    /* check that there's still more than 1 category */
         | 
| 1171 1287 | 
             
                    size_t ncat_present = 0;
         | 
| @@ -1175,14 +1291,16 @@ void recursive_split_categ(Workspace &workspace, | |
| 1175 1291 | 
             
                    }
         | 
| 1176 1292 | 
             
                    if (ncat_present < 2) goto drop_if_not_needed;
         | 
| 1177 1293 | 
             
                    if (workspace.col_is_bin && workspace.ncat_this > 2) {
         | 
| 1178 | 
            -
                        workspace.base_info = total_info( | 
| 1179 | 
            -
                                                         2,  | 
| 1294 | 
            +
                        workspace.base_info = total_info(workspace.ix_arr.data(), workspace.target_categ_col, workspace.st, workspace.end,
         | 
| 1295 | 
            +
                                                         2, workspace.buffer_cat_cnt.data());
         | 
| 1180 1296 | 
             
                        if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
         | 
| 1181 1297 | 
             
                    } else {
         | 
| 1182 1298 | 
             
                        workspace.base_info = workspace.base_info_orig;
         | 
| 1183 1299 | 
             
                    }
         | 
| 1184 1300 | 
             
                }
         | 
| 1185 1301 |  | 
| 1302 | 
            +
                level_col_tracker.initialize(&workspace.exhausted_col_tracker);
         | 
| 1303 | 
            +
             | 
| 1186 1304 | 
             
                /* split with each other column */
         | 
| 1187 1305 |  | 
| 1188 1306 |  | 
| @@ -1191,11 +1309,17 @@ void recursive_split_categ(Workspace &workspace, | |
| 1191 1309 |  | 
| 1192 1310 | 
             
                    if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
         | 
| 1193 1311 | 
             
                    if (input_data.skip_col[col]) continue;
         | 
| 1194 | 
            -
                     | 
| 1312 | 
            +
                    if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
         | 
| 1313 | 
            +
                    split_numericx_categy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
         | 
| 1195 1314 | 
             
                                          workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
         | 
| 1196 | 
            -
                                           | 
| 1197 | 
            -
                                          model_params.take_mid, & | 
| 1198 | 
            -
                                          & | 
| 1315 | 
            +
                                          workspace.buffer_cat_cnt.data(), (bool)(input_data.has_NA[col]), model_params.min_size_categ,
         | 
| 1316 | 
            +
                                          model_params.take_mid, &workspace.this_gain, &workspace.this_split_point,
         | 
| 1317 | 
            +
                                          &workspace.this_split_ix, &workspace.this_split_NA, &workspace.has_zero_variance);
         | 
| 1318 | 
            +
                    if (workspace.has_zero_variance) {
         | 
| 1319 | 
            +
                        workspace.has_zero_variance = false;
         | 
| 1320 | 
            +
                        workspace.exhausted_col_tracker.push_col(col);
         | 
| 1321 | 
            +
                        continue;
         | 
| 1322 | 
            +
                    }
         | 
| 1199 1323 | 
             
                    if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
         | 
| 1200 1324 |  | 
| 1201 1325 | 
             
                    if (workspace.this_gain >= model_params.min_gain) {
         | 
| @@ -1207,15 +1331,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1207 1331 | 
             
                            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1208 1332 | 
             
                            workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
         | 
| 1209 1333 | 
             
                            workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1210 | 
            -
                                                                           | 
| 1334 | 
            +
                                                                          workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
         | 
| 1211 1335 | 
             
                                                                          workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1212 | 
            -
                                                                           | 
| 1213 | 
            -
                                                                           | 
| 1336 | 
            +
                                                                          workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1337 | 
            +
                                                                          workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1214 1338 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1215 1339 | 
             
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1216 1340 | 
             
                                                                          workspace.prop_small_this, workspace.prior_prob,
         | 
| 1217 | 
            -
                                                                           | 
| 1218 | 
            -
                                                                           | 
| 1341 | 
            +
                                                                          workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1342 | 
            +
                                                                          workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1219 1343 | 
             
                            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1220 1344 | 
             
                            if (workspace.drop_cluster) {
         | 
| 1221 1345 | 
             
                                workspace.clusters->pop_back();
         | 
| @@ -1227,6 +1351,9 @@ void recursive_split_categ(Workspace &workspace, | |
| 1227 1351 | 
             
                                workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
         | 
| 1228 1352 | 
             
                                backup_recursion_state(workspace, *state_backup);
         | 
| 1229 1353 | 
             
                                workspace.end = workspace.this_split_NA - 1;
         | 
| 1354 | 
            +
                                ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1355 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1356 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 1230 1357 | 
             
                                recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
         | 
| 1231 1358 | 
             
                                restore_recursion_state(workspace, *state_backup);
         | 
| 1232 1359 | 
             
                            }
         | 
| @@ -1237,15 +1364,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1237 1364 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1238 1365 | 
             
                        workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
         | 
| 1239 1366 | 
             
                        workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1240 | 
            -
                                                                       | 
| 1367 | 
            +
                                                                      workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix,
         | 
| 1241 1368 | 
             
                                                                      workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1242 | 
            -
                                                                       | 
| 1243 | 
            -
                                                                       | 
| 1369 | 
            +
                                                                      workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1370 | 
            +
                                                                      workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1244 1371 | 
             
                                                                      workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1245 1372 | 
             
                                                                      model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1246 1373 | 
             
                                                                      workspace.prop_small_this, workspace.prior_prob,
         | 
| 1247 | 
            -
                                                                       | 
| 1248 | 
            -
                                                                       | 
| 1374 | 
            +
                                                                      workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1375 | 
            +
                                                                      workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1249 1376 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1250 1377 | 
             
                        if (workspace.drop_cluster) {
         | 
| 1251 1378 | 
             
                            workspace.clusters->pop_back();
         | 
| @@ -1267,15 +1394,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1267 1394 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1268 1395 | 
             
                        workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
         | 
| 1269 1396 | 
             
                        workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1270 | 
            -
                                                                       | 
| 1397 | 
            +
                                                                      workspace.ix_arr.data(), workspace.this_split_ix + 1, workspace.end,
         | 
| 1271 1398 | 
             
                                                                      workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1272 | 
            -
                                                                       | 
| 1273 | 
            -
                                                                       | 
| 1399 | 
            +
                                                                      workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1400 | 
            +
                                                                      workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1274 1401 | 
             
                                                                      workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1275 1402 | 
             
                                                                      model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1276 1403 | 
             
                                                                      workspace.prop_small_this, workspace.prior_prob,
         | 
| 1277 | 
            -
                                                                       | 
| 1278 | 
            -
                                                                       | 
| 1404 | 
            +
                                                                      workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1405 | 
            +
                                                                      workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1279 1406 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1280 1407 | 
             
                        if (workspace.drop_cluster) {
         | 
| 1281 1408 | 
             
                            workspace.clusters->pop_back();
         | 
| @@ -1313,15 +1440,23 @@ void recursive_split_categ(Workspace &workspace, | |
| 1313 1440 |  | 
| 1314 1441 | 
             
                    if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
         | 
| 1315 1442 | 
             
                    if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
         | 
| 1443 | 
            +
                    if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
         | 
| 1444 | 
            +
                    workspace.is_binary_split = false;
         | 
| 1316 1445 |  | 
| 1317 1446 | 
             
                    if (workspace.col_is_bin) {
         | 
| 1318 1447 |  | 
| 1319 | 
            -
                        split_categx_biny( | 
| 1448 | 
            +
                        split_categx_biny(workspace.ix_arr.data(), workspace.st, workspace.end,
         | 
| 1320 1449 | 
             
                                          input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
         | 
| 1321 | 
            -
                                          input_data.ncat[col], workspace.base_info,  | 
| 1322 | 
            -
                                           | 
| 1450 | 
            +
                                          input_data.ncat[col], workspace.base_info, workspace.buffer_cat_cnt.data(),
         | 
| 1451 | 
            +
                                          workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
         | 
| 1323 1452 | 
             
                                          (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
         | 
| 1324 | 
            -
                                          &(workspace.this_gain),  | 
| 1453 | 
            +
                                          &(workspace.this_gain), workspace.buffer_subset_categ.data(),
         | 
| 1454 | 
            +
                                          &workspace.has_zero_variance, &workspace.is_binary_split);
         | 
| 1455 | 
            +
                        if (workspace.has_zero_variance) {
         | 
| 1456 | 
            +
                            workspace.has_zero_variance = false;
         | 
| 1457 | 
            +
                            workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 1458 | 
            +
                            continue;
         | 
| 1459 | 
            +
                        }
         | 
| 1325 1460 |  | 
| 1326 1461 | 
             
                        /* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
         | 
| 1327 1462 | 
             
                        if (
         | 
| @@ -1332,29 +1467,35 @@ void recursive_split_categ(Workspace &workspace, | |
| 1332 1467 | 
             
                                )
         | 
| 1333 1468 | 
             
                            )
         | 
| 1334 1469 | 
             
                        {
         | 
| 1335 | 
            -
                            divide_subset_split( | 
| 1336 | 
            -
                                                workspace.st, workspace.end,  | 
| 1470 | 
            +
                            divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows,
         | 
| 1471 | 
            +
                                                workspace.st, workspace.end, workspace.buffer_subset_categ.data(), input_data.ncat[col],
         | 
| 1337 1472 | 
             
                                                (bool)input_data.has_NA[col + input_data.ncols_numeric],
         | 
| 1338 1473 | 
             
                                                &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| 1339 | 
            -
                            workspace.this_gain = categ_gain_from_split( | 
| 1474 | 
            +
                            workspace.this_gain = categ_gain_from_split(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st,
         | 
| 1340 1475 | 
             
                                                                        workspace.this_split_NA, workspace.this_split_ix, workspace.end,
         | 
| 1341 | 
            -
                                                                        workspace.ncat_this,  | 
| 1476 | 
            +
                                                                        workspace.ncat_this, workspace.buffer_cat_cnt.data(), workspace.base_info_orig);
         | 
| 1342 1477 | 
             
                        }
         | 
| 1343 1478 |  | 
| 1344 1479 | 
             
                    } else {
         | 
| 1345 1480 |  | 
| 1346 1481 | 
             
                        if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
         | 
| 1347 | 
            -
                            split_categx_categy_subset( | 
| 1482 | 
            +
                            split_categx_categy_subset(workspace.ix_arr.data(), workspace.st, workspace.end,
         | 
| 1348 1483 | 
             
                                                       input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
         | 
| 1349 1484 | 
             
                                                       input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
         | 
| 1350 | 
            -
                                                        | 
| 1485 | 
            +
                                                       workspace.buffer_cat_sorted.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_cnt.data(),
         | 
| 1351 1486 | 
             
                                                       (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
         | 
| 1352 | 
            -
                                                       &(workspace.this_gain),  | 
| 1487 | 
            +
                                                       &(workspace.this_gain), workspace.buffer_subset_categ.data(),
         | 
| 1488 | 
            +
                                                       &workspace.has_zero_variance, &workspace.is_binary_split);
         | 
| 1489 | 
            +
                            if (workspace.has_zero_variance) {
         | 
| 1490 | 
            +
                                workspace.has_zero_variance = false;
         | 
| 1491 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 1492 | 
            +
                                continue;
         | 
| 1493 | 
            +
                            }
         | 
| 1353 1494 | 
             
                        } else {
         | 
| 1354 | 
            -
                            split_categx_categy_separate( | 
| 1495 | 
            +
                            split_categx_categy_separate(workspace.ix_arr.data(), workspace.st, workspace.end,
         | 
| 1355 1496 | 
             
                                                         input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
         | 
| 1356 1497 | 
             
                                                         input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
         | 
| 1357 | 
            -
                                                          | 
| 1498 | 
            +
                                                         workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(),
         | 
| 1358 1499 | 
             
                                                         (bool)(input_data.has_NA[col + input_data.ncols_numeric]),
         | 
| 1359 1500 | 
             
                                                         model_params.min_size_categ, &(workspace.this_gain));
         | 
| 1360 1501 | 
             
                        }
         | 
| @@ -1365,21 +1506,21 @@ void recursive_split_categ(Workspace &workspace, | |
| 1365 1506 | 
             
                    if (workspace.this_gain >= model_params.min_gain) {
         | 
| 1366 1507 |  | 
| 1367 1508 | 
             
                        /* NA branch */
         | 
| 1368 | 
            -
                        workspace.this_split_NA = move_NAs_to_front( | 
| 1509 | 
            +
                        workspace.this_split_NA = move_NAs_to_front(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
         | 
| 1369 1510 | 
             
                        if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
         | 
| 1370 1511 |  | 
| 1371 1512 | 
             
                            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1372 | 
            -
                            workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
         | 
| 1513 | 
            +
                            workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
         | 
| 1373 1514 | 
             
                            workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1374 | 
            -
                                                                           | 
| 1515 | 
            +
                                                                          workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
         | 
| 1375 1516 | 
             
                                                                          workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1376 | 
            -
                                                                           | 
| 1377 | 
            -
                                                                           | 
| 1517 | 
            +
                                                                          workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1518 | 
            +
                                                                          workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1378 1519 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1379 1520 | 
             
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1380 1521 | 
             
                                                                          workspace.prop_small_this, workspace.prior_prob,
         | 
| 1381 | 
            -
                                                                           | 
| 1382 | 
            -
                                                                           | 
| 1522 | 
            +
                                                                          workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1523 | 
            +
                                                                          workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1383 1524 | 
             
                            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1384 1525 | 
             
                            if (workspace.drop_cluster) {
         | 
| 1385 1526 | 
             
                                workspace.clusters->pop_back();
         | 
| @@ -1388,9 +1529,12 @@ void recursive_split_categ(Workspace &workspace, | |
| 1388 1529 |  | 
| 1389 1530 | 
             
                            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| 1390 1531 | 
             
                                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
         | 
| 1391 | 
            -
                                workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
         | 
| 1532 | 
            +
                                workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
         | 
| 1392 1533 | 
             
                                backup_recursion_state(workspace, *state_backup);
         | 
| 1393 1534 | 
             
                                workspace.end = workspace.this_split_NA - 1;
         | 
| 1535 | 
            +
                                ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1536 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1537 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 1394 1538 | 
             
                                recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
         | 
| 1395 1539 | 
             
                                restore_recursion_state(workspace, *state_backup);
         | 
| 1396 1540 | 
             
                            }
         | 
| @@ -1401,7 +1545,7 @@ void recursive_split_categ(Workspace &workspace, | |
| 1401 1545 |  | 
| 1402 1546 | 
             
                            /* sort by the splitting variable and iterate over to determine the split points */
         | 
| 1403 1547 | 
             
                            workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
         | 
| 1404 | 
            -
                            std::sort( | 
| 1548 | 
            +
                            std::sort(workspace.ix_arr.data() + workspace.this_split_NA, workspace.ix_arr.data() + workspace.end + 1,
         | 
| 1405 1549 | 
             
                                      [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
         | 
| 1406 1550 | 
             
                            workspace.this_split_ix = workspace.this_split_NA;
         | 
| 1407 1551 |  | 
| @@ -1416,15 +1560,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1416 1560 | 
             
                                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1417 1561 | 
             
                                        workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
         | 
| 1418 1562 | 
             
                                        workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1419 | 
            -
                                                                                       | 
| 1563 | 
            +
                                                                                      workspace.ix_arr.data(), workspace.this_split_ix, row - 1,
         | 
| 1420 1564 | 
             
                                                                                      workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1421 | 
            -
                                                                                       | 
| 1422 | 
            -
                                                                                       | 
| 1565 | 
            +
                                                                                      workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1566 | 
            +
                                                                                      workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1423 1567 | 
             
                                                                                      workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1424 1568 | 
             
                                                                                      model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1425 1569 | 
             
                                                                                      workspace.prop_small_this, workspace.prior_prob,
         | 
| 1426 | 
            -
                                                                                       | 
| 1427 | 
            -
                                                                                       | 
| 1570 | 
            +
                                                                                      workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1571 | 
            +
                                                                                      workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1428 1572 | 
             
                                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1429 1573 | 
             
                                        if (workspace.drop_cluster) {
         | 
| 1430 1574 | 
             
                                            workspace.clusters->pop_back();
         | 
| @@ -1436,6 +1580,9 @@ void recursive_split_categ(Workspace &workspace, | |
| 1436 1580 | 
             
                                            backup_recursion_state(workspace, *state_backup);
         | 
| 1437 1581 | 
             
                                            workspace.st = workspace.this_split_ix;
         | 
| 1438 1582 | 
             
                                            workspace.end = row - 1;
         | 
| 1583 | 
            +
                                            ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1584 | 
            +
                                            level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1585 | 
            +
                                            workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 1439 1586 | 
             
                                            recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 1440 1587 | 
             
                                            restore_recursion_state(workspace, *state_backup);
         | 
| 1441 1588 | 
             
                                        }
         | 
| @@ -1448,15 +1595,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1448 1595 | 
             
                                (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1449 1596 | 
             
                                workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
         | 
| 1450 1597 | 
             
                                workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1451 | 
            -
                                                                               | 
| 1598 | 
            +
                                                                              workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
         | 
| 1452 1599 | 
             
                                                                              workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1453 | 
            -
                                                                               | 
| 1454 | 
            -
                                                                               | 
| 1600 | 
            +
                                                                              workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1601 | 
            +
                                                                              workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1455 1602 | 
             
                                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1456 1603 | 
             
                                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1457 1604 | 
             
                                                                              workspace.prop_small_this, workspace.prior_prob,
         | 
| 1458 | 
            -
                                                                               | 
| 1459 | 
            -
                                                                               | 
| 1605 | 
            +
                                                                              workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1606 | 
            +
                                                                              workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1460 1607 | 
             
                                workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1461 1608 | 
             
                                if (workspace.drop_cluster) {
         | 
| 1462 1609 | 
             
                                    workspace.clusters->pop_back();
         | 
| @@ -1467,6 +1614,9 @@ void recursive_split_categ(Workspace &workspace, | |
| 1467 1614 | 
             
                                    workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
         | 
| 1468 1615 | 
             
                                    backup_recursion_state(workspace, *state_backup);
         | 
| 1469 1616 | 
             
                                    workspace.st = workspace.this_split_ix;
         | 
| 1617 | 
            +
                                    ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1618 | 
            +
                                    level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1619 | 
            +
                                    workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 1470 1620 | 
             
                                    recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 1471 1621 | 
             
                                    restore_recursion_state(workspace, *state_backup);
         | 
| 1472 1622 | 
             
                                }
         | 
| @@ -1477,6 +1627,7 @@ void recursive_split_categ(Workspace &workspace, | |
| 1477 1627 | 
             
                                workspace.best_gain = workspace.this_gain;
         | 
| 1478 1628 | 
             
                                workspace.column_type_best = Categorical;
         | 
| 1479 1629 | 
             
                                workspace.col_best = col;
         | 
| 1630 | 
            +
                                workspace.best_cat_split_is_binary = false;
         | 
| 1480 1631 | 
             
                            }
         | 
| 1481 1632 |  | 
| 1482 1633 |  | 
| @@ -1488,7 +1639,7 @@ void recursive_split_categ(Workspace &workspace, | |
| 1488 1639 |  | 
| 1489 1640 | 
             
                                workspace.buffer_subset_categ[0] = 1;
         | 
| 1490 1641 | 
             
                                workspace.buffer_subset_categ[1] = 0;
         | 
| 1491 | 
            -
                                divide_subset_split( | 
| 1642 | 
            +
                                divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
         | 
| 1492 1643 | 
             
                                                    (int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
         | 
| 1493 1644 | 
             
                                if (
         | 
| 1494 1645 | 
             
                                    (workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
         | 
| @@ -1497,24 +1648,24 @@ void recursive_split_categ(Workspace &workspace, | |
| 1497 1648 |  | 
| 1498 1649 | 
             
                            } else {
         | 
| 1499 1650 |  | 
| 1500 | 
            -
                                divide_subset_split( | 
| 1501 | 
            -
                                                     | 
| 1651 | 
            +
                                divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
         | 
| 1652 | 
            +
                                                    workspace.buffer_subset_categ.data(), input_data.ncat[col], false,
         | 
| 1502 1653 | 
             
                                                    &(workspace.this_split_NA), &(workspace.this_split_ix));
         | 
| 1503 1654 | 
             
                            }
         | 
| 1504 1655 |  | 
| 1505 1656 | 
             
                            /* left branch */
         | 
| 1506 1657 | 
             
                            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1507 | 
            -
                            workspace.clusters->emplace_back(Categorical, col, InSubset,  | 
| 1658 | 
            +
                            workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
         | 
| 1508 1659 | 
             
                            workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1509 | 
            -
                                                                           | 
| 1660 | 
            +
                                                                          workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
         | 
| 1510 1661 | 
             
                                                                          workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1511 | 
            -
                                                                           | 
| 1512 | 
            -
                                                                           | 
| 1662 | 
            +
                                                                          workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1663 | 
            +
                                                                          workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1513 1664 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1514 1665 | 
             
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1515 1666 | 
             
                                                                          workspace.prop_small_this, workspace.prior_prob,
         | 
| 1516 | 
            -
                                                                           | 
| 1517 | 
            -
                                                                           | 
| 1667 | 
            +
                                                                          workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1668 | 
            +
                                                                          workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1518 1669 | 
             
                            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1519 1670 | 
             
                            if (workspace.drop_cluster) {
         | 
| 1520 1671 | 
             
                                workspace.clusters->pop_back();
         | 
| @@ -1523,27 +1674,32 @@ void recursive_split_categ(Workspace &workspace, | |
| 1523 1674 |  | 
| 1524 1675 | 
             
                            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| 1525 1676 | 
             
                                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
         | 
| 1526 | 
            -
                                workspace.tree->emplace_back(tree_from, col, InSubset,  | 
| 1677 | 
            +
                                workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
         | 
| 1527 1678 | 
             
                                backup_recursion_state(workspace, *state_backup);
         | 
| 1528 1679 | 
             
                                workspace.st = workspace.this_split_NA;
         | 
| 1529 1680 | 
             
                                workspace.end = workspace.this_split_ix - 1;
         | 
| 1681 | 
            +
                                ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1682 | 
            +
                                if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
         | 
| 1683 | 
            +
                                    level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1684 | 
            +
                                    workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 1685 | 
            +
                                }
         | 
| 1530 1686 | 
             
                                recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 1531 1687 | 
             
                                restore_recursion_state(workspace, *state_backup);
         | 
| 1532 1688 | 
             
                            }
         | 
| 1533 1689 |  | 
| 1534 1690 | 
             
                            /* right branch */
         | 
| 1535 1691 | 
             
                            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1536 | 
            -
                            workspace.clusters->emplace_back(Categorical, col, NotInSubset,  | 
| 1692 | 
            +
                            workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
         | 
| 1537 1693 | 
             
                            workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1538 | 
            -
                                                                           | 
| 1694 | 
            +
                                                                          workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
         | 
| 1539 1695 | 
             
                                                                          workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1540 | 
            -
                                                                           | 
| 1541 | 
            -
                                                                           | 
| 1696 | 
            +
                                                                          workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1697 | 
            +
                                                                          workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1542 1698 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1543 1699 | 
             
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1544 1700 | 
             
                                                                          workspace.prop_small_this, workspace.prior_prob,
         | 
| 1545 | 
            -
                                                                           | 
| 1546 | 
            -
                                                                           | 
| 1701 | 
            +
                                                                          workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1702 | 
            +
                                                                          workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1547 1703 | 
             
                            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1548 1704 | 
             
                            if (workspace.drop_cluster) {
         | 
| 1549 1705 | 
             
                                workspace.clusters->pop_back();
         | 
| @@ -1552,9 +1708,14 @@ void recursive_split_categ(Workspace &workspace, | |
| 1552 1708 |  | 
| 1553 1709 | 
             
                            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| 1554 1710 | 
             
                                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
         | 
| 1555 | 
            -
                                workspace.tree->emplace_back(tree_from, col, NotInSubset,  | 
| 1711 | 
            +
                                workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
         | 
| 1556 1712 | 
             
                                backup_recursion_state(workspace, *state_backup);
         | 
| 1557 1713 | 
             
                                workspace.st = workspace.this_split_ix;
         | 
| 1714 | 
            +
                                ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1715 | 
            +
                                if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
         | 
| 1716 | 
            +
                                    level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1717 | 
            +
                                    workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
         | 
| 1718 | 
            +
                                }
         | 
| 1558 1719 | 
             
                                recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 1559 1720 | 
             
                                restore_recursion_state(workspace, *state_backup);
         | 
| 1560 1721 | 
             
                            }
         | 
| @@ -1563,7 +1724,8 @@ void recursive_split_categ(Workspace &workspace, | |
| 1563 1724 | 
             
                                workspace.best_gain = workspace.this_gain;
         | 
| 1564 1725 | 
             
                                workspace.column_type_best = Categorical;
         | 
| 1565 1726 | 
             
                                workspace.col_best = col;
         | 
| 1566 | 
            -
                                 | 
| 1727 | 
            +
                                workspace.best_cat_split_is_binary = workspace.is_binary_split;
         | 
| 1728 | 
            +
                                memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
         | 
| 1567 1729 | 
             
                            }
         | 
| 1568 1730 |  | 
| 1569 1731 | 
             
                        }
         | 
| @@ -1579,18 +1741,25 @@ void recursive_split_categ(Workspace &workspace, | |
| 1579 1741 | 
             
                    if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
         | 
| 1580 1742 | 
             
                    if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
         | 
| 1581 1743 | 
             
                    if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
         | 
| 1744 | 
            +
                    if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
         | 
| 1582 1745 |  | 
| 1583 | 
            -
                    split_ordx_categy( | 
| 1746 | 
            +
                    split_ordx_categy(workspace.ix_arr.data(), workspace.st, workspace.end,
         | 
| 1584 1747 | 
             
                                      input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
         | 
| 1585 1748 | 
             
                                      input_data.ncat_ord[col], workspace.ncat_this,
         | 
| 1586 | 
            -
                                      workspace.base_info_orig,  | 
| 1749 | 
            +
                                      workspace.base_info_orig, workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
         | 
| 1587 1750 | 
             
                                      (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
         | 
| 1588 | 
            -
                                      model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev) | 
| 1751 | 
            +
                                      model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev),
         | 
| 1752 | 
            +
                                      &workspace.has_zero_variance, &workspace.is_binary_split);
         | 
| 1753 | 
            +
                    if (workspace.has_zero_variance) {
         | 
| 1754 | 
            +
                        workspace.has_zero_variance = false;
         | 
| 1755 | 
            +
                        workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
         | 
| 1756 | 
            +
                        continue;
         | 
| 1757 | 
            +
                    }
         | 
| 1589 1758 | 
             
                    if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
         | 
| 1590 1759 |  | 
| 1591 1760 | 
             
                    if (workspace.this_gain >= model_params.min_gain) {
         | 
| 1592 1761 |  | 
| 1593 | 
            -
                        divide_subset_split( | 
| 1762 | 
            +
                        divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
         | 
| 1594 1763 | 
             
                                            workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
         | 
| 1595 1764 | 
             
                                            &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| 1596 1765 |  | 
| @@ -1600,15 +1769,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1600 1769 | 
             
                            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1601 1770 | 
             
                            workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
         | 
| 1602 1771 | 
             
                            workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1603 | 
            -
                                                                           | 
| 1772 | 
            +
                                                                          workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
         | 
| 1604 1773 | 
             
                                                                          workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1605 | 
            -
                                                                           | 
| 1606 | 
            -
                                                                           | 
| 1774 | 
            +
                                                                          workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1775 | 
            +
                                                                          workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1607 1776 | 
             
                                                                          workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1608 1777 | 
             
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1609 1778 | 
             
                                                                          workspace.prop_small_this, workspace.prior_prob,
         | 
| 1610 | 
            -
                                                                           | 
| 1611 | 
            -
                                                                           | 
| 1779 | 
            +
                                                                          workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1780 | 
            +
                                                                          workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1612 1781 | 
             
                            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1613 1782 | 
             
                            if (workspace.drop_cluster) {
         | 
| 1614 1783 | 
             
                                workspace.clusters->pop_back();
         | 
| @@ -1620,6 +1789,9 @@ void recursive_split_categ(Workspace &workspace, | |
| 1620 1789 | 
             
                                workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
         | 
| 1621 1790 | 
             
                                backup_recursion_state(workspace, *state_backup);
         | 
| 1622 1791 | 
             
                                workspace.end = workspace.this_split_NA - 1;
         | 
| 1792 | 
            +
                                ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1793 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1794 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
         | 
| 1623 1795 | 
             
                                recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
         | 
| 1624 1796 | 
             
                                restore_recursion_state(workspace, *state_backup);
         | 
| 1625 1797 | 
             
                            }
         | 
| @@ -1630,15 +1802,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1630 1802 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1631 1803 | 
             
                        workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
         | 
| 1632 1804 | 
             
                        workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1633 | 
            -
                                                                       | 
| 1805 | 
            +
                                                                      workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
         | 
| 1634 1806 | 
             
                                                                      workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1635 | 
            -
                                                                       | 
| 1636 | 
            -
                                                                       | 
| 1807 | 
            +
                                                                      workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1808 | 
            +
                                                                      workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1637 1809 | 
             
                                                                      workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1638 1810 | 
             
                                                                      model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1639 1811 | 
             
                                                                      workspace.prop_small_this, workspace.prior_prob,
         | 
| 1640 | 
            -
                                                                       | 
| 1641 | 
            -
                                                                       | 
| 1812 | 
            +
                                                                      workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1813 | 
            +
                                                                      workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1642 1814 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1643 1815 | 
             
                        if (workspace.drop_cluster) {
         | 
| 1644 1816 | 
             
                            workspace.clusters->pop_back();
         | 
| @@ -1651,6 +1823,11 @@ void recursive_split_categ(Workspace &workspace, | |
| 1651 1823 | 
             
                            backup_recursion_state(workspace, *state_backup);
         | 
| 1652 1824 | 
             
                            workspace.st = workspace.this_split_NA;
         | 
| 1653 1825 | 
             
                            workspace.end = workspace.this_split_ix - 1;
         | 
| 1826 | 
            +
                            ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1827 | 
            +
                            if (workspace.is_binary_split) {
         | 
| 1828 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1829 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
         | 
| 1830 | 
            +
                            }
         | 
| 1654 1831 | 
             
                            recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 1655 1832 | 
             
                            restore_recursion_state(workspace, *state_backup);
         | 
| 1656 1833 | 
             
                        }
         | 
| @@ -1659,15 +1836,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1659 1836 | 
             
                        (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
         | 
| 1660 1837 | 
             
                        workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
         | 
| 1661 1838 | 
             
                        workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
         | 
| 1662 | 
            -
                                                                       | 
| 1839 | 
            +
                                                                      workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
         | 
| 1663 1840 | 
             
                                                                      workspace.ncat_this, model_params.categ_from_maj,
         | 
| 1664 | 
            -
                                                                       | 
| 1665 | 
            -
                                                                       | 
| 1841 | 
            +
                                                                      workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
         | 
| 1842 | 
            +
                                                                      workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
         | 
| 1666 1843 | 
             
                                                                      workspace.clusters->size() - 1, tree_from, curr_depth + 1,
         | 
| 1667 1844 | 
             
                                                                      model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 1668 1845 | 
             
                                                                      workspace.prop_small_this, workspace.prior_prob,
         | 
| 1669 | 
            -
                                                                       | 
| 1670 | 
            -
                                                                       | 
| 1846 | 
            +
                                                                      workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
         | 
| 1847 | 
            +
                                                                      workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
         | 
| 1671 1848 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 1672 1849 | 
             
                        if (workspace.drop_cluster) {
         | 
| 1673 1850 | 
             
                            workspace.clusters->pop_back();
         | 
| @@ -1679,6 +1856,11 @@ void recursive_split_categ(Workspace &workspace, | |
| 1679 1856 | 
             
                            workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
         | 
| 1680 1857 | 
             
                            backup_recursion_state(workspace, *state_backup);
         | 
| 1681 1858 | 
             
                            workspace.st = workspace.this_split_ix;
         | 
| 1859 | 
            +
                            ExhaustedColumnsLevel level_col_tracker2;
         | 
| 1860 | 
            +
                            if (workspace.is_binary_split) {
         | 
| 1861 | 
            +
                                level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
         | 
| 1862 | 
            +
                                workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
         | 
| 1863 | 
            +
                            }
         | 
| 1682 1864 | 
             
                            recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
         | 
| 1683 1865 | 
             
                            restore_recursion_state(workspace, *state_backup);
         | 
| 1684 1866 | 
             
                        }
         | 
| @@ -1689,6 +1871,7 @@ void recursive_split_categ(Workspace &workspace, | |
| 1689 1871 | 
             
                            workspace.column_type_best = Ordinal;
         | 
| 1690 1872 | 
             
                            workspace.col_best = col;
         | 
| 1691 1873 | 
             
                            workspace.split_lev_best = workspace.this_split_lev;
         | 
| 1874 | 
            +
                            workspace.best_cat_split_is_binary = workspace.is_binary_split;
         | 
| 1692 1875 | 
             
                        }
         | 
| 1693 1876 |  | 
| 1694 1877 | 
             
                    }
         | 
| @@ -1699,7 +1882,7 @@ void recursive_split_categ(Workspace &workspace, | |
| 1699 1882 | 
             
                /* avoid unnecessary memory usage or repeats */
         | 
| 1700 1883 | 
             
                workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
         | 
| 1701 1884 | 
             
                (*workspace.tree)[tree_from].clusters.shrink_to_fit();
         | 
| 1702 | 
            -
                if ((*workspace.tree)[tree_from].all_branches. | 
| 1885 | 
            +
                if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
         | 
| 1703 1886 | 
             
                if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
         | 
| 1704 1887 | 
             
                    workspace.already_split_main = true;
         | 
| 1705 1888 |  | 
| @@ -1712,13 +1895,13 @@ void recursive_split_categ(Workspace &workspace, | |
| 1712 1895 |  | 
| 1713 1896 | 
             
                    /* discard outliers if any */
         | 
| 1714 1897 | 
             
                    if (workspace.lev_has_outliers)
         | 
| 1715 | 
            -
                        workspace.st = move_outliers_to_front( | 
| 1898 | 
            +
                        workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
         | 
| 1716 1899 |  | 
| 1717 1900 | 
             
                    /* assign rows to their corresponding branch */
         | 
| 1718 1901 | 
             
                    switch(workspace.column_type_best) {
         | 
| 1719 1902 | 
             
                        case Numeric:
         | 
| 1720 1903 | 
             
                        {
         | 
| 1721 | 
            -
                            divide_subset_split( | 
| 1904 | 
            +
                            divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
         | 
| 1722 1905 | 
             
                                                workspace.st, workspace.end, workspace.split_point_best,
         | 
| 1723 1906 | 
             
                                                (bool)(input_data.has_NA[workspace.col_best]),
         | 
| 1724 1907 | 
             
                                                &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| @@ -1729,12 +1912,14 @@ void recursive_split_categ(Workspace &workspace, | |
| 1729 1912 |  | 
| 1730 1913 | 
             
                        case Ordinal:
         | 
| 1731 1914 | 
             
                        {
         | 
| 1732 | 
            -
                            divide_subset_split( | 
| 1915 | 
            +
                            divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
         | 
| 1733 1916 | 
             
                                                workspace.st, workspace.end, workspace.split_lev_best,
         | 
| 1734 1917 | 
             
                                                (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
         | 
| 1735 1918 | 
             
                                                &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| 1736 1919 | 
             
                            spl1 = LessOrEqual; spl2 = Greater;
         | 
| 1737 1920 | 
             
                            set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
         | 
| 1921 | 
            +
                            if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
         | 
| 1922 | 
            +
                                workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
         | 
| 1738 1923 | 
             
                            break;
         | 
| 1739 1924 | 
             
                        }
         | 
| 1740 1925 |  | 
| @@ -1743,27 +1928,30 @@ void recursive_split_categ(Workspace &workspace, | |
| 1743 1928 |  | 
| 1744 1929 | 
             
                            if (input_data.ncat[workspace.col_best] == 2) {
         | 
| 1745 1930 |  | 
| 1746 | 
            -
                                divide_subset_split( | 
| 1931 | 
            +
                                divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
         | 
| 1747 1932 | 
             
                                                    workspace.st, workspace.end, (int)0,
         | 
| 1748 1933 | 
             
                                                    (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
         | 
| 1749 1934 | 
             
                                                    &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| 1750 1935 | 
             
                                spl1 = InSubset; spl2 = NotInSubset;
         | 
| 1751 1936 | 
             
                                set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
         | 
| 1937 | 
            +
                                workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
         | 
| 1752 1938 |  | 
| 1753 1939 | 
             
                            } else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
         | 
| 1754 1940 |  | 
| 1755 | 
            -
                                divide_subset_split( | 
| 1756 | 
            -
                                                    workspace.st, workspace.end,  | 
| 1941 | 
            +
                                divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
         | 
| 1942 | 
            +
                                                    workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
         | 
| 1757 1943 | 
             
                                                    (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
         | 
| 1758 1944 | 
             
                                                    &(workspace.this_split_NA), &(workspace.this_split_ix) );
         | 
| 1759 1945 | 
             
                                spl1 = InSubset; spl2 = NotInSubset;
         | 
| 1760 1946 | 
             
                                set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
         | 
| 1761 | 
            -
                                                         | 
| 1947 | 
            +
                                                        workspace.buffer_subset_categ_best.data(), workspace.col_best);
         | 
| 1948 | 
            +
                                if (workspace.best_cat_split_is_binary)
         | 
| 1949 | 
            +
                                    workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
         | 
| 1762 1950 |  | 
| 1763 1951 | 
             
                            } else {
         | 
| 1764 1952 | 
             
                                spl1 = SingleCateg;
         | 
| 1765 1953 | 
             
                                workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
         | 
| 1766 | 
            -
                                std::sort( | 
| 1954 | 
            +
                                std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
         | 
| 1767 1955 | 
             
                                          [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
         | 
| 1768 1956 | 
             
                                set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
         | 
| 1769 1957 |  | 
| @@ -1773,11 +1961,15 @@ void recursive_split_categ(Workspace &workspace, | |
| 1773 1961 | 
             
                                        break;
         | 
| 1774 1962 | 
             
                                    }
         | 
| 1775 1963 | 
             
                                }
         | 
| 1964 | 
            +
                                workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
         | 
| 1776 1965 | 
             
                            }
         | 
| 1777 1966 | 
             
                            break;
         | 
| 1778 1967 | 
             
                        }
         | 
| 1779 1968 |  | 
| 1780 | 
            -
             | 
| 1969 | 
            +
                        default:
         | 
| 1970 | 
            +
                        {
         | 
| 1971 | 
            +
                            unexpected_error();
         | 
| 1972 | 
            +
                        }
         | 
| 1781 1973 | 
             
                    }
         | 
| 1782 1974 |  | 
| 1783 1975 |  | 
| @@ -1871,7 +2063,7 @@ void recursive_split_categ(Workspace &workspace, | |
| 1871 2063 |  | 
| 1872 2064 | 
             
                        if (tree_from == 0) {
         | 
| 1873 2065 | 
             
                            workspace.tree->clear();
         | 
| 1874 | 
            -
                        } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches. | 
| 2066 | 
            +
                        } else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
         | 
| 1875 2067 | 
             
                            (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
         | 
| 1876 2068 | 
             
                            workspace.tree->pop_back();
         | 
| 1877 2069 | 
             
                        } else {
         | 
| @@ -1925,6 +2117,11 @@ void recursive_split_categ(Workspace &workspace, | |
| 1925 2117 | 
             
                                    workspace.tree->pop_back();
         | 
| 1926 2118 | 
             
                                    break;
         | 
| 1927 2119 | 
             
                                }
         | 
| 2120 | 
            +
             | 
| 2121 | 
            +
                                default:
         | 
| 2122 | 
            +
                                {
         | 
| 2123 | 
            +
                                    unexpected_error();
         | 
| 2124 | 
            +
                                }
         | 
| 1928 2125 | 
             
                            }
         | 
| 1929 2126 | 
             
                        }
         | 
| 1930 2127 | 
             
                    }
         |