outliertree 0.1.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +104 -105
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +10 -8
- data/vendor/outliertree/src/Makevars +0 -3
| @@ -41,23 +41,34 @@ | |
| 41 41 | 
             
            #include <algorithm>
         | 
| 42 42 | 
             
            #include <numeric>
         | 
| 43 43 | 
             
            #include <unordered_set>
         | 
| 44 | 
            +
            #include <exception>
         | 
| 45 | 
            +
            #include <stdexcept>
         | 
| 46 | 
            +
            #include <cassert>
         | 
| 44 47 | 
             
            #include <math.h>
         | 
| 45 48 | 
             
            #include <cmath>
         | 
| 46 49 | 
             
            #include <stddef.h>
         | 
| 47 50 | 
             
            #include <limits.h>
         | 
| 51 | 
            +
            #include <limits>
         | 
| 48 52 | 
             
            #include <stdlib.h>
         | 
| 49 53 | 
             
            #include <stddef.h>
         | 
| 50 54 | 
             
            #include <string.h>
         | 
| 55 | 
            +
            #include <stdint.h>
         | 
| 51 56 | 
             
            #ifdef _OPENMP
         | 
| 52 57 | 
             
                #include <omp.h>
         | 
| 53 58 | 
             
            #endif
         | 
| 59 | 
            +
            #ifdef _FOR_R
         | 
| 60 | 
            +
                #include <Rcpp.h>
         | 
| 61 | 
            +
            #endif
         | 
| 62 | 
            +
            #include <signal.h>
         | 
| 63 | 
            +
            typedef void (*sig_t_)(int);
         | 
| 64 | 
            +
             | 
| 54 65 |  | 
| 55 66 | 
             
            /************************
         | 
| 56 67 | 
             
                Short Functions
         | 
| 57 68 | 
             
            *************************/
         | 
| 58 69 | 
             
            #define extract_bit(number, bit) (((number) >> (bit)) & 1) /* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
         | 
| 59 70 | 
             
            #define pow2(n) ( ((size_t) 1) << (n) ) /* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
         | 
| 60 | 
            -
            #define avg_between(a, b) (( | 
| 71 | 
            +
            #define avg_between(a, b) ((a) + 0.5*((b) - (a)))
         | 
| 61 72 | 
             
            #define square(x) ((x) * (x))
         | 
| 62 73 | 
             
            #ifndef isinf
         | 
| 63 74 | 
             
                #define isinf std::isinf
         | 
| @@ -68,7 +79,7 @@ | |
| 68 79 | 
             
            #define is_na_or_inf(x) (isnan(x) || isinf(x))
         | 
| 69 80 |  | 
| 70 81 | 
             
            /* Aliasing for compiler optimizations */
         | 
| 71 | 
            -
            #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
         | 
| 82 | 
            +
            #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
         | 
| 72 83 | 
             
                #define restrict __restrict
         | 
| 73 84 | 
             
            #else
         | 
| 74 85 | 
             
                #define restrict 
         | 
| @@ -89,6 +100,8 @@ | |
| 89 100 | 
             
                #define omp_get_thread_num() 0
         | 
| 90 101 | 
             
            #endif
         | 
| 91 102 |  | 
| 103 | 
            +
            #define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
         | 
| 104 | 
            +
             | 
| 92 105 |  | 
| 93 106 | 
             
            /****************************************************************
         | 
| 94 107 | 
             
                Data types and structs that are returned from this module
         | 
| @@ -103,6 +116,8 @@ typedef enum SplitType { | |
| 103 116 | 
             
            } SplitType;
         | 
| 104 117 | 
             
            typedef enum ColTransf {NoTransf, Log, Exp} ColTransf; /* transformation to apply to numeric column */
         | 
| 105 118 |  | 
| 119 | 
            +
            /* TODO: should write serializers for the model objects, but need to somehow deal with long double types */
         | 
| 120 | 
            +
             | 
| 106 121 | 
             
            /*    
         | 
| 107 122 | 
             
            *    1-d clusters that define homogeneous groups in which observations can be outliers.
         | 
| 108 123 | 
             
            *    Note that these are associated to a tree and define one extra condition from what
         | 
| @@ -114,7 +129,7 @@ typedef struct Cluster { | |
| 114 129 | 
             
                size_t col_num = 0; /* numer of the column by which its being split, the target column is given by index of the cluster vector */
         | 
| 115 130 | 
             
                SplitType split_type = Root;
         | 
| 116 131 | 
             
                double split_point = HUGE_VAL; /* numerical */
         | 
| 117 | 
            -
                std::vector<char> split_subset | 
| 132 | 
            +
                std::vector<signed char> split_subset; /* categorical */
         | 
| 118 133 | 
             
                int split_lev = INT_MAX;    /* ordinal */
         | 
| 119 134 | 
             
                bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
         | 
| 120 135 |  | 
| @@ -127,7 +142,7 @@ typedef struct Cluster { | |
| 127 142 | 
             
                double    display_lim_high = -HUGE_VAL;                /* numerical target column */
         | 
| 128 143 | 
             
                double    display_mean = -HUGE_VAL;                    /* numerical target column */
         | 
| 129 144 | 
             
                double    display_sd = -HUGE_VAL;                      /* numerical target column */
         | 
| 130 | 
            -
                std::vector<char> subset_common | 
| 145 | 
            +
                std::vector<signed char> subset_common; /* categorical or ordinal target column (=0 is common) */
         | 
| 131 146 | 
             
                double    perc_in_subset = HUGE_VAL;                   /* categorical or ordinal target column */
         | 
| 132 147 | 
             
                double    perc_next_most_comm = -HUGE_VAL;             /* categorical or ordinal target column */ /* TODO */
         | 
| 133 148 | 
             
                int       categ_maj = -1;                              /* when using majority-criterion for categorical outliers */
         | 
| @@ -156,7 +171,7 @@ typedef struct Cluster { | |
| 156 171 | 
             
                }
         | 
| 157 172 |  | 
| 158 173 | 
             
                /* categorical split */
         | 
| 159 | 
            -
                Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
         | 
| 174 | 
            +
                Cluster(ColType column_type, size_t col_num, SplitType split_type, signed char *split_subset, int ncat, bool has_NA_branch = false)
         | 
| 160 175 | 
             
                {
         | 
| 161 176 | 
             
                    this->column_type = column_type;
         | 
| 162 177 | 
             
                    this->col_num = col_num;
         | 
| @@ -245,21 +260,21 @@ typedef struct Cluster { | |
| 245 260 | 
             
            typedef struct ClusterTree {
         | 
| 246 261 | 
             
                size_t parent = 0;              /* index in a vector */
         | 
| 247 262 | 
             
                SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
         | 
| 248 | 
            -
                std::vector<size_t> clusters | 
| 263 | 
            +
                std::vector<size_t> clusters; /* these clusters define additional splits */
         | 
| 249 264 |  | 
| 250 265 | 
             
                SplitType split_this_branch = Root;                        /* when using 'follow_all' */
         | 
| 251 | 
            -
                std::vector<size_t> all_branches | 
| 266 | 
            +
                std::vector<size_t> all_branches;  /* when using 'follow_all' */
         | 
| 252 267 |  | 
| 253 268 | 
             
                ColType   column_type = NoType;
         | 
| 254 269 | 
             
                size_t    col_num = 0;
         | 
| 255 270 | 
             
                double    split_point = HUGE_VAL;
         | 
| 256 | 
            -
                std::vector<char> split_subset | 
| 271 | 
            +
                std::vector<signed char> split_subset;
         | 
| 257 272 | 
             
                int split_lev = INT_MAX;
         | 
| 258 273 |  | 
| 259 274 | 
             
                size_t tree_NA = 0;    /* binary splits */
         | 
| 260 275 | 
             
                size_t tree_left = 0;  /* binary splits */
         | 
| 261 276 | 
             
                size_t tree_right = 0; /* binary splits */
         | 
| 262 | 
            -
                std::vector<size_t> binary_branches | 
| 277 | 
            +
                std::vector<size_t> binary_branches; /* multiple splits (single category or binarized categories) */
         | 
| 263 278 |  | 
| 264 279 | 
             
                ClusterTree(size_t parent, SplitType parent_branch)
         | 
| 265 280 | 
             
                {
         | 
| @@ -286,7 +301,7 @@ typedef struct ClusterTree { | |
| 286 301 | 
             
                    this->split_lev = split_lev;
         | 
| 287 302 | 
             
                }
         | 
| 288 303 |  | 
| 289 | 
            -
                ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
         | 
| 304 | 
            +
                ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, signed char *split_subset, int ncat)
         | 
| 290 305 | 
             
                {
         | 
| 291 306 | 
             
                    this->parent = parent;
         | 
| 292 307 | 
             
                    this->col_num = col_num;
         | 
| @@ -336,6 +351,8 @@ typedef struct ClusterTree { | |
| 336 351 |  | 
| 337 352 | 
             
            } ClusterTree;
         | 
| 338 353 |  | 
| 354 | 
            +
            /* TODO: should separate the results from the actual model object */
         | 
| 355 | 
            +
             | 
| 339 356 | 
             
            /* these are needed for prediction time, and are thus returned from the function that fits the model */
         | 
| 340 357 | 
             
            typedef struct ModelOutputs {
         | 
| 341 358 | 
             
                std::vector< std::vector<ClusterTree> > all_trees;  /* clusters in which observations can be outliers, required for prediction time */
         | 
| @@ -370,11 +387,6 @@ typedef struct ModelOutputs { | |
| 370 387 | 
             
                    archive(
         | 
| 371 388 | 
             
                            this->all_trees,
         | 
| 372 389 | 
             
                            this->all_clusters,
         | 
| 373 | 
            -
                            this->outlier_scores_final,
         | 
| 374 | 
            -
                            this->outlier_clusters_final,
         | 
| 375 | 
            -
                            this->outlier_columns_final,
         | 
| 376 | 
            -
                            this->outlier_trees_final,
         | 
| 377 | 
            -
                            this->outlier_depth_final,
         | 
| 378 390 | 
             
                            this->start_ix_cat_counts,
         | 
| 379 391 | 
             
                            this->prop_categ,
         | 
| 380 392 | 
             
                            this->col_transf,
         | 
| @@ -421,7 +433,69 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 421 433 | 
             
                                     size_t max_depth = 3, double max_perc_outliers = 0.01, size_t min_size_numeric = 25, size_t min_size_categ = 50,
         | 
| 422 434 | 
             
                                     double min_gain = 1e-2, bool gain_as_pct = false, bool follow_all = false, double z_norm = 2.67, double z_outlier = 8.0);
         | 
| 423 435 |  | 
| 424 | 
            -
             | 
| 436 | 
            +
            /*
            *    Bookkeeping of which columns are currently flagged as "exhausted", organized
            *    as a stack of branches: every column registered through 'push_col' is charged
            *    to the most recently opened branch, and closing that branch with 'pop_branch'
            *    clears the flags of exactly those columns.
            *    NOTE(review): presumably "exhausted" marks columns that should no longer be
            *    tried further down the current branch - confirm against the fitting code.
            */
            class ExhaustedColumnTracker
            {
            public:
                std::vector<bool> is_exhausted;   /* one flag per column, true while the column is held by an open branch */
                std::vector<size_t> col_indices;  /* columns flagged so far, in the order in which they were pushed */
                std::vector<size_t> n_held;       /* for each open branch, how many entries of 'col_indices' belong to it */

                /* Resets all state: no column flagged, no branch open. Storage is reserved up-front
                   for 'ncols' held columns and 'max_depth'+1 open branches. */
                void initialize(size_t ncols, size_t max_depth)
                {
                    is_exhausted.assign(ncols, false);

                    col_indices.clear();
                    col_indices.reserve(ncols);

                    n_held.clear();
                    n_held.reserve(max_depth + 1);
                }

                /* Opens a new branch which initially holds no columns. */
                void push_branch()
                {
                    n_held.push_back(0);
                }

                /* Flags 'col' as exhausted and charges it to the innermost open branch.
                   There must be an open branch (see 'push_branch') when this is called. */
                void push_col(size_t col)
                {
                    is_exhausted[col] = true;
                    col_indices.push_back(col);
                    ++n_held.back();
                }

                /* Closes the innermost open branch, un-flagging (newest first) every column
                   that was registered since the matching 'push_branch'. */
                void pop_branch()
                {
                    for (size_t pending = n_held.back(); pending > 0; pending--)
                    {
                        is_exhausted[col_indices.back()] = false;
                        col_indices.pop_back();
                    }

                    n_held.pop_back();
                }
            };
         | 
| 478 | 
            +
             | 
| 479 | 
            +
            class ExhaustedColumnsLevel
         | 
| 480 | 
            +
            {
         | 
| 481 | 
            +
            public:
         | 
| 482 | 
            +
                bool pop = false;
         | 
| 483 | 
            +
                ExhaustedColumnTracker* tracker = nullptr;
         | 
| 484 | 
            +
                ExhaustedColumnsLevel() = default;
         | 
| 485 | 
            +
                void initialize(ExhaustedColumnTracker* tracker) {
         | 
| 486 | 
            +
                    this->pop = true;
         | 
| 487 | 
            +
                    this->tracker = tracker;
         | 
| 488 | 
            +
                    this->tracker->push_branch();
         | 
| 489 | 
            +
                }
         | 
| 490 | 
            +
                ~ExhaustedColumnsLevel() {
         | 
| 491 | 
            +
                    if (this->pop) {
         | 
| 492 | 
            +
                        this->tracker->pop_branch();
         | 
| 493 | 
            +
                        this->pop = false;
         | 
| 494 | 
            +
                    }
         | 
| 495 | 
            +
                }
         | 
| 496 | 
            +
            };
         | 
| 497 | 
            +
             | 
| 498 | 
            +
            struct Workspace {
         | 
| 425 499 |  | 
| 426 500 | 
             
                std::vector<size_t> ix_arr;           /* indices from the target column */
         | 
| 427 501 | 
             
                size_t st;                            /* chunk of the indices to take for current function calls */
         | 
| @@ -460,7 +534,7 @@ typedef struct { | |
| 460 534 | 
             
                int *untransf_target_col;             /* column as it was before forcibly binarizing (dynamic pointer) */
         | 
| 461 535 | 
             
                int *temp_ptr_x;                      /* dynamic pointer */
         | 
| 462 536 |  | 
| 463 | 
            -
                std::vector<char> buffer_subset_categ_best;  /* categorical split that gave the best gain */
         | 
| 537 | 
            +
                std::vector<signed char> buffer_subset_categ_best;  /* categorical split that gave the best gain */
         | 
| 464 538 | 
             
                long double this_gain;                       /* buffer where to store gain */
         | 
| 465 539 | 
             
                double this_split_point;                     /* numeric split threshold */
         | 
| 466 540 | 
             
                int this_split_lev;                          /* ordinal split threshold */
         | 
| @@ -477,8 +551,8 @@ typedef struct { | |
| 477 551 | 
             
                std::vector<size_t>      buffer_crosstab;        /* buffer arrays where to allocate values required by functions and not used outside them */
         | 
| 478 552 | 
             
                std::vector<size_t>      buffer_cat_cnt;         /* buffer arrays where to allocate values required by functions and not used outside them */
         | 
| 479 553 | 
             
                std::vector<size_t>      buffer_cat_sorted;      /* buffer arrays where to allocate values required by functions and not used outside them */
         | 
| 480 | 
            -
                std::vector<char> | 
| 481 | 
            -
                std::vector<char> | 
| 554 | 
            +
                std::vector<signed char> buffer_subset_categ;    /* buffer arrays where to allocate values required by functions and not used outside them */
         | 
| 555 | 
            +
                std::vector<signed char> buffer_subset_outlier;  /* buffer arrays where to allocate values required by functions and not used outside them */
         | 
| 482 556 | 
             
                std::vector<long double> buffer_sd;              /* used for a more numerically-stable two-pass gain calculation */
         | 
| 483 557 |  | 
| 484 558 | 
             
                bool drop_cluster;          /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
         | 
| @@ -486,10 +560,14 @@ typedef struct { | |
| 486 560 | 
             
                bool target_col_is_ord;     /* whether the target column is ordinal (rest is the same as in categoricals) */
         | 
| 487 561 | 
             
                int  ncat_this;             /* number of categories in the target column */
         | 
| 488 562 |  | 
| 489 | 
            -
             | 
| 563 | 
            +
                ExhaustedColumnTracker exhausted_col_tracker;
         | 
| 564 | 
            +
                bool has_zero_variance;
         | 
| 565 | 
            +
                bool is_binary_split;
         | 
| 566 | 
            +
                bool best_cat_split_is_binary;
         | 
| 567 | 
            +
            };
         | 
| 490 568 |  | 
| 491 569 | 
             
            /* info holders to shorten function call arguments */
         | 
| 492 | 
            -
             | 
| 570 | 
            +
            struct ModelParams {
         | 
| 493 571 | 
             
                bool    categ_as_bin;
         | 
| 494 572 | 
             
                bool    ord_as_bin;
         | 
| 495 573 | 
             
                bool    cat_bruteforce_subset;
         | 
| @@ -506,16 +584,16 @@ typedef struct { | |
| 506 584 | 
             
                double  z_outlier;
         | 
| 507 585 | 
             
                double  z_tail;
         | 
| 508 586 | 
             
                std::vector<long double> prop_small; /* this is not a parameter, but a shared array determined from the parameters and data */
         | 
| 509 | 
            -
            } | 
| 587 | 
            +
            };
         | 
| 510 588 |  | 
| 511 589 | 
             
            /* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
         | 
| 512 | 
            -
             | 
| 590 | 
            +
            struct InputData {
         | 
| 513 591 | 
             
                double  *restrict numeric_data;     size_t ncols_numeric;
         | 
| 514 592 | 
             
                int     *restrict categorical_data; size_t ncols_categ;   int *restrict ncat;
         | 
| 515 593 | 
             
                int     *restrict ordinal_data;     size_t ncols_ord;     int *restrict ncat_ord;
         | 
| 516 594 | 
             
                size_t  nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ;
         | 
| 517 595 | 
             
                std::vector<size_t> cat_counts;
         | 
| 518 | 
            -
            } | 
| 596 | 
            +
            };
         | 
| 519 597 |  | 
| 520 598 |  | 
| 521 599 | 
             
            void process_numeric_col(std::vector<Cluster> &cluster_root,
         | 
| @@ -547,12 +625,12 @@ void recursive_split_categ(Workspace &workspace, | |
| 547 625 | 
             
                (This is the module from which
         | 
| 548 626 | 
             
                 new data can be flagged as outliers)
         | 
| 549 627 | 
             
            ********************************************/
         | 
| 550 | 
            -
             | 
| 628 | 
            +
            struct PredictionData {
         | 
| 551 629 | 
             
                double  *restrict numeric_data;
         | 
| 552 630 | 
             
                int     *restrict categorical_data;
         | 
| 553 631 | 
             
                int     *restrict ordinal_data;
         | 
| 554 632 | 
             
                size_t nrows;
         | 
| 555 | 
            -
            } | 
| 633 | 
            +
            };
         | 
| 556 634 |  | 
| 557 635 | 
             
            bool find_new_outliers(double *restrict numeric_data,
         | 
| 558 636 | 
             
                                   int    *restrict categorical_data,
         | 
| @@ -570,19 +648,21 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr | |
| 570 648 | 
             
            *********************************/
         | 
| 571 649 | 
             
            #define SD_REG 1e-5 /* Regularization for standard deviation estimation */
         | 
| 572 650 |  | 
| 573 | 
            -
             | 
| 651 | 
            +
            /* TODO: should make long doubles optional */
         | 
| 652 | 
            +
             | 
| 653 | 
            +
            struct NumericBranch {
         | 
| 574 654 | 
             
                size_t      cnt;
         | 
| 575 655 | 
             
                long double sum;
         | 
| 576 656 | 
             
                long double sum_sq;
         | 
| 577 | 
            -
            } | 
| 657 | 
            +
            };
         | 
| 578 658 |  | 
| 579 | 
            -
             | 
| 659 | 
            +
            struct NumericSplit {
         | 
| 580 660 | 
             
                NumericBranch NA_branch    = {0, 0, 0};
         | 
| 581 661 | 
             
                NumericBranch left_branch  = {0, 0, 0};
         | 
| 582 662 | 
             
                NumericBranch right_branch = {0, 0, 0};
         | 
| 583 | 
            -
            } | 
| 663 | 
            +
            };
         | 
| 584 664 |  | 
| 585 | 
            -
             | 
| 665 | 
            +
            struct CategSplit {
         | 
| 586 666 | 
             
                size_t *restrict NA_branch;     /* array of counts of the target variable's categories */
         | 
| 587 667 | 
             
                size_t *restrict left_branch;   /* array of counts of the target variable's categories */
         | 
| 588 668 | 
             
                size_t *restrict right_branch;  /* array of counts of the target variable's categories */
         | 
| @@ -591,11 +671,11 @@ typedef struct { | |
| 591 671 | 
             
                size_t size_NA    = 0;
         | 
| 592 672 | 
             
                size_t size_left  = 0;
         | 
| 593 673 | 
             
                size_t size_right = 0;
         | 
| 594 | 
            -
            } | 
| 674 | 
            +
            };
         | 
| 595 675 |  | 
| 596 | 
            -
            void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot,  | 
| 676 | 
            +
            void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, signed char onehot[]);
         | 
| 597 677 | 
             
            size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x);
         | 
| 598 | 
            -
            void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
         | 
| 678 | 
            +
            void flag_zero_counts(signed char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
         | 
| 599 679 | 
             
            long double calc_sd(size_t cnt, long double sum, long double sum_sq);
         | 
| 600 680 | 
             
            long double calc_sd(NumericBranch &branch);
         | 
| 601 681 | 
             
            long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean);
         | 
| @@ -610,23 +690,25 @@ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size | |
| 610 690 | 
             
                                              size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
         | 
| 611 691 | 
             
            void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
         | 
| 612 692 | 
             
                                         long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
         | 
| 613 | 
            -
                                         long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
         | 
| 693 | 
            +
                                         long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
         | 
| 614 694 | 
             
            void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
         | 
| 615 695 | 
             
                                       bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
         | 
| 616 696 | 
             
                                       long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
         | 
| 617 | 
            -
                                       bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point);
         | 
| 697 | 
            +
                                       bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset, int *restrict split_point, bool *restrict has_zero_variance, bool *restrict binary_split);
         | 
| 618 698 | 
             
            void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
         | 
| 619 699 | 
             
                                       size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
         | 
| 620 700 | 
             
                                       bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
         | 
| 621 | 
            -
                                       size_t *restrict split_left, size_t *restrict split_NA);
         | 
| 701 | 
            +
                                       size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
         | 
| 622 702 | 
             
            void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
         | 
| 623 703 | 
             
                                   size_t ncat_y, size_t ncat_x, long double base_info,
         | 
| 624 704 | 
             
                                   size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
         | 
| 625 | 
            -
                                   bool has_na, size_t min_size, long double *gain, int *split_point);
         | 
| 705 | 
            +
                                   bool has_na, size_t min_size, long double *gain, int *split_point,
         | 
| 706 | 
            +
                                   bool *restrict has_zero_variance, bool *restrict binary_split);
         | 
| 626 707 | 
             
            void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
         | 
| 627 708 | 
             
                                   size_t ncat_x, long double base_info,
         | 
| 628 709 | 
             
                                   size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
         | 
| 629 | 
            -
                                   bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
         | 
| 710 | 
            +
                                   bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
         | 
| 711 | 
            +
                                   bool *restrict has_zero_variance, bool *restrict binary_split);
         | 
| 630 712 | 
             
            void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
         | 
| 631 713 | 
             
                                              size_t ncat_x, size_t ncat_y, long double base_info,
         | 
| 632 714 | 
             
                                              size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
         | 
| @@ -634,7 +716,8 @@ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end | |
| 634 716 | 
             
            void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
         | 
| 635 717 | 
             
                                            size_t ncat_x, size_t ncat_y, long double base_info,
         | 
| 636 718 | 
             
                                            size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
         | 
| 637 | 
            -
                                            bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
         | 
| 719 | 
            +
                                            bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
         | 
| 720 | 
            +
                                            bool *restrict has_zero_variance, bool *restrict binary_split);
         | 
| 638 721 |  | 
| 639 722 |  | 
| 640 723 |  | 
| @@ -642,8 +725,8 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, | |
| 642 725 | 
             
                Prototypes from clusters.cpp
         | 
| 643 726 | 
             
            ************************************/
         | 
| 644 727 | 
             
            #define calculate_max_outliers(n, perc) (  (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - perc) ) + (long double)1  )
         | 
| 645 | 
            -
            #define z_score(x, mu, sd) (  ((x) - (mu)) / (sd)  )
         | 
| 646 | 
            -
            #define chebyshyov_bound( | 
| 728 | 
            +
            #define z_score(x, mu, sd) (  ((x) - (mu)) / std::max((sd), 1e-12)  )
         | 
| 729 | 
            +
            #define chebyshyov_bound(zval) (1.0 / std::max(square(zval), 1.))
         | 
| 647 730 |  | 
| 648 731 | 
             
            bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
         | 
| 649 732 | 
             
                                          double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
         | 
| @@ -654,7 +737,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 654 737 | 
             
            void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
         | 
| 655 738 | 
             
                                              double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
         | 
| 656 739 | 
             
                                              size_t *restrict outlier_depth, Cluster &cluster,
         | 
| 657 | 
            -
                                              size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
         | 
| 740 | 
            +
                                              size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm);
         | 
| 658 741 | 
             
            bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
         | 
| 659 742 | 
             
                                      double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
         | 
| 660 743 | 
             
                                      size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
         | 
| @@ -662,7 +745,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s | |
| 662 745 | 
             
                                      double max_perc_outliers, double z_norm, double z_outlier,
         | 
| 663 746 | 
             
                                      long double *restrict perc_threshold, long double *restrict prop_prior,
         | 
| 664 747 | 
             
                                      size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
         | 
| 665 | 
            -
                                      size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
         | 
| 748 | 
            +
                                      size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
         | 
| 666 749 | 
             
                                      bool *restrict drop_cluster);
         | 
| 667 750 | 
             
            void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]);
         | 
| 668 751 | 
             
            void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]);
         | 
| @@ -680,12 +763,12 @@ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size | |
| 680 763 | 
             
            #define calculate_max_cat_outliers(n, perc, z_norm) ((long double)1 + ((n) * (perc) / z_norm)) /* Note: this is not anyhow probabilistic, nor based on provable bounds */
         | 
| 681 764 | 
             
            void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
         | 
| 682 765 | 
             
                                         long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
         | 
| 683 | 
            -
                                         double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
         | 
| 766 | 
            +
                                         double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
         | 
| 684 767 | 
             
            void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
         | 
| 685 | 
            -
                                                long double prior_prob[], double z_outlier, char is_outlier[],
         | 
| 768 | 
            +
                                                long double prior_prob[], double z_outlier, signed char is_outlier[],
         | 
| 686 769 | 
             
                                                bool *found_outliers, bool *new_is_outlier, int *categ_maj);
         | 
| 687 770 | 
             
            bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
         | 
| 688 | 
            -
                                                 char is_outlier[], double *next_most_comm);
         | 
| 771 | 
            +
                                                 signed char is_outlier[], double *next_most_comm);
         | 
| 689 772 |  | 
| 690 773 |  | 
| 691 774 |  | 
| @@ -699,7 +782,7 @@ typedef struct { | |
| 699 782 | 
             
                double gain_best_restore;
         | 
| 700 783 | 
             
                double split_point_restore;
         | 
| 701 784 | 
             
                int    split_lev_restore;
         | 
| 702 | 
            -
                std::vector<char> split_subset_restore;
         | 
| 785 | 
            +
                std::vector<signed char> split_subset_restore;
         | 
| 703 786 | 
             
                size_t ix1_restore;
         | 
| 704 787 | 
             
                size_t ix2_restore;
         | 
| 705 788 | 
             
                size_t ix3_restore;
         | 
| @@ -709,12 +792,13 @@ typedef struct { | |
| 709 792 | 
             
                ColType col_type_best_rememer;
         | 
| 710 793 | 
             
                double split_point_best_restore;
         | 
| 711 794 | 
             
                int    split_lev_best_restore;
         | 
| 712 | 
            -
                std::vector<char> split_subset_best_restore;
         | 
| 795 | 
            +
                std::vector<signed char> split_subset_best_restore;
         | 
| 713 796 | 
             
                long double base_info_restore;
         | 
| 714 797 | 
             
                long double base_info_orig_restore;
         | 
| 715 798 | 
             
                double sd_y_restore;
         | 
| 716 799 | 
             
                bool has_outliers_restore;
         | 
| 717 800 | 
             
                bool lev_has_outliers_restore;
         | 
| 801 | 
            +
                bool is_binary_split_restore;
         | 
| 718 802 | 
             
            } RecursionState;
         | 
| 719 803 |  | 
| 720 804 |  | 
| @@ -739,14 +823,14 @@ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t s | |
| 739 823 | 
             
            size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA);
         | 
| 740 824 | 
             
            size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end);
         | 
| 741 825 | 
             
            void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right);
         | 
| 742 | 
            -
            void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
         | 
| 826 | 
            +
            void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
         | 
| 743 827 | 
             
            void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right);
         | 
| 744 828 | 
             
            bool check_workspace_is_allocated(Workspace &workspace);
         | 
| 745 829 | 
             
            void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
         | 
| 746 830 | 
             
            void backup_recursion_state(Workspace &workspace, RecursionState &state_backup);
         | 
| 747 831 | 
             
            void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
         | 
| 748 832 | 
             
            void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
         | 
| 749 | 
            -
            void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col);
         | 
| 833 | 
            +
            void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col);
         | 
| 750 834 | 
             
            void set_tree_as_categorical(ClusterTree &tree, size_t col);
         | 
| 751 835 | 
             
            void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat);
         | 
| 752 836 | 
             
            void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
         | 
| @@ -756,3 +840,27 @@ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nth | |
| 756 840 | 
             
            void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
         | 
| 757 841 | 
             
            int decimals_diff(double val1, double val2);
         | 
| 758 842 | 
             
            void dealloc_ModelOutputs(ModelOutputs &model_outputs);
         | 
| 843 | 
            +
            ModelOutputs get_empty_ModelOutputs();
         | 
| 844 | 
            +
            bool get_has_openmp();
         | 
| 845 | 
            +
             | 
| 846 | 
            +
            extern bool interrupt_switch;
         | 
| 847 | 
            +
            extern bool handle_is_locked;
         | 
| 848 | 
            +
            void set_interrup_global_variable(int s);
         | 
| 849 | 
            +
            class SignalSwitcher
         | 
| 850 | 
            +
            {
         | 
| 851 | 
            +
            public:
         | 
| 852 | 
            +
                sig_t_ old_sig;
         | 
| 853 | 
            +
                bool is_active;
         | 
| 854 | 
            +
                SignalSwitcher();
         | 
| 855 | 
            +
                ~SignalSwitcher();
         | 
| 856 | 
            +
                void restore_handle();
         | 
| 857 | 
            +
            };
         | 
| 858 | 
            +
            void check_interrupt_switch(SignalSwitcher &ss);
         | 
| 859 | 
            +
            #ifdef _FOR_PYTHON
         | 
| 860 | 
            +
            bool cy_check_interrupt_switch();
         | 
| 861 | 
            +
            void cy_tick_off_interrupt_switch();
         | 
| 862 | 
            +
            #endif
         | 
| 863 | 
            +
            size_t log2ceil(size_t v);
         | 
| 864 | 
            +
            #ifdef _FOR_PYTHON
         | 
| 865 | 
            +
            ModelOutputs deepcopy(const ModelOutputs &inp);
         | 
| 866 | 
            +
            #endif
         | 
| @@ -226,6 +226,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s | |
| 226 226 | 
             
                                                                 true : found_outliers;
         | 
| 227 227 | 
             
                                            break;
         | 
| 228 228 | 
             
                                        }
         | 
| 229 | 
            +
             | 
| 230 | 
            +
                                        default:
         | 
| 231 | 
            +
                                        {
         | 
| 232 | 
            +
                                            assert(0);
         | 
| 233 | 
            +
                                        }
         | 
| 229 234 | 
             
                                    }
         | 
| 230 235 | 
             
                                    break;
         | 
| 231 236 | 
             
                                }
         | 
| @@ -279,6 +284,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s | |
| 279 284 | 
             
                                                                 true : found_outliers;
         | 
| 280 285 | 
             
                                            break;
         | 
| 281 286 | 
             
                                        }
         | 
| 287 | 
            +
             | 
| 288 | 
            +
                                        default:
         | 
| 289 | 
            +
                                        {
         | 
| 290 | 
            +
                                            assert(0);
         | 
| 291 | 
            +
                                        }
         | 
| 282 292 | 
             
                                    }
         | 
| 283 293 | 
             
                                    break;
         | 
| 284 294 | 
             
                                }
         | 
| @@ -332,10 +342,16 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s | |
| 332 342 | 
             
                                                                 true : found_outliers;
         | 
| 333 343 | 
             
                                            break;
         | 
| 334 344 | 
             
                                        }
         | 
| 345 | 
            +
             | 
| 346 | 
            +
                                        default:
         | 
| 347 | 
            +
                                        {
         | 
| 348 | 
            +
                                            assert(0);
         | 
| 349 | 
            +
                                        }
         | 
| 335 350 | 
             
                                    }
         | 
| 336 351 | 
             
                                    break;
         | 
| 337 352 | 
             
                                }
         | 
| 338 353 |  | 
| 354 | 
            +
                                default: {}
         | 
| 339 355 | 
             
                            }
         | 
| 340 356 | 
             
                        }
         | 
| 341 357 | 
             
                    }
         | 
| @@ -548,6 +564,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr | |
| 548 564 | 
             
                                        if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
         | 
| 549 565 | 
             
                                        break;
         | 
| 550 566 | 
             
                                    }
         | 
| 567 | 
            +
             | 
| 568 | 
            +
                                    default:
         | 
| 569 | 
            +
                                    {
         | 
| 570 | 
            +
                                        assert(0);
         | 
| 571 | 
            +
                                    }
         | 
| 551 572 | 
             
                                }
         | 
| 552 573 | 
             
                                break;
         | 
| 553 574 | 
             
                            }
         | 
| @@ -587,6 +608,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr | |
| 587 608 | 
             
                                        break;
         | 
| 588 609 | 
             
                                    }
         | 
| 589 610 |  | 
| 611 | 
            +
                                    default:
         | 
| 612 | 
            +
                                    {
         | 
| 613 | 
            +
                                        assert(0);
         | 
| 614 | 
            +
                                    }
         | 
| 615 | 
            +
             | 
| 590 616 | 
             
                                    /* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
         | 
| 591 617 | 
             
                                }
         | 
| 592 618 | 
             
                                break;
         | 
| @@ -626,6 +652,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr | |
| 626 652 | 
             
                                        if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
         | 
| 627 653 | 
             
                                        break;
         | 
| 628 654 | 
             
                                    }
         | 
| 655 | 
            +
             | 
| 656 | 
            +
                                    default:
         | 
| 657 | 
            +
                                    {
         | 
| 658 | 
            +
                                        assert(0);
         | 
| 659 | 
            +
                                    }
         | 
| 629 660 | 
             
                                }
         | 
| 630 661 | 
             
                                break;
         | 
| 631 662 | 
             
                            }
         | 
| @@ -645,6 +676,8 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr | |
| 645 676 | 
             
                                                                        model_outputs.all_clusters[col][cl].cluster_sd
         | 
| 646 677 | 
             
                                                                        )
         | 
| 647 678 | 
             
                                );
         | 
| 679 | 
            +
                                if (is_na_or_inf(outlier_score))
         | 
| 680 | 
            +
                                    outlier_score = 1. - 1e-15;
         | 
| 648 681 | 
             
                            } else {
         | 
| 649 682 | 
             
                                outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
         | 
| 650 683 | 
             
                            }
         |