RubyGems - outliertree - Versions diffs - 0.2.1 → 0.3.1 - Mend

outliertree 0.2.1 → 0.3.1

Files changed (23) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +10 -0
data/NOTICE.txt +1 -1
data/README.md +11 -10
data/ext/outliertree/ext.cpp +23 -0
data/ext/outliertree/extconf.rb +1 -1
data/lib/outliertree/result.rb +3 -3
data/lib/outliertree/version.rb +1 -1
data/vendor/outliertree/README.md +83 -41
data/vendor/outliertree/src/Makevars.in +3 -0
data/vendor/outliertree/src/Makevars.win +3 -0
data/vendor/outliertree/src/RcppExports.cpp +17 -27
data/vendor/outliertree/src/Rwrapper.cpp +354 -62
data/vendor/outliertree/src/cat_outlier.cpp +6 -6
data/vendor/outliertree/src/clusters.cpp +114 -9
data/vendor/outliertree/src/fit_model.cpp +525 -331
data/vendor/outliertree/src/misc.cpp +166 -17
data/vendor/outliertree/src/outlier_tree.hpp +164 -56
data/vendor/outliertree/src/outliertree-win.def +3 -0
data/vendor/outliertree/src/predict.cpp +33 -0
data/vendor/outliertree/src/split.cpp +124 -20
metadata +8 -6
data/vendor/outliertree/src/Makevars +0 -3

data/vendor/outliertree/src/outlier_tree.hpp CHANGED Viewed

@@ -41,23 +41,34 @@
 #include <algorithm>
 #include <numeric>
 #include <unordered_set>
+#include <exception>
+#include <stdexcept>
+#include <cassert>
 #include <math.h>
 #include <cmath>
 #include <stddef.h>
 #include <limits.h>
+#include <limits>
 #include <stdlib.h>
 #include <stddef.h>
 #include <string.h>
+#include <stdint.h>
 #ifdef _OPENMP
     #include <omp.h>
 #endif
+#ifdef _FOR_R
+    #include <Rcpp.h>
+#endif
+#include <signal.h>
+typedef void (*sig_t_)(int);
 /************************
     Short Functions
 *************************/
 #define extract_bit(number, bit) (((number) >> (bit)) & 1) /* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
 #define pow2(n) ( ((size_t) 1) << (n) ) /* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
-#define avg_between(a, b) (((a) + (b)) * 0.5)
+#define avg_between(a, b) ((a) + 0.5*((b) - (a)))
 #define square(x) ((x) * (x))
 #ifndef isinf
     #define isinf std::isinf
@@ -68,7 +79,7 @@
 #define is_na_or_inf(x) (isnan(x) || isinf(x))
 /* Aliasing for compiler optimizations */
-#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
+#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
     #define restrict __restrict
 #else
     #define restrict
@@ -89,6 +100,8 @@
     #define omp_get_thread_num() 0
 #endif
+#define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
 /****************************************************************
     Data types and structs that are returned from this module
@@ -103,6 +116,8 @@ typedef enum SplitType {
 } SplitType;
 typedef enum ColTransf {NoTransf, Log, Exp} ColTransf; /* transformation to apply to numeric column */
+/* TODO: should write serializers for the model objects, but need to somehow deal with long double types */
 /*
 *    1-d clusters that define homogeneous groups in which observations can be outliers.
 *    Note that these are associated to a tree and define one extra condition from what
@@ -114,7 +129,7 @@ typedef struct Cluster {
     size_t col_num = 0; /* number of the column by which it's being split, the target column is given by index of the cluster vector */
     SplitType split_type = Root;
     double split_point = HUGE_VAL; /* numerical */
-    std::vector<char> split_subset = std::vector<char>(); /* categorical */
+    std::vector<signed char> split_subset; /* categorical */
     int split_lev = INT_MAX;    /* ordinal */
     bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
@@ -127,7 +142,7 @@ typedef struct Cluster {
     double    display_lim_high = -HUGE_VAL;                /* numerical target column */
     double    display_mean = -HUGE_VAL;                    /* numerical target column */
     double    display_sd = -HUGE_VAL;                      /* numerical target column */
-    std::vector<char> subset_common = std::vector<char>(); /* categorical or ordinal target column (=0 is common) */
+    std::vector<signed char> subset_common; /* categorical or ordinal target column (=0 is common) */
     double    perc_in_subset = HUGE_VAL;                   /* categorical or ordinal target column */
     double    perc_next_most_comm = -HUGE_VAL;             /* categorical or ordinal target column */ /* TODO */
     int       categ_maj = -1;                              /* when using majority-criterion for categorical outliers */
@@ -156,7 +171,7 @@ typedef struct Cluster {
     }
     /* categorical split */
-    Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
+    Cluster(ColType column_type, size_t col_num, SplitType split_type, signed char *split_subset, int ncat, bool has_NA_branch = false)
     {
         this->column_type = column_type;
         this->col_num = col_num;
@@ -245,21 +260,21 @@ typedef struct Cluster {
 typedef struct ClusterTree {
     size_t parent = 0;              /* index in a vector */
     SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
-    std::vector<size_t> clusters = std::vector<size_t>(); /* these clusters define additional splits */
+    std::vector<size_t> clusters; /* these clusters define additional splits */
     SplitType split_this_branch = Root;                        /* when using 'follow_all' */
-    std::vector<size_t> all_branches = std::vector<size_t>();  /* when using 'follow_all' */
+    std::vector<size_t> all_branches;  /* when using 'follow_all' */
     ColType   column_type = NoType;
     size_t    col_num = 0;
     double    split_point = HUGE_VAL;
-    std::vector<char> split_subset = std::vector<char>();
+    std::vector<signed char> split_subset;
     int split_lev = INT_MAX;
     size_t tree_NA = 0;    /* binary splits */
     size_t tree_left = 0;  /* binary splits */
     size_t tree_right = 0; /* binary splits */
-    std::vector<size_t> binary_branches = std::vector<size_t>(); /* multiple splits (single category or binarized categories) */
+    std::vector<size_t> binary_branches; /* multiple splits (single category or binarized categories) */
     ClusterTree(size_t parent, SplitType parent_branch)
     {
@@ -286,7 +301,7 @@ typedef struct ClusterTree {
         this->split_lev = split_lev;
     }
-    ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
+    ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, signed char *split_subset, int ncat)
     {
         this->parent = parent;
         this->col_num = col_num;
@@ -336,6 +351,8 @@ typedef struct ClusterTree {
 } ClusterTree;
+/* TODO: should separate the results from the actual model object */
 /* these are needed for prediction time, and are thus returned from the function that fits the model */
 typedef struct ModelOutputs {
     std::vector< std::vector<ClusterTree> > all_trees;  /* clusters in which observations can be outliers, required for prediction time */
@@ -370,11 +387,6 @@ typedef struct ModelOutputs {
         archive(
                 this->all_trees,
                 this->all_clusters,
-                this->outlier_scores_final,
-                this->outlier_clusters_final,
-                this->outlier_columns_final,
-                this->outlier_trees_final,
-                this->outlier_depth_final,
                 this->start_ix_cat_counts,
                 this->prop_categ,
                 this->col_transf,
@@ -421,7 +433,69 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
                          size_t max_depth = 3, double max_perc_outliers = 0.01, size_t min_size_numeric = 25, size_t min_size_categ = 50,
                          double min_gain = 1e-2, bool gain_as_pct = false, bool follow_all = false, double z_norm = 2.67, double z_outlier = 8.0);
-typedef struct {
+class ExhaustedColumnTracker
+{
+public:
+    std::vector<bool> is_exhausted;
+    std::vector<size_t> col_indices;
+    std::vector<size_t> n_held;
+    void initialize(size_t ncols, size_t max_depth)
+    {
+        this->is_exhausted.assign(ncols, false);
+        this->n_held.clear();
+        this->n_held.reserve(max_depth+1);
+        this->col_indices.clear();
+        this->col_indices.reserve(ncols);
+    }
+    void push_branch()
+    {
+        this->n_held.push_back(0);
+    }
+    void push_col(size_t col)
+    {
+        this->is_exhausted[col] = true;
+        this->col_indices.push_back(col);
+        this->n_held.back() += 1;
+    }
+    void pop_branch()
+    {
+        size_t col;
+        while (this->n_held.back() > 0)
+        {
+            col = this->col_indices.back();
+            this->is_exhausted[col] = false;
+            this->col_indices.pop_back();
+            this->n_held.back() -= 1;
+        }
+        this->n_held.pop_back();
+    }
+};
+class ExhaustedColumnsLevel
+{
+public:
+    bool pop = false;
+    ExhaustedColumnTracker* tracker = nullptr;
+    ExhaustedColumnsLevel() = default;
+    void initialize(ExhaustedColumnTracker* tracker) {
+        this->pop = true;
+        this->tracker = tracker;
+        this->tracker->push_branch();
+    }
+    ~ExhaustedColumnsLevel() {
+        if (this->pop) {
+            this->tracker->pop_branch();
+            this->pop = false;
+        }
+    }
+};
+struct Workspace {
     std::vector<size_t> ix_arr;           /* indices from the target column */
     size_t st;                            /* chunk of the indices to take for current function calls */
@@ -460,7 +534,7 @@ typedef struct {
     int *untransf_target_col;             /* column as it was before forcibly binarizing (dynamic pointer) */
     int *temp_ptr_x;                      /* dynamic pointer */
-    std::vector<char> buffer_subset_categ_best;  /* categorical split that gave the best gain */
+    std::vector<signed char> buffer_subset_categ_best;  /* categorical split that gave the best gain */
     long double this_gain;                       /* buffer where to store gain */
     double this_split_point;                     /* numeric split threshold */
     int this_split_lev;                          /* ordinal split threshold */
@@ -477,8 +551,8 @@ typedef struct {
     std::vector<size_t>      buffer_crosstab;        /* buffer arrays where to allocate values required by functions and not used outside them */
     std::vector<size_t>      buffer_cat_cnt;         /* buffer arrays where to allocate values required by functions and not used outside them */
     std::vector<size_t>      buffer_cat_sorted;      /* buffer arrays where to allocate values required by functions and not used outside them */
-    std::vector<char>        buffer_subset_categ;    /* buffer arrays where to allocate values required by functions and not used outside them */
-    std::vector<char>        buffer_subset_outlier;  /* buffer arrays where to allocate values required by functions and not used outside them */
+    std::vector<signed char> buffer_subset_categ;    /* buffer arrays where to allocate values required by functions and not used outside them */
+    std::vector<signed char> buffer_subset_outlier;  /* buffer arrays where to allocate values required by functions and not used outside them */
     std::vector<long double> buffer_sd;              /* used for a more numerically-stable two-pass gain calculation */
     bool drop_cluster;          /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
@@ -486,10 +560,14 @@ typedef struct {
     bool target_col_is_ord;     /* whether the target column is ordinal (rest is the same as in categoricals) */
     int  ncat_this;             /* number of categories in the target column */
-} Workspace;
+    ExhaustedColumnTracker exhausted_col_tracker;
+    bool has_zero_variance;
+    bool is_binary_split;
+    bool best_cat_split_is_binary;
+};
 /* info holders to shorten function call arguments */
-typedef struct {
+struct ModelParams {
     bool    categ_as_bin;
     bool    ord_as_bin;
     bool    cat_bruteforce_subset;
@@ -506,16 +584,16 @@ typedef struct {
     double  z_outlier;
     double  z_tail;
     std::vector<long double> prop_small; /* this is not a parameter, but a shared array determined from the parameters and data */
-} ModelParams;
+};
 /* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
-typedef struct {
+struct InputData {
     double  *restrict numeric_data;     size_t ncols_numeric;
     int     *restrict categorical_data; size_t ncols_categ;   int *restrict ncat;
     int     *restrict ordinal_data;     size_t ncols_ord;     int *restrict ncat_ord;
     size_t  nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ;
     std::vector<size_t> cat_counts;
-} InputData;
+};
 void process_numeric_col(std::vector<Cluster> &cluster_root,
@@ -547,12 +625,12 @@ void recursive_split_categ(Workspace &workspace,
     (This is the module from which
      new data can be flagged as outliers)
 ********************************************/
-typedef struct {
+struct PredictionData {
     double  *restrict numeric_data;
     int     *restrict categorical_data;
     int     *restrict ordinal_data;
     size_t nrows;
-} PredictionData;
+};
 bool find_new_outliers(double *restrict numeric_data,
                        int    *restrict categorical_data,
@@ -570,19 +648,21 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
 *********************************/
 #define SD_REG 1e-5 /* Regularization for standard deviation estimation */
-typedef struct {
+/* TODO: should make long doubles optional */
+struct NumericBranch {
     size_t      cnt;
     long double sum;
     long double sum_sq;
-} NumericBranch;
+};
-typedef struct {
+struct NumericSplit {
     NumericBranch NA_branch    = {0, 0, 0};
     NumericBranch left_branch  = {0, 0, 0};
     NumericBranch right_branch = {0, 0, 0};
-} NumericSplit;
+};
-typedef struct {
+struct CategSplit {
     size_t *restrict NA_branch;     /* array of counts of the target variable's categories */
     size_t *restrict left_branch;   /* array of counts of the target variable's categories */
     size_t *restrict right_branch;  /* array of counts of the target variable's categories */
@@ -591,11 +671,11 @@ typedef struct {
     size_t size_NA    = 0;
     size_t size_left  = 0;
     size_t size_right = 0;
-} CategSplit;
+};
-void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, bool onehot[]);
+void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, signed char onehot[]);
 size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x);
-void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
+void flag_zero_counts(signed char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
 long double calc_sd(size_t cnt, long double sum, long double sum_sq);
 long double calc_sd(NumericBranch &branch);
 long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean);
@@ -610,23 +690,25 @@ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size
                                   size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
 void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
                              long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
-                             long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
+                             long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
 void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
                            bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
                            long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
-                           bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point);
+                           bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset, int *restrict split_point, bool *restrict has_zero_variance, bool *restrict binary_split);
 void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
                            size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
                            bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
-                           size_t *restrict split_left, size_t *restrict split_NA);
+                           size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
 void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
                        size_t ncat_y, size_t ncat_x, long double base_info,
                        size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
-                       bool has_na, size_t min_size, long double *gain, int *split_point);
+                       bool has_na, size_t min_size, long double *gain, int *split_point,
+                       bool *restrict has_zero_variance, bool *restrict binary_split);
 void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
                        size_t ncat_x, long double base_info,
                        size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
-                       bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
+                       bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
+                       bool *restrict has_zero_variance, bool *restrict binary_split);
 void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
                                   size_t ncat_x, size_t ncat_y, long double base_info,
                                   size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
@@ -634,7 +716,8 @@ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end
 void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
                                 size_t ncat_x, size_t ncat_y, long double base_info,
                                 size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
-                                bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
+                                bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
+                                bool *restrict has_zero_variance, bool *restrict binary_split);
@@ -642,8 +725,8 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
     Prototypes from clusters.cpp
 ************************************/
 #define calculate_max_outliers(n, perc) (  (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - perc) ) + (long double)1  )
-#define z_score(x, mu, sd) (  ((x) - (mu)) / (sd)  )
-#define chebyshyov_bound(sd) (1.0 / square(sd))
+#define z_score(x, mu, sd) (  ((x) - (mu)) / std::max((sd), 1e-12)  )
+#define chebyshyov_bound(zval) (1.0 / std::max(square(zval), 1.))
 bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
                               double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
@@ -654,7 +737,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
 void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
                                   double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                                   size_t *restrict outlier_depth, Cluster &cluster,
-                                  size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
+                                  size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm);
 bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
                           double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                           size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
@@ -662,7 +745,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
                           double max_perc_outliers, double z_norm, double z_outlier,
                           long double *restrict perc_threshold, long double *restrict prop_prior,
                           size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
-                          size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
+                          size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
                           bool *restrict drop_cluster);
 void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]);
 void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]);
@@ -680,12 +763,12 @@ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size
 #define calculate_max_cat_outliers(n, perc, z_norm) ((long double)1 + ((n) * (perc) / z_norm)) /* Note: this is not probabilistic in any way, nor based on provable bounds */
 void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
                              long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
-                             double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
+                             double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
 void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
-                                    long double prior_prob[], double z_outlier, char is_outlier[],
+                                    long double prior_prob[], double z_outlier, signed char is_outlier[],
                                     bool *found_outliers, bool *new_is_outlier, int *categ_maj);
 bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
-                                     char is_outlier[], double *next_most_comm);
+                                     signed char is_outlier[], double *next_most_comm);
@@ -699,7 +782,7 @@ typedef struct {
     double gain_best_restore;
     double split_point_restore;
     int    split_lev_restore;
-    std::vector<char> split_subset_restore;
+    std::vector<signed char> split_subset_restore;
     size_t ix1_restore;
     size_t ix2_restore;
     size_t ix3_restore;
@@ -709,26 +792,27 @@ typedef struct {
     ColType col_type_best_rememer;
     double split_point_best_restore;
     int    split_lev_best_restore;
-    std::vector<char> split_subset_best_restore;
+    std::vector<signed char> split_subset_best_restore;
     long double base_info_restore;
     long double base_info_orig_restore;
     double sd_y_restore;
     bool has_outliers_restore;
     bool lev_has_outliers_restore;
+    bool is_binary_split_restore;
 } RecursionState;
-int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ = 0);
+int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ = 0);
 void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
                               int categorical_data[], size_t ncols, size_t nrows,
-                              bool has_NA[], bool skip_col[], int nthreads);
+                              char has_NA[], char skip_col[], int nthreads);
 void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
-                                size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads);
+                                size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads);
 void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
                                    size_t start_ix_cat_counts[], size_t cat_counts[],
                                    size_t ncols, size_t nrows, double z_norm, double z_tail);
-void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
-                               bool skip_col[], int min_decimals[], int nthreads);
+void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
+                               char skip_col[], int min_decimals[], int nthreads);
 void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central);
 void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
                      double z_norm, double max_perc_outliers,
@@ -739,14 +823,14 @@ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t s
 size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA);
 size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end);
 void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right);
-void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
+void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
 void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right);
 bool check_workspace_is_allocated(Workspace &workspace);
 void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
 void backup_recursion_state(Workspace &workspace, RecursionState &state_backup);
 void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
 void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
-void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col);
+void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col);
 void set_tree_as_categorical(ClusterTree &tree, size_t col);
 void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat);
 void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
@@ -756,3 +840,27 @@ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nth
 void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
 int decimals_diff(double val1, double val2);
 void dealloc_ModelOutputs(ModelOutputs &model_outputs);
+ModelOutputs get_empty_ModelOutputs();
+bool get_has_openmp();
+extern bool interrupt_switch;
+extern bool handle_is_locked;
+void set_interrup_global_variable(int s);
+class SignalSwitcher
+{
+public:
+    sig_t_ old_sig;
+    bool is_active;
+    SignalSwitcher();
+    ~SignalSwitcher();
+    void restore_handle();
+};
+void check_interrupt_switch(SignalSwitcher &ss);
+#ifdef _FOR_PYTHON
+bool cy_check_interrupt_switch();
+void cy_tick_off_interrupt_switch();
+#endif
+size_t log2ceil(size_t v);
+#ifdef _FOR_PYTHON
+ModelOutputs deepcopy(const ModelOutputs &inp);
+#endif

data/vendor/outliertree/src/outliertree-win.def ADDED Viewed

@@ -0,0 +1,3 @@
+LIBRARY outliertree.dll
+EXPORTS
+ R_init_outliertree

data/vendor/outliertree/src/predict.cpp CHANGED Viewed

@@ -226,6 +226,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
                                                      true : found_outliers;
                                 break;
                             }
+                            default:
+                            {
+                                assert(0);
+                            }
                         }
                         break;
                     }
@@ -279,6 +284,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
                                                      true : found_outliers;
                                 break;
                             }
+                            default:
+                            {
+                                assert(0);
+                            }
                         }
                         break;
                     }
@@ -332,10 +342,16 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
                                                      true : found_outliers;
                                 break;
                             }
+                            default:
+                            {
+                                assert(0);
+                            }
                         }
                         break;
                     }
+                    default: {}
                 }
             }
         }
@@ -548,6 +564,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
                             if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
                             break;
                         }
+                        default:
+                        {
+                            assert(0);
+                        }
                     }
                     break;
                 }
@@ -587,6 +608,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
                             break;
                         }
+                        default:
+                        {
+                            assert(0);
+                        }
                         /* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
                     }
                     break;
@@ -626,6 +652,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
                             if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
                             break;
                         }
+                        default:
+                        {
+                            assert(0);
+                        }
                     }
                     break;
                 }
@@ -645,6 +676,8 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
                                                             model_outputs.all_clusters[col][cl].cluster_sd
                                                             )
                     );
+                    if (is_na_or_inf(outlier_score))
+                        outlier_score = 1. - 1e-15;
                 } else {
                     outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
                 }