outliertree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,328 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
10
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
11
+ * arXiv preprint arXiv:2001.00636 (2020).
12
+ *
13
+ *
14
+ * Copyright 2020 David Cortes.
15
+ *
16
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
+ * such as R or Python.
18
+ *
19
+ * This file is part of OutlierTree.
20
+ *
21
+ * OutlierTree is free software: you can redistribute it and/or modify
22
+ * it under the terms of the GNU General Public License as published by
23
+ * the Free Software Foundation, either version 3 of the License, or
24
+ * (at your option) any later version.
25
+ *
26
+ * OutlierTree is distributed in the hope that it will be useful,
27
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
+ * GNU General Public License for more details.
30
+ *
31
+ * You should have received a copy of the GNU General Public License
32
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
33
+ ********************************************************************************************************************/
34
+ #include "outlier_tree.hpp"
35
+
36
+
37
+ /* Check whether to consider any category as outlier, based on current counts and prior probabilities
38
+ *
39
+ * Function is to be applied to some subset of the data obtained by splitting by one or more columns.
40
+ * For outliers before any split there is a separate function. Note that since it requires the current
41
+ * probability to be lower than prior probability in order to consider as outlier, it cannot be
42
+ * used with the full data (only with subsets).
43
+ *
44
+ * Parameters:
45
+ * - categ_counts[ncateg] (in)
46
+ * Counts of each category in the subset (including non-present categories).
47
+ * - ncateg (in)
48
+ * Number of categories for this column (including non-present categories).
49
+ * - tot (in)
50
+ * Number of rows in the subset.
51
+ * - max_perc_outliers (in)
52
+ * Model parameter. Default value is 0.01.
53
+ * - perc_threshold[ncateg] (in)
54
+ * Threshold for the proportion/probability of each category below which it can be considered
55
+ * to be an outlier in a subset of the data. Note that in addition it will build a confidence
56
+ * interval here which might make it even smaller.
57
+ * - buffer_ix[ncateg] (temp)
58
+ * Buffer where to store indices of categories sorted by proportion.
59
+ * - buffer_perc[ncateg] (temp)
60
+ * Buffer where to store proportions of counts.
61
+ * - z_norm (in)
62
+ * Model parameter. Default value is 2.67.
63
+ * - is_outlier[ncateg] (out)
64
+ * Array where to define whether any category is an outlier. Values will be as follows:
65
+ * (-1) -> Category had zero count, but would be an outlier if it appeared among this group
66
+ * 0 -> Category is not an outlier
67
+ * (+1) -> Category is an outlier
68
+ * - found_outliers (out)
69
+ * Whether there were any outliers identified among the counts.
70
+ * - new_is_outlier (out)
71
+ * Whether any of the categories with zero count would be flagged as outlier if they appeared in this group.
72
+ * - next_most_comm (out)
73
+ * Proportion of the least common category that is not flagged as outlier.
74
+ */
75
+ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
76
+ long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
77
+ double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier,
78
+ double *next_most_comm)
79
+ {
80
+ //TODO: must also establish bounds for new, unseen categories
81
+
82
+ /* initialize parameters as needed */
83
+ *found_outliers = false;
84
+ *new_is_outlier = false;
85
+ size_t st_non_zero = 0;
86
+ size_t end_tail = 0;
87
+ size_t max_outliers = (size_t) calculate_max_cat_outliers((long double)tot, max_perc_outliers, z_norm);
88
+ long double tot_dbl = (long double) tot;
89
+ long double pct_unseen = (long double)1 / (long double)(tot + 1);
90
+ size_t size_tail = 0;
91
+
92
+ /* reset the temporary arrays and fill them */
93
+ memset(is_outlier, 0, ncateg * sizeof(char));
94
+ for (size_t cat = 0; cat < ncateg; cat++) {
95
+ buffer_ix[cat] = cat;
96
+ buffer_perc[cat] = (categ_counts[cat] > 0)? ((long double)categ_counts[cat] / tot_dbl) : 0;
97
+ }
98
+
99
+ /* sort the categories by counts */
100
+ std::sort(buffer_ix, buffer_ix + ncateg,
101
+ [&categ_counts](const size_t a, const size_t b){return categ_counts[a] < categ_counts[b];});
102
+
103
+ /* find the first non-zero */
104
+ for (size_t cat = 0; cat < ncateg; cat++) {
105
+ if (categ_counts[ buffer_ix[cat] ] > 0) {
106
+ st_non_zero = cat;
107
+ break;
108
+ }
109
+ }
110
+
111
+ /* check that least common is not common enough to be normal */
112
+ if (categ_counts[ buffer_ix[st_non_zero] ] > max_outliers) return;
113
+
114
+ /* find tail among non-zero proportions
115
+ * a tail is considered to be so if:
116
+ * - the difference is above z_norm sd's of either proportion
117
+ * - the difference is greater than some fraction of the larger
118
+ * - the actual proportion here is lower than a CI of the prior proportion
119
+ * - the actual proportion here is half or less of the prior proportion
120
+ */
121
+ for (size_t cat = st_non_zero; cat < ncateg - 1; cat++) {
122
+ if (
123
+ (
124
+ (buffer_perc[buffer_ix[cat + 1]] - buffer_perc[buffer_ix[cat]])
125
+ >
126
+ z_norm * sqrtl(
127
+ fmaxl(
128
+ buffer_perc[buffer_ix[cat + 1]] * ((long double)1 - buffer_perc[buffer_ix[cat + 1]]),
129
+ buffer_perc[buffer_ix[cat]] * ((long double)1 - buffer_perc[buffer_ix[cat]])
130
+ )
131
+ / tot_dbl
132
+ )
133
+ )
134
+ &&
135
+ (
136
+ buffer_perc[buffer_ix[cat + 1]] * 0.5 > buffer_perc[buffer_ix[cat]]
137
+ )
138
+ )
139
+ {
140
+ end_tail = cat;
141
+ *next_most_comm = buffer_perc[buffer_ix[cat + 1]];
142
+ break;
143
+ }
144
+ }
145
+
146
+ /* if the tail is too long, don't identify any as outlier, but see if unseen categories (with prior > 0) would create a new tail */
147
+ for (size_t cat = st_non_zero; cat <= end_tail; cat++) size_tail += categ_counts[ buffer_ix[cat] ];
148
+
149
+ if (size_tail >= max_outliers) {
150
+
151
+ if (
152
+ st_non_zero == 0 ||
153
+ // ((long double)buffer_ix[buffer_ix[st_non_zero]] / (tot_dbl + 1)) * 0.5 <= pct_unseen ||
154
+ ( ((long double)buffer_ix[buffer_ix[st_non_zero]] * 0.5) / (tot_dbl + 1)) <= pct_unseen ||
155
+ ((long double)(buffer_ix[buffer_ix[st_non_zero]] - 1) / (tot_dbl + 1))
156
+ - (long double)z_norm * sqrtl(buffer_perc[buffer_ix[st_non_zero]] * ((long double)1 - buffer_perc[buffer_ix[st_non_zero]]) / tot_dbl)
157
+ >= pct_unseen
158
+ ) return;
159
+
160
+ for (size_t cat = 0; cat < st_non_zero; cat++) {
161
+ if (perc_threshold[buffer_ix[cat]] > pct_unseen) {
162
+ *new_is_outlier = true;
163
+ is_outlier[buffer_ix[cat]] = -1;
164
+ }
165
+ }
166
+ *next_most_comm = buffer_perc[buffer_ix[st_non_zero]];
167
+ return;
168
+
169
+ }
170
+
171
+ /* now determine if any category in the tail is an outlier */
172
+ for (size_t cat = st_non_zero; cat <= end_tail; cat++) {
173
+
174
+ /* must have a proportion below CI and below half of prior */
175
+ if (buffer_perc[buffer_ix[cat]] < perc_threshold[buffer_ix[cat]]) {
176
+ is_outlier[buffer_ix[cat]] = 1;
177
+ *found_outliers = true;
178
+ }
179
+ }
180
+
181
+ /* check if any new categories would be outliers */
182
+ if (st_non_zero > 0) {
183
+ for (size_t cat = 0; cat < st_non_zero; cat++) {
184
+ if (perc_threshold[buffer_ix[cat]] > pct_unseen) {
185
+ *new_is_outlier = true;
186
+ is_outlier[buffer_ix[cat]] = -1;
187
+ }
188
+ }
189
+ }
190
+ if (*new_is_outlier && !(*found_outliers)) {
191
+ *next_most_comm = buffer_perc[buffer_ix[st_non_zero]];
192
+ }
193
+
194
+ }
195
+
196
+ /* Check whether to consider any category as outlier, based on majority category and prior probabilities
197
+ *
198
+ * Function is to be applied to some subset of the data obtained by splitting by one or more columns.
199
+ * For outliers before any split there is a separate function. This is an alternative to the "tail"
200
+ * approach above which is more in line with GritBot.
201
+ *
202
+ * Parameters:
203
+ * - categ_counts[ncateg] (in)
204
+ * Counts of each category in the subset (including non-present categories).
205
+ * - ncateg (in)
206
+ * Number of categories for this column (including non-present categories).
207
+ * - tot (in)
208
+ * Number of rows in the subset.
209
+ * - max_perc_outliers (in)
210
+ * Model parameter. Default value is 0.01.
211
+ * - prior_prob[ncateg] (in)
212
+ * Proportions that each category had in the full data.
213
+ * - z_outlier (in)
214
+ * Model parameter. Default value is 8.0
215
+ * - is_outlier[ncateg] (out)
216
+ * Array where to define whether any category is an outlier. Values will be as follows:
217
+ * (-1) -> Category had zero count, but would be an outlier if it appeared among this group
218
+ * 0 -> Category is not an outlier
219
+ * (+1) -> Category is an outlier
220
+ * - found_outliers (out)
221
+ * Whether there were any outliers identified among the counts.
222
+ * - new_is_outlier (out)
223
+ * Whether any of the categories with zero count would be flagged as outlier if they appeared in this group.
224
+ * - categ_maj (out)
225
+ * Category to which the majority of the observations belong.
226
+ */
227
+ void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
228
+ long double prior_prob[], double z_outlier, char is_outlier[],
229
+ bool *found_outliers, bool *new_is_outlier, int *categ_maj)
230
+ {
231
+ /* initialize parameters as needed */
232
+ *found_outliers = false;
233
+ *new_is_outlier = false;
234
+ memset(is_outlier, 0, ncateg * sizeof(char));
235
+ size_t max_outliers = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
236
+ long double tot_dbl = (long double) (tot + 1);
237
+ size_t n_non_maj;
238
+ long double thr_prop = (double)1 / square(z_outlier);
239
+
240
+ /* check if any can be considered as outlier */
241
+ size_t *ptr_maj = std::max_element(categ_counts, categ_counts + ncateg);
242
+ *categ_maj = (int)(ptr_maj - categ_counts);
243
+ n_non_maj = tot - *ptr_maj;
244
+ if (n_non_maj > max_outliers)
245
+ return;
246
+
247
+ /* determine proportions and check for outlierness */
248
+ long double n_non_maj_dbl = (long double) n_non_maj;
249
+ for (size_t cat = 0; cat < ncateg; cat++) {
250
+
251
+ if ((int)cat == *categ_maj) continue;
252
+
253
+ if ( (n_non_maj_dbl / (tot_dbl * prior_prob[cat])) < thr_prop ) {
254
+ if (categ_counts[cat]) {
255
+ is_outlier[cat] = 1;
256
+ *found_outliers = true;
257
+ } else {
258
+ is_outlier[cat] = -1;
259
+ *new_is_outlier = true;
260
+ }
261
+ }
262
+ }
263
+
264
+ /* TODO: implement formula for flagging unsen categories (not in the sample, nor the full data) as outliers */
265
+ }
266
+
267
+
268
+ /* Check whether to consider any category as outlier before splitting, based on prior counts
269
+ *
270
+ * Follows very rough criteria: there can be at most 1-3 outliers depending on size of dataset,
271
+ * and the next most common category must have a count of at least 250.
272
+ *
273
+ * Parameters:
274
+ * - categ_counts[ncateg] (in)
275
+ * Frequencies of each category in the full data.
276
+ * - ncateg (in)
277
+ * Number of categories with non-zero count.
278
+ * - tot (in)
279
+ * Number of rows.
280
+ * - is_outlier[ncateg] (out)
281
+ * Array indicating whether any category is outlier (0 = non-outlier, 1 = outlier).
282
+ * - next_most_comm (out)
283
+ * Proportion of the least common non-outlier category.
284
+ */
285
/* Returns whether a single rare category was flagged as outlier in 'is_outlier',
   and if so writes the proportion of the least common non-outlier category
   into 'next_most_comm'. See the block comment above for parameter details. */
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
                                     char is_outlier[], double *next_most_comm)
{
    /* if sample is too small, don't flag any as outliers */
    if (tot < 1000) return false;

    /* set a very low outlier threshold with a hard limit of 3 */
    size_t max_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);

    /* will only consider a category as outlier if the next most common is very common */
    size_t min_freq_next_most_comm = 250;

    /* look if there's any category meeting the first condition and none meeting the second one */
    bool has_outlier_cat = false;
    memset(is_outlier, 0, sizeof(char) * ncateg);
    for (size_t cat = 0; cat < ncateg; cat++) {

        /* a mid-frequency category (not rare enough to be an outlier, yet not
           common enough to support one) disqualifies the whole column */
        if (categ_counts[cat] > max_outliers && categ_counts[cat] < min_freq_next_most_comm) {
            has_outlier_cat = false;
            break;
        }

        if (categ_counts[cat] > 0 && categ_counts[cat] <= max_outliers) {
            /* can only have 1 outlier category in the whole column */
            if (has_outlier_cat) { has_outlier_cat = false; break; }

            has_outlier_cat = true;
            is_outlier[cat] = 1;
        }

    }

    /* FIX: if the column was disqualified after a category had already been flagged
       (second rare category, or a mid-frequency category found later in the scan),
       the stale flag must not be left behind in the output array */
    if (!has_outlier_cat) {
        memset(is_outlier, 0, sizeof(char) * ncateg);
        return false;
    }

    /* outlier found: record the next most common frequency for printed statistics.
       FIX: the minimum-scan seed was INT_MAX, which is the wrong limit for size_t
       counts; use SIZE_MAX so categories with counts above INT_MAX are handled */
    size_t next_most_comm_cat = SIZE_MAX;
    for (size_t cat = 0; cat < ncateg; cat++) {
        if (categ_counts[cat] > 0 && !is_outlier[cat]) {
            next_most_comm_cat = std::min(next_most_comm_cat, categ_counts[cat]);
        }
    }
    *next_most_comm = (long double)next_most_comm_cat / (long double)tot;

    return true;
}
@@ -0,0 +1,972 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
10
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
11
+ * arXiv preprint arXiv:2001.00636 (2020).
12
+ *
13
+ *
14
+ * Copyright 2020 David Cortes.
15
+ *
16
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
+ * such as R or Python.
18
+ *
19
+ * This file is part of OutlierTree.
20
+ *
21
+ * OutlierTree is free software: you can redistribute it and/or modify
22
+ * it under the terms of the GNU General Public License as published by
23
+ * the Free Software Foundation, either version 3 of the License, or
24
+ * (at your option) any later version.
25
+ *
26
+ * OutlierTree is distributed in the hope that it will be useful,
27
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
+ * GNU General Public License for more details.
30
+ *
31
+ * You should have received a copy of the GNU General Public License
32
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
33
+ ********************************************************************************************************************/
34
+ #include "outlier_tree.hpp"
35
+
36
+
37
+
38
+ /* Characterize a homogenous 1-dimensional cluster
39
+ *
40
+ * Calculates limits and display statistics on the distribution of one numerical variable,
41
+ * flagging potential outliers if found. Can be run on the full data or on subsets obtained from splitting
42
+ * by other variables.
43
+ *
44
+ * In order to flag an observation as outlier, it must:
45
+ * * Be in a very small/large percentile of the subset passed here.
46
+ * * Have a large absolute Z value (standardized and centered).
47
+ * * Have a large gap in the Z value with respect to the next largest/smallest observation.
48
+ * * Not be in a long tail (unless the variable was transformed by exponentiating or taking logarithm).
49
+ *
50
+ * Parameters:
51
+ * - x[n] (in)
52
+ * Variable for which to define the cluster.
53
+ * - ix_arr[n] (in)
54
+ * Indices to take from the array above.
55
+ * - st (in)
56
+ * Position at which ix_arr starts (inclusive).
57
+ * - end (in)
58
+ * Position at which ix_arr ends (inclusive).
59
+ * - outlier_scores[n] (in, out)
60
+ * Outlier scores (based on chebyshyov's inequality) that are already assigned to the observations from this column
61
+ * from previous runs of this function in larger subsets (should be started to 1).
62
+ * - outlier_clusters[n] (in, out)
63
+ * Cluster number under which an observation is the most anomalous.
64
+ * - outlier_trees[n] (in, out)
65
+ * Tree under which the outlier cluster assigned lies.
66
+ * - outlier_depth[n] (in, out)
67
+ * Tree depth at which the outlier cluster assigned is found.
68
+ * - cluster (in, out)
69
+ * Outlier cluster object with statistics and limits.
70
+ * - clusters (in)
71
+ * Vector containing all cluster already generated.
72
+ * - cluster_num (in)
73
+ * Number to give to this cluster.
74
+ * - tree_num (in)
75
+ * Number of the tree under which this cluster is to be found.
76
+ * - tree_depth (in)
77
+ * Distance from the tree root at which this tree is to be found.
78
+ * - is_log_transf (in)
79
+ * Whether the column 'x' has undergone a logarithmic transformation.
80
+ * - log_minval (in)
81
+ * Value that was added to 'x' before taking its logarithm (if it was log-transformed).
82
+ * - is_exp_transf (in)
83
+ * Whether the column 'x' has undergone an exponential transformation on its standardized values.
84
+ * - orig_mean (in)
85
+ * Mean of the variable 'x' before being standardized (if it was exponentiated).
86
+ * - orig_sd (in)
87
+ * Standard deviation of the variable 'x' before being standardized (if it was exponentiated).
88
+ * - left_tail (in)
89
+ * Value of 'x' after which it is considered a long tail, in which outliers will not be searched for.
90
+ * - right_tail (in)
91
+ * Value of 'x' before which it is considered a long tail, in which outliers will not be searched for.
92
+ * - orig_x (in)
93
+ * Original values of 'x' if it was transformed (log or exp).
94
+ * - max_perc_outliers (in)
95
+ * Model parameter. Default is 0.01.
96
+ * - z_norm (in)
97
+ * Model parameter. Default is 2.67.
98
+ * - z_outlier (in)
99
+ * Model parameter. Default is 8.0. Must be greater than z_norm.
100
+ *
101
+ * Returns:
102
+ * - Whether there were any outliers detected.
103
+ */
104
bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
                              double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                              size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
                              size_t cluster_num, size_t tree_num, size_t tree_depth,
                              bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
                              double left_tail, double right_tail, double *restrict orig_x,
                              double max_perc_outliers, double z_norm, double z_outlier)
{

    /* TODO: this function could try to determine if the distribution is multimodal, and if so,
       take only the most extreme means/sd for outlier comparisons */

    /* TODO: statistics like SD, mean; are already available from the splitting function which
       is called right before this, so these should *only* need to be recalculated them if the column
       has undergone log or exp transform */

    /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
    bool has_low_values = false;   /* set if the minimum qualifies as outlier */
    bool has_high_values = false;  /* set if the maximum qualifies as outlier */
    long double running_mean = 0;
    long double mean_prev = 0;
    long double running_ssq = 0;   /* running sum of squared deviations (Welford-style update below) */
    double xval;
    double mean;
    double sd;
    size_t cnt;
    /* number of observations at each extreme to exclude when computing mean/sd */
    size_t tail_size = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
    size_t st_non_tail = st + tail_size;
    size_t end_non_tail = end - tail_size;
    size_t st_normals = 0;   /* first index (in sorted order) NOT flagged as low outlier; 0 = none found */
    size_t end_normals = 0;  /* last index NOT flagged as high outlier; 0 = none found */
    /* minimum jump in z-score between consecutive sorted values for a gap to separate outliers */
    double min_gap = z_outlier - z_norm;

    /* TODO: here it's not necessary to sort the whole data, only top/bottom N */

    /* sort the data (indirectly, through the index array) */
    std::sort(ix_arr + st, ix_arr + end + 1, [&x](const size_t a, const size_t b){return x[a] < x[b];});

    /* calculate statistics with tails and previous outliers excluded,
       using a numerically-stable single-pass mean/ssq update */
    cnt = end_non_tail - st_non_tail + 1;
    for (size_t row = st_non_tail; row <= end_non_tail; row++) {
        xval = x[ ix_arr[row] ];
        running_mean += (xval - running_mean) / (long double)(row - st_non_tail + 1);
        running_ssq += (xval - running_mean) * (xval - mean_prev);
        mean_prev = running_mean;

    }
    mean = (double) running_mean;
    sd = (double) sqrtl(running_ssq / (long double)(cnt - 1));

    /* adjust SD heuristically to account for reduced size, by (N + tail)/(N-tail) --- note that cnt = N-2*tail */
    sd *= (long double)(cnt + 3 * tail_size) / (long double)(cnt + tail_size);
    /* re-adjust if there's a one-sided tail and no transformation was applied */
    if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
        sd *= 0.5;
    }
    cluster.cluster_mean = mean;
    cluster.cluster_sd = sd;
    cnt = end - st + 1;   /* from here on, cnt is the full subset size */

    /* see if the minimum and/or maximum values qualify for outliers
       (must be beyond z_outlier SDs and not inside a declared long tail) */
    if (-z_score(x[ix_arr[st]], mean, sd) >= z_outlier && x[ix_arr[st]] > left_tail) has_low_values = true;
    if ( z_score(x[ix_arr[end]], mean, sd) >= z_outlier && x[ix_arr[end]] < right_tail) has_high_values = true;

    /* look for a large gap in the z-scores on the low end */
    if (has_low_values) {
        for (size_t row = st; row < st + tail_size; row++) {

            if (( z_score(x[ix_arr[row + 1]], mean, sd) - z_score(x[ix_arr[row]], mean, sd) ) >= min_gap) {
                st_normals = row + 1;
                /* the limit is expressed back in the original (untransformed) scale */
                if (is_exp_transf) {
                    cluster.lower_lim = log(x[ix_arr[row + 1]] - min_gap * sd) * orig_sd + orig_mean;
                } else if (is_log_transf) {
                    cluster.lower_lim = exp(x[ix_arr[row + 1]] - min_gap * sd) + log_minval;
                } else {
                    cluster.lower_lim = x[ix_arr[row + 1]] - min_gap * sd;
                }
                cluster.display_lim_low = orig_x[ix_arr[row + 1]];
                cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
                break;
            }
            /* stop scanning once values are no longer extreme enough to be outliers */
            if (z_score(x[ix_arr[row]], mean, sd) > -z_outlier) break;

        }
        if (st_normals == 0) {
            /* no separating gap found within the allowed tail -> no low outliers */
            has_low_values = false;
        } else {
            for (size_t row = st; row < st_normals; row++) {

                /* assign outlier if it's a better cluster than previously assigned:
                   prefer unscored rows, then non-NA-branch clusters, then shallower
                   trees, then (at equal depth) larger clusters */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                        &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (cnt - 2 * tail_size)
                            )
                        )
                    )
                )
                {
                    outlier_scores[ix_arr[row]] = chebyshyov_bound(z_score(x[ix_arr[row]], mean, sd));
                    if (is_na_or_inf(outlier_scores[ix_arr[row]])) outlier_scores[ix_arr[row]] = 0;
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }

            }
        }
    }
    /* no low outliers: the lower limit covers everything down to the bound (or -inf in a tail) */
    if (!has_low_values) {
        cluster.perc_above = 1.0;
        if (!is_log_transf && !is_exp_transf) {

            if (isinf(left_tail)) {
                cluster.lower_lim = x[ix_arr[st]] - min_gap * sd;
            } else {
                cluster.lower_lim = -HUGE_VAL;
            }

        } else if (is_exp_transf) {
            cluster.lower_lim = log(x[ix_arr[st]] - min_gap * sd) * orig_sd + orig_mean;
        } else {
            cluster.lower_lim = exp(x[ix_arr[st]] - min_gap * sd) + log_minval;
        }

        cluster.display_lim_low = orig_x[ix_arr[st]];

    }

    /* mirror of the low-end logic, applied to the high end */
    if (has_high_values) {
        for (size_t row = end; row > (end - tail_size); row--) {

            if (( z_score(x[ix_arr[row]], mean, sd) - z_score(x[ix_arr[row - 1]], mean, sd) ) >= min_gap) {
                end_normals = row - 1;
                if (is_exp_transf) {
                    cluster.upper_lim = log(x[ix_arr[row - 1]] + min_gap * sd) * orig_sd + orig_mean;
                } else if (is_log_transf) {
                    cluster.upper_lim = exp(x[ix_arr[row - 1]] + min_gap * sd) + log_minval;
                } else {
                    cluster.upper_lim = x[ix_arr[row - 1]] + min_gap * sd;
                }
                cluster.display_lim_high = orig_x[ix_arr[row - 1]];
                cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
                break;
            }
            if (z_score(x[ix_arr[row]], mean, sd) < z_outlier) break;

        }
        if (end_normals == 0) {
            has_high_values = false;
        } else {
            for (size_t row = end; row > end_normals; row--) {

                /* assign outlier if it's a better cluster than previously assigned - Note that it might produce slight mismatches
                   against the predict function (the latter is more trustable) due to the size of the cluster not yet being known
                   at the moment of determining whether to overwrite previous in here */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                        &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (cnt - 2 * tail_size)
                            )
                        )
                    )
                )
                {
                    outlier_scores[ix_arr[row]] = chebyshyov_bound(z_score(x[ix_arr[row]], mean, sd));
                    if (is_na_or_inf(outlier_scores[ix_arr[row]])) outlier_scores[ix_arr[row]] = 0;
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }

            }
        }
    }
    if (!has_high_values) {
        cluster.perc_below = 1.0;
        if (!is_log_transf && !is_exp_transf) {

            if (isinf(right_tail)) {
                cluster.upper_lim = x[ix_arr[end]] + min_gap * sd;
            } else {
                cluster.upper_lim = HUGE_VAL;
            }
        } else if (is_exp_transf) {
            cluster.upper_lim = log(x[ix_arr[end]] + min_gap * sd) * orig_sd + orig_mean;
        } else {
            cluster.upper_lim = exp(x[ix_arr[end]] + min_gap * sd) + log_minval;
        }

        cluster.display_lim_high = orig_x[ix_arr[end]];
    }

    /* save displayed statistics for cluster: recompute mean/sd on the original
       scale over the non-outlier range when outliers or transforms are present */
    if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
        size_t st_disp = has_low_values? st_normals : st;
        size_t end_disp = has_high_values? end_normals : end;
        running_mean = 0;
        mean_prev = 0;
        running_ssq = 0;
        for (size_t row = st_disp; row <= end_disp; row++) {
            xval = orig_x[ix_arr[row]];
            running_mean += (xval - running_mean) / (long double)(row - st_disp + 1);
            running_ssq += (xval - running_mean) * (xval - mean_prev);
            mean_prev = running_mean;
        }
        cluster.cluster_size = end_disp - st_disp + 1;
        cluster.display_mean = (double) running_mean;
        cluster.display_sd = (double) sqrtl(running_ssq / (long double)(cluster.cluster_size - 1));
    } else {
        cluster.display_mean = cluster.cluster_mean;
        cluster.display_sd = cluster.cluster_sd;
        cluster.cluster_size = end - st + 1;
    }

    /* report whether outliers were found or not */
    return has_low_values || has_high_values;
}
336
+
337
+
338
+ /* Characterize a homogeneous categorical cluster from the *full* data
339
+ *
340
+ * Function is meant for the data as it comes, before splitting it, as once split, it will
341
+ * not be able to detect these outliers. As such, it takes fewer parameters, since it can only
342
+ * be the first tree and cluster in a column. It assumes the outliers have already been identified.
343
+ *
344
+ * Parameters:
345
+ * - x[n]
346
+ * Array indicating the category to which each observation belongs.
347
+ * - ix_arr[n] (in)
348
+ * Indices to take from the array above.
349
+ * - st (in)
350
+ * Position at which ix_arr starts (inclusive).
351
+ * - end (in)
352
+ * Position at which ix_arr ends (inclusive).
353
+ * - ncateg (in)
354
+ * Number of categories in this column.
355
+ * - outlier_scores[n] (in, out)
356
+ * Array where to assign outlier scores (based on proportion) to each observation belonging to an outlier category.
357
+ * - outlier_clusters[n] (in, out)
358
+ * Array where to assign cluster number to each observation belonging to an outlier category.
359
+ * - outlier_trees[n] (in, out)
360
+ * Array where to assign tree number to each observation belonging to an outlier category.
361
+ * - outlier_depth[n] (in, out)
362
+ * Array where to assign tree depth to each observation belonging to an outlier category.
363
+ * - cluster (in, out)
364
+ * Outlier cluster object with statistics and classifications.
365
+ * - categ_counts[ncateg] (in)
366
+ * Array with the frequencies of each category in the data.
367
+ * - is_outlier[ncateg] (in)
368
+ * Array indicating which categories are to be considered as outliers (must be already calculated).
369
+ * - perc_next_most_comm (in)
370
+ * Proportion of the least common non-outlier category (must be already calculated).
371
+ */
372
+ void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
373
+ double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
374
+ size_t *restrict outlier_depth, Cluster &cluster,
375
+ size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm)
376
+ {
377
+ size_t cnt_common = end - st + 1;
378
+ cluster.cluster_size = cnt_common;
379
+ double pct_outl;
380
+ cluster.subset_common.assign(is_outlier, is_outlier + ncateg);
381
+ cluster.score_categ.resize(ncateg, 0);
382
+
383
+
384
+ for (size_t row = st; row <= end; row++) {
385
+ if (is_outlier[x[ix_arr[row]]]) {
386
+ cnt_common--;
387
+ pct_outl = (long double)categ_counts[ x[ix_arr[row]] ] / (long double)cluster.cluster_size;
388
+ pct_outl = pct_outl + sqrt(pct_outl * (1 - pct_outl) / (long double)cluster.cluster_size);
389
+ cluster.score_categ[ x[ix_arr[row]] ] = pct_outl;
390
+ outlier_scores[ix_arr[row]] = pct_outl;
391
+ outlier_clusters[ix_arr[row]] = 0;
392
+ outlier_trees[ix_arr[row]] = 0;
393
+ outlier_depth[ix_arr[row]] = 0;
394
+ }
395
+ }
396
+ cluster.perc_in_subset = (long double)cnt_common / (long double)cluster.cluster_size;
397
+ cluster.perc_next_most_comm = perc_next_most_comm;
398
+ }
399
+
400
+
401
/* Characterize a homogeneous categorical cluster from a subset of the data, or report if it's not homogeneous
 *
 * Function is meant to be called with subsets of the data only. Will calculate the counts inside it.
 * In order to consider a category as outlier, it must:
 * * Have a proportion smaller than its prior probability and than a confidence interval of its prior.
 * * Have a large gap with respect to the next most-common category.
 * * Be in a cluster in which few or no observations belong to a category meeting such conditions.
 * It's oftentimes not possible to create a cluster with category frequencies that would produce outliers,
 * in which case it will report whether the cluster should be dropped.
 *
 * Parameters:
 * - x[n]
 *       Array indicating the category to which each observation belongs.
 * - ix_arr[n] (in)
 *       Indices to take from the array above.
 * - st (in)
 *       Position at which ix_arr starts (inclusive).
 * - end (in)
 *       Position at which ix_arr ends (inclusive).
 * - ncateg (in)
 *       Number of categories in this column.
 * - by_maj (in)
 *       Model parameter. Default is 'false'. Indicates whether to detect outliers according to the number of non-majority
 *       observations compared to the expected number for each category.
 * - outlier_scores[n] (in, out)
 *       Outlier scores (based on observed category proportion) that are already assigned to the observations from this column
 *       from previous runs of this function in larger subsets (should be started to 1).
 * - outlier_clusters[n] (in, out)
 *       Cluster number under which an observation is the most anomalous.
 * - outlier_trees[n] (in, out)
 *       Tree under which the outlier cluster assigned lies.
 * - outlier_depth[n] (in, out)
 *       Tree depth at which the outlier cluster assigned is found.
 * - cluster (in, out)
 *       Outlier cluster object with statistics and limits.
 * - clusters (in)
 *       Vector containing all clusters already generated.
 * - cluster_num (in)
 *       Number to give to this cluster.
 * - tree_num (in)
 *       Number of the tree under which this cluster is to be found.
 * - tree_depth (in)
 *       Distance from the tree root at which this tree is to be found.
 * - max_perc_outliers (in)
 *       Model parameter. Default is 0.01.
 * - z_norm (in)
 *       Model parameter. Default is 2.67.
 * - z_outlier (in)
 *       Model parameter. Default is 8.0.
 * - perc_threshold[ncateg] (in)
 *       Observed proportion below which a category can be considered as outlier.
 * - prop_prior[ncateg] (in)
 *       Prior probability of each category in the full data (only used when passing 'by_maj' = 'true').
 * - buffer_categ_counts[ncateg] (temp)
 *       Buffer where to save the observed frequencies of each category.
 * - buffer_categ_pct[ncateg] (temp)
 *       Buffer where to save the observed proportion of each category.
 * - buffer_categ_ix[ncateg] (temp)
 *       Buffer where to save the category numbers sorted by proportion.
 * - buffer_outliers[ncateg] (temp)
 *       Buffer where to save the results of which categories are flagged as outliers
 *       before copying it to the cluster (will not copy if none is flagged).
 * - drop_cluster (out)
 *       Whether the cluster should be dropped (i.e. it was not possible to flag any present
 *       or non-present category as outlier).
 *
 * Returns:
 * - Whether it identified any outliers or not.
 */
bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
                          double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                          size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
                          size_t cluster_num, size_t tree_num, size_t tree_depth,
                          double max_perc_outliers, double z_norm, double z_outlier,
                          long double *restrict perc_threshold, long double *restrict prop_prior,
                          size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
                          size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
                          bool *restrict drop_cluster)
{
    bool found_outliers, new_is_outlier;
    size_t tot = end - st + 1;
    size_t sz_maj = tot;
    long double tot_dbl = (long double) tot;
    /* maximum number of observations that may be flagged in this cluster */
    size_t tail_size = (size_t) calculate_max_outliers(tot_dbl, max_perc_outliers);
    cluster.perc_in_subset = 1;
    double pct_outl;

    /* calculate category counts */
    memset(buffer_categ_counts, 0, ncateg * sizeof(size_t));
    for (size_t row = st; row <= end; row++) {
        buffer_categ_counts[ x[ix_arr[row]] ]++;
    }

    /* flag categories as outliers if appropriate (the two criteria are mutually exclusive) */
    if (!by_maj)
        find_outlier_categories(buffer_categ_counts, ncateg, tot, max_perc_outliers,
                                perc_threshold, buffer_categ_ix, buffer_categ_pct,
                                z_norm, buffer_outliers, &found_outliers,
                                &new_is_outlier, &cluster.perc_next_most_comm);
    else
        find_outlier_categories_by_maj(buffer_categ_counts, ncateg, tot, max_perc_outliers,
                                       prop_prior, z_outlier, buffer_outliers,
                                       &found_outliers, &new_is_outlier, &cluster.categ_maj);

    if (found_outliers) {
        for (size_t row = st; row <= end; row++) {
            if (buffer_outliers[ x[ix_arr[row]] ]) {

                /* follow usual rules for preferring this cluster over others:
                   not yet flagged (score >= 1), or a non-NA-branch cluster over an NA-branch one,
                   or same NA-branch status but shallower tree, or same depth but smaller cluster */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                            &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (tot - tail_size)
                            )
                        )
                    )
                )
                {
                    if (!by_maj) {
                        /* score = observed proportion + 1 s.d. of the proportion estimate */
                        pct_outl = (long double)buffer_categ_counts[ x[ix_arr[row]] ] / tot_dbl;
                        pct_outl = pct_outl + sqrt(pct_outl * (1 - pct_outl) / tot_dbl);
                        outlier_scores[ix_arr[row]] = pct_outl;
                    } else {
                        /* score = squared ratio of non-majority observations to the category's expected count */
                        pct_outl = (long double)(tot - buffer_categ_counts[cluster.categ_maj]) / (tot_dbl * prop_prior[ x[ix_arr[row]] ]);
                        outlier_scores[ix_arr[row]] = square(pct_outl);
                    }
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }
                sz_maj--;

            }
        }
        cluster.perc_in_subset = (long double)sz_maj / tot_dbl;
    }

    /* no observation was flagged, but a non-present category could still be an outlier */
    if (new_is_outlier && !found_outliers) {
        cluster.perc_in_subset = 1.0;
    }

    if (new_is_outlier || found_outliers) {
        *drop_cluster = false;
        cluster.cluster_size = sz_maj;
        cluster.subset_common.assign(buffer_outliers, buffer_outliers + ncateg);
        cluster.score_categ.resize(ncateg, 0);
        if (!by_maj) {

            /* store per-category scores; negative subset entries are non-present categories,
               which get a Laplace-style 1/(n+2) proportion */
            for (size_t cat = 0; cat < ncateg; cat++) {
                if (cluster.subset_common[cat] > 0) {
                    pct_outl = (long double)buffer_categ_counts[cat] / tot_dbl;
                    cluster.score_categ[cat] = pct_outl + sqrt(pct_outl * (1 - pct_outl) / tot_dbl);
                } else if (cluster.subset_common[cat] < 0) {
                    pct_outl = (long double)1 / (long double)(tot + 2);
                    cluster.score_categ[cat] = pct_outl + sqrt(pct_outl * (1 - pct_outl) / (long double)(tot + 2));
                }
            }

        } else {

            /* under 'by_maj', the cluster is characterized by the majority category's proportion,
               and every flagged non-majority category gets the same smoothed squared-ratio score */
            cluster.perc_in_subset = (long double) buffer_categ_counts[cluster.categ_maj] / tot_dbl;
            for (size_t cat = 0; cat < ncateg; cat++) {
                if (cat == cluster.categ_maj)
                    continue;
                if (cluster.subset_common[cat] != 0) {
                    cluster.score_categ[cat] = (long double)(tot - buffer_categ_counts[cluster.categ_maj] + 1)
                                                / ((long double)(tot + 2) * prop_prior[cat]);
                    cluster.score_categ[cat] = square(cluster.score_categ[cat]);
                }
            }

        }
    } else {
        *drop_cluster = true;
    }

    return found_outliers;
}
586
+
587
+ /* Convert in/not-in conditions to 'equals' or 'not equals' when they look for only 1 category */
588
+ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
589
+ {
590
+
591
+ int col_equal;
592
+ size_t size_subset;
593
+ size_t size_subset_excl;
594
+ for (size_t clust = 0; clust < clusters.size(); clust++) {
595
+ if (clusters[clust].split_type == IsNa) continue;
596
+
597
+ switch(clusters[clust].column_type) {
598
+
599
+ case Categorical:
600
+ {
601
+
602
+ col_equal = -1;
603
+ if (clusters[clust].split_subset.size() == 2) {
604
+
605
+ switch(col_equal = clusters[clust].split_type) {
606
+ case InSubset:
607
+ {
608
+ col_equal = clusters[clust].split_subset[0]? 0 : 1;
609
+ break;
610
+ }
611
+
612
+ case NotInSubset:
613
+ {
614
+ col_equal = clusters[clust].split_subset[0]? 1 : 0;
615
+ break;
616
+ }
617
+
618
+ case SingleCateg:
619
+ {
620
+ col_equal = clusters[clust].split_subset[0]? 0 : 1;
621
+ break;
622
+ }
623
+ }
624
+ clusters[clust].split_type = Equal;
625
+
626
+ } else {
627
+
628
+ size_subset_excl = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
629
+ [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
630
+ if (size_subset_excl > 0) continue;
631
+ size_subset = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
632
+ [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
633
+ if (size_subset == 1) {
634
+
635
+ do {col_equal++;} while (clusters[clust].split_subset[col_equal] <= 0);
636
+ if (clusters[clust].split_type == InSubset || clusters[clust].split_type == SingleCateg)
637
+ clusters[clust].split_type = Equal;
638
+ else
639
+ clusters[clust].split_type = NotEqual;
640
+
641
+ } else if (size_subset == (clusters[clust].split_subset.size() - 1)) {
642
+
643
+ do {col_equal++;} while (clusters[clust].split_subset[col_equal] != 0);
644
+ if (clusters[clust].split_type == NotInSubset)
645
+ clusters[clust].split_type = Equal;
646
+ else
647
+ clusters[clust].split_type = NotEqual;
648
+
649
+ }
650
+
651
+ }
652
+ if (col_equal >= 0) {
653
+ clusters[clust].split_subset.resize(0);
654
+ clusters[clust].split_lev = col_equal;
655
+ }
656
+ break;
657
+ }
658
+
659
+
660
+ case Ordinal:
661
+ {
662
+
663
+ if (clusters[clust].split_lev == 0) {
664
+
665
+ if (clusters[clust].split_type == LessOrEqual)
666
+ clusters[clust].split_type = Equal;
667
+ else
668
+ clusters[clust].split_type = NotEqual;
669
+
670
+ }
671
+
672
+ else if (clusters[clust].split_lev == (ncat_ord[clusters[clust].col_num] - 2)) {
673
+
674
+ clusters[clust].split_lev++;
675
+ if (clusters[clust].split_type == Greater)
676
+ clusters[clust].split_type = Equal;
677
+ else
678
+ clusters[clust].split_type = NotEqual;
679
+
680
+ }
681
+ break;
682
+ }
683
+
684
+ }
685
+
686
+ }
687
+
688
+ }
689
+
690
/*
 * Convert in/not-in conditions to 'equals' when they look for only 1 category
 * Note: unlike in the case of clusters, trees do not store the split type, but rather
 * always assume left is in/l.e. and right the opposite, so it's not possible to
 * simplify ordinal splits to equals (as the tree will not distinguish between
 * an ordinal split with equals and another with l.e./g.e.). Thus, this part needs
 * to be done in the function that prints the outlier conditions.
 */
void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
{

    int col_equal;           /* category that the simplified condition compares against (-1 = no simplification) */
    size_t size_subset;      /* number of categories included in the split subset */
    size_t size_subset_excl; /* number of categories with negative entries (cannot simplify) */
    size_t temp_swap;        /* for swapping left/right children when the condition gets inverted */
    for (size_t tree = 0; tree < trees.size(); tree++) {

        /* skip trees that have neither 'all_branches' nor left/right children */
        if (trees[tree].all_branches.size() == 0 && trees[tree].tree_left == 0 && trees[tree].tree_right == 0) continue;
        /* when the parent follows all branches, NA-split branches are left untouched */
        if (trees[trees[tree].parent].all_branches.size() > 0 && trees[tree].split_this_branch == IsNa) continue;
        switch(trees[tree].column_type) {

            case Categorical:
            {
                /* any negative entry means the subset cannot be simplified */
                size_subset_excl = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
                                                   [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
                if (size_subset_excl > 0) continue;

                col_equal = -1;
                if (trees[tree].split_subset.size() == 2) {

                    /* with only two categories, make the left branch the 'equals category 0' branch */
                    col_equal = 0;
                    if (trees[tree].split_subset[0] == 0) {
                        temp_swap = trees[tree].tree_left;
                        trees[tree].tree_left = trees[tree].tree_right;
                        trees[tree].tree_right = temp_swap;
                    }
                    if (trees[tree].tree_left > 0)
                        trees[trees[tree].tree_left].parent_branch = Equal;
                    if (trees[tree].tree_right > 0)
                        trees[trees[tree].tree_right].parent_branch = NotEqual;

                    /* when the parent follows all branches, the branch condition is stored in this tree */
                    if (trees[trees[tree].parent].all_branches.size() > 0) {
                        switch(trees[tree].split_this_branch) {
                            case InSubset:
                            {
                                trees[tree].split_this_branch = Equal;
                                break;
                            }

                            case NotInSubset:
                            {
                                trees[tree].split_this_branch = NotEqual;
                                break;
                            }

                            case SingleCateg:
                            {
                                trees[tree].split_this_branch = Equal;
                                break;
                            }
                        }
                    }

                }

                else {

                    size_subset = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
                                                  [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
                    if (size_subset == 1) {

                        /* exactly one category in the subset -> find it; left branch remains the 'equals' branch */
                        do {col_equal++;} while (trees[tree].split_subset[col_equal] <= 0);
                        if (trees[trees[tree].parent].all_branches.size() > 0) {
                            switch(trees[tree].split_this_branch) {
                                case InSubset:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }

                                case NotInSubset:
                                {
                                    trees[tree].split_this_branch = NotEqual;
                                    break;
                                }

                                case SingleCateg:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }
                            }
                        }


                    } else if (size_subset == (trees[tree].split_subset.size() - 1)) {

                        /* all categories but one -> the condition inverts, so swap the children too */
                        do {col_equal++;} while (trees[tree].split_subset[col_equal] != 0);
                        temp_swap = trees[tree].tree_left;
                        trees[tree].tree_left = trees[tree].tree_right;
                        trees[tree].tree_right = temp_swap;
                        if (trees[trees[tree].parent].all_branches.size() > 0) {
                            switch(trees[tree].split_this_branch) {
                                case InSubset:
                                {
                                    trees[tree].split_this_branch = NotEqual;
                                    break;
                                }

                                case NotInSubset:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }
                            }
                        }

                    }

                }

                if (col_equal >= 0) {
                    /* drop the subset, store the single category, and relabel the children's branch types */
                    trees[tree].split_subset.resize(0);
                    trees[tree].split_lev = col_equal;
                    if (trees[tree].tree_left > 0)
                        trees[trees[tree].tree_left].parent_branch = Equal;
                    if (trees[tree].tree_right > 0)
                        trees[trees[tree].tree_right].parent_branch = NotEqual;

                }
                break;
            }


            case Ordinal:
            {
                /* ordinal splits can only be simplified when stored as 'split_this_branch' (see note above) */
                if (trees[trees[tree].parent].all_branches.size() == 0) continue;

                if (trees[tree].split_lev == 0) {

                    /* 'less or equal' than the lowest level is the same as 'equals' it */
                    if (trees[tree].split_this_branch == LessOrEqual)
                        trees[tree].split_this_branch = Equal;
                    else
                        trees[tree].split_this_branch = NotEqual;

                }

                else if (trees[tree].split_lev == (ncat_ord[trees[tree].col_num] - 2)) {

                    /* 'greater' than the next-to-highest level is the same as 'equals' the highest */
                    trees[tree].split_lev++;
                    if (trees[tree].split_this_branch == Greater)
                        trees[tree].split_this_branch = Equal;
                    else
                        trees[tree].split_this_branch = NotEqual;

                }
                break;
            }

        }

    }

}
854
+
855
#ifdef TEST_MODE_DEFINE
/*
 * Goodie to help with testing and debugging (not used in the final code)
 *
 * This function tries to unconnect unnecessary trees so that, if a tree has no clusters and its children
 * don't have any clusters either, such tree would not be reached at prediction time. It will drop trees from the vector
 * if they happen to lie at the end of it, but otherwise will just leave them there so as not to have to recalculate
 * the tree indexes and avoid having to update them everywhere they are referenced (such as in identified outliers).
 *
 * This is only for categorical and ordinal columns, as numerical columns will always produce clusters when
 * they have children.
 *
 * This is supposed to be done with the conditions at the end of each recursive function, but this piece of
 * code can provide help in identifying errors when the code is modified.
 */
void prune_unused_trees(std::vector<ClusterTree> &trees)
{
    /* TODO: when using 'follow_all', function should delete instead of disconnect by setting to zero */
    if (trees.size() == 0) return;
    /* iterate from the last tree downwards; 't-- > 0' avoids the unsigned-underflow pitfall
       of writing 't >= 0' on a size_t counter (which is always true) */
    for (size_t t = trees.size(); t-- > 0;) {

        /* zero-out references to unneeded or invalid trees in the binary branches */
        if (trees[t].binary_branches.size() > 0) {
            for (size_t br = 0; br < trees[t].binary_branches.size(); br++) {
                if (trees[t].binary_branches[br] == 0) continue;
                if (trees[t].binary_branches[br] >= trees.size()) {
                    trees[t].binary_branches[br] = 0;
                    continue; /* index was out-of-range: nothing valid left to check */
                }
                if (check_tree_is_not_needed(trees[trees[t].binary_branches[br]])) trees[t].binary_branches[br] = 0;
            }
        }

        /* same for the 'follow_all' branches */
        if (trees[t].all_branches.size() > 0) {
            for (size_t br = 0; br < trees[t].all_branches.size(); br++) {
                if (trees[t].all_branches[br] == 0) continue;
                if (trees[t].all_branches[br] >= trees.size()) {
                    trees[t].all_branches[br] = 0;
                    continue; /* index was out-of-range: nothing valid left to check */
                }
                if (check_tree_is_not_needed(trees[trees[t].all_branches[br]])) trees[t].all_branches[br] = 0;
            }
        }


        if (check_tree_is_not_needed(trees[t])) {

            /* disconnect tree from parent */
            switch(trees[t].parent_branch) {
                case IsNa:
                {
                    trees[trees[t].parent].tree_NA = 0;
                    break;
                }

                case LessOrEqual:
                {
                    trees[trees[t].parent].tree_left = 0;
                    break;
                }

                case Greater:
                {
                    trees[trees[t].parent].tree_right = 0;
                    break;
                }

                case InSubset:
                {
                    trees[trees[t].parent].tree_left = 0;
                    break;
                }

                case NotInSubset:
                {
                    trees[trees[t].parent].tree_right = 0;
                    break;
                }

                default: {} /* other branch types do not store child links that need disconnecting */

            }

            /* trees at the tail of the vector can be removed outright without renumbering */
            if (t == (trees.size() - 1)) trees.pop_back();
        }
    }
}
#endif
935
+
936
+ /* Check whether a tree has no clusters and no children with clusters either */
937
+ bool check_tree_is_not_needed(ClusterTree &tree)
938
+ {
939
+ return
940
+ tree.tree_NA == 0 && tree.tree_left == 0 && tree.tree_right == 0 &&
941
+ tree.clusters.size() == 0 &&
942
+ (tree.binary_branches.size() == 0 || *std::max_element(tree.binary_branches.begin(), tree.binary_branches.end()) == 0) &&
943
+ (tree.all_branches.size() == 0 || *std::max_element(tree.all_branches.begin(), tree.all_branches.end()) == 0)
944
+ ;
945
+ }
946
+
947
+ /*
948
+ * These functions simply check what's the minimum/maximum value that could identify an observation
949
+ * as outlier in any cluster, or which categories could be possibly flagged as outliers in any cluster.
950
+ * This info is redundant, as outliers can be identified by following splits, but it can help speed up
951
+ * things at prediction time by not having to even bother checking a column if the value is within
952
+ * non-flaggable limits.
953
+ */
954
+ void calculate_cluster_minimums(ModelOutputs &model_outputs, size_t col)
955
+ {
956
+ for (size_t cl = 0; cl < model_outputs.all_clusters[col].size(); cl++) {
957
+ model_outputs.min_outlier_any_cl[col] = fmax(model_outputs.min_outlier_any_cl[col], model_outputs.all_clusters[col][cl].lower_lim);
958
+ model_outputs.max_outlier_any_cl[col] = fmin(model_outputs.max_outlier_any_cl[col], model_outputs.all_clusters[col][cl].upper_lim);
959
+ }
960
+
961
+ }
962
+
963
+ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size_t col_rel)
964
+ {
965
+ if (model_outputs.all_clusters[col].size() == 0) return;
966
+ model_outputs.cat_outlier_any_cl[col_rel].resize(model_outputs.all_clusters[col][0].subset_common.size(), 0);
967
+ for (size_t cl = 0; cl < model_outputs.all_clusters[col].size(); cl++) {
968
+ for (size_t cat = 0; cat < model_outputs.all_clusters[col][cl].subset_common.size(); cat++) {
969
+ if (model_outputs.all_clusters[col][cl].subset_common[cat] != 0) model_outputs.cat_outlier_any_cl[col_rel][cat] = true;
970
+ }
971
+ }
972
+ }