outliertree 0.3.0 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/README.md +1 -1
- data/ext/outliertree/ext.cpp +23 -0
- data/lib/outliertree/version.rb +1 -1
- data/lib/outliertree.rb +4 -4
- data/vendor/outliertree/README.md +12 -7
- data/vendor/outliertree/src/Makevars.in +1 -2
- data/vendor/outliertree/src/Makevars.win +1 -2
- data/vendor/outliertree/src/RcppExports.cpp +2 -23
- data/vendor/outliertree/src/Rwrapper.cpp +137 -44
- data/vendor/outliertree/src/clusters.cpp +37 -22
- data/vendor/outliertree/src/fit_model.cpp +53 -46
- data/vendor/outliertree/src/misc.cpp +5 -17
- data/vendor/outliertree/src/outlier_tree.hpp +7 -6
- metadata +6 -6
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 107a39daf1b8743880c65c0c9bd20f6b2430687a843aa3394e4f57ba38b58766
         | 
| 4 | 
            +
              data.tar.gz: 81e5e13612dd119624a6ec12652b048002c0c2103ee6389709682fb6bcb27e5e
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 2a8c6276389a465d548b7b06e7933e64094059960301b4393015bd906dd8deed361887876c152017bc2427fe54b81271e076de24f3e1df801f8f0c330a6c0f76
         | 
| 7 | 
            +
              data.tar.gz: 27b9eb4c42adc7abf6c905ec3c787f6947aae6475ecb37283c9b00e560ebb49a8a6bd7ebacfce2c636ba289f014b6dd87821d65311cd3a8640700a4dae44464d
         | 
    
        data/CHANGELOG.md
    CHANGED
    
    
    
        data/README.md
    CHANGED
    
    | @@ -10,7 +10,7 @@ Price (2.50) looks low given Department is Books and Sale is false | |
| 10 10 |  | 
| 11 11 | 
             
            :evergreen_tree: Check out [IsoTree](https://github.com/ankane/isotree-ruby) for an alternative approach that uses Isolation Forest
         | 
| 12 12 |  | 
| 13 | 
            -
            [](https://github.com/ankane/outliertree-ruby/actions)
         | 
| 14 14 |  | 
| 15 15 | 
             
            ## Installation
         | 
| 16 16 |  | 
    
        data/ext/outliertree/ext.cpp
    CHANGED
    
    | @@ -30,6 +30,29 @@ namespace Rice::detail | |
| 30 30 | 
             
                }
         | 
| 31 31 | 
             
              };
         | 
| 32 32 |  | 
| 33 | 
            +
              template<>
         | 
| 34 | 
            +
              class To_Ruby<std::vector<signed char>>
         | 
| 35 | 
            +
              {
         | 
| 36 | 
            +
              public:
         | 
| 37 | 
            +
                VALUE convert(std::vector<signed char> const & x)
         | 
| 38 | 
            +
                {
         | 
| 39 | 
            +
                  auto a = rb_ary_new2(x.size());
         | 
| 40 | 
            +
                  for (const auto& v : x) {
         | 
| 41 | 
            +
                    rb_ary_push(a, To_Ruby<signed char>().convert(v));
         | 
| 42 | 
            +
                  }
         | 
| 43 | 
            +
                  return a;
         | 
| 44 | 
            +
                }
         | 
| 45 | 
            +
              };
         | 
| 46 | 
            +
             | 
| 47 | 
            +
              template<>
         | 
| 48 | 
            +
              struct Type<std::vector<signed char>>
         | 
| 49 | 
            +
              {
         | 
| 50 | 
            +
                static bool verify()
         | 
| 51 | 
            +
                {
         | 
| 52 | 
            +
                  return true;
         | 
| 53 | 
            +
                }
         | 
| 54 | 
            +
              };
         | 
| 55 | 
            +
             | 
| 33 56 | 
             
              template<>
         | 
| 34 57 | 
             
              struct Type<ColType>
         | 
| 35 58 | 
             
              {
         | 
    
        data/lib/outliertree/version.rb
    CHANGED
    
    
    
        data/lib/outliertree.rb
    CHANGED
    
    | @@ -5,10 +5,10 @@ require "outliertree/ext" | |
| 5 5 | 
             
            require "etc"
         | 
| 6 6 |  | 
| 7 7 | 
             
            # modules
         | 
| 8 | 
            -
             | 
| 9 | 
            -
             | 
| 10 | 
            -
             | 
| 11 | 
            -
             | 
| 8 | 
            +
            require_relative "outliertree/dataset"
         | 
| 9 | 
            +
            require_relative "outliertree/model"
         | 
| 10 | 
            +
            require_relative "outliertree/result"
         | 
| 11 | 
            +
            require_relative "outliertree/version"
         | 
| 12 12 |  | 
| 13 13 | 
             
            module OutlierTree
         | 
| 14 14 | 
             
              def self.new(**options)
         | 
| @@ -58,12 +58,18 @@ Procedure is described in more detail in [Explainable outlier detection through | |
| 58 58 | 
             
            # Installation
         | 
| 59 59 |  | 
| 60 60 | 
             
            * For R:
         | 
| 61 | 
            +
             | 
| 62 | 
            +
            **Note:** This package benefits from extra optimizations that aren't enabled by default for R packages. See [this guide](https://github.com/david-cortes/installing-optimized-libraries) for instructions on how to enable them.
         | 
| 63 | 
            +
             | 
| 61 64 | 
             
            ```r
         | 
| 62 65 | 
             
            install.packages("outliertree")
         | 
| 63 66 | 
             
            ```
         | 
| 64 67 |  | 
| 65 68 |  | 
| 66 69 | 
             
            * For Python:
         | 
| 70 | 
            +
             | 
| 71 | 
            +
            **Note:** requires C/C++ compilers configured for Python. See [this guide](https://github.com/david-cortes/installing-optimized-libraries) for instructions.
         | 
| 72 | 
            +
             | 
| 67 73 | 
             
            ```
         | 
| 68 74 | 
             
            pip install outliertree
         | 
| 69 75 | 
             
            ```
         | 
| @@ -77,22 +83,21 @@ pip install --no-use-pep517 outliertree | |
| 77 83 | 
             
            ```
         | 
| 78 84 | 
             
            brew install libomp
         | 
| 79 85 | 
             
            ```
         | 
| 80 | 
            -
            And then reinstall this package: `pip install --force-reinstall outliertree`.
         | 
| 86 | 
            +
            And then reinstall this package: `pip install --upgrade --no-deps --force-reinstall outliertree`.
         | 
| 81 87 |  | 
| 82 88 | 
             
            ** *
         | 
| 83 | 
            -
            **IMPORTANT:** the setup script will try to add compilation flag `-march=native`. This instructs the compiler to tune the package for the CPU in which it is being installed, but the result might not be usable in other computers. If building a binary wheel of this package or putting it into a docker image which will be used in different machines, this can be overriden by manually supplying compilation `CFLAGS`  | 
| 89 | 
            +
            **IMPORTANT:** the setup script will try to add compilation flag `-march=native`. This instructs the compiler to tune the package for the CPU in which it is being installed (by e.g. using AVX instructions if available), but the result might not be usable in other computers. If building a binary wheel of this package or putting it into a docker image which will be used in different machines, this can be overridden either by (a) defining an environment variable `DONT_SET_MARCH=1`, or by (b) manually supplying compilation `CFLAGS` as an environment variable with something related to architecture. For maximum compatibility (but slowest speed), it's possible to do something like this:
         | 
| 84 90 |  | 
| 85 91 | 
             
            ```
         | 
| 86 | 
            -
            export  | 
| 87 | 
            -
            export CXXFLAGS="-march=x86-64"
         | 
| 92 | 
            +
            export DONT_SET_MARCH=1
         | 
| 88 93 | 
             
            pip install outliertree
         | 
| 89 94 | 
             
            ```
         | 
| 90 95 |  | 
| 91 | 
            -
            or for  | 
| 96 | 
            +
            or, by specifying some compilation flag for architecture:
         | 
| 92 97 | 
             
            ```
         | 
| 93 98 | 
             
            export CFLAGS="-march=x86-64"
         | 
| 94 99 | 
             
            export CXXFLAGS="-march=x86-64"
         | 
| 95 | 
            -
             | 
| 100 | 
            +
            pip install outliertree
         | 
| 96 101 | 
             
            ```
         | 
| 97 102 | 
             
            ** *
         | 
| 98 103 |  | 
| @@ -134,7 +139,7 @@ summary(new_outliers) | |
| 134 139 | 
             
            ```
         | 
| 135 140 | 
             
            (see documentation for more examples)
         | 
| 136 141 |  | 
| 137 | 
            -
            Example [RMarkdown](http://htmlpreview.github.io/?https://github.com/david-cortes/outliertree/blob/master/ | 
| 142 | 
            +
            Example [RMarkdown](http://htmlpreview.github.io/?https://github.com/david-cortes/outliertree/blob/master/vignettes/Explainable_Outlier_Detection_in_Titanic_dataset.html) using the Titanic dataset.
         | 
| 138 143 |  | 
| 139 144 |  | 
| 140 145 | 
             
            * For Python:
         | 
| @@ -10,27 +10,6 @@ Rcpp::Rostream<true>&  Rcpp::Rcout = Rcpp::Rcpp_cout_get(); | |
| 10 10 | 
             
            Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
         | 
| 11 11 | 
             
            #endif
         | 
| 12 12 |  | 
| 13 | 
            -
            // deserialize_OutlierTree
         | 
| 14 | 
            -
            SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj);
         | 
| 15 | 
            -
            RcppExport SEXP _outliertree_deserialize_OutlierTree(SEXP srcSEXP, SEXP ptr_objSEXP) {
         | 
| 16 | 
            -
            BEGIN_RCPP
         | 
| 17 | 
            -
                Rcpp::RObject rcpp_result_gen;
         | 
| 18 | 
            -
                Rcpp::traits::input_parameter< Rcpp::RawVector >::type src(srcSEXP);
         | 
| 19 | 
            -
                Rcpp::traits::input_parameter< SEXP >::type ptr_obj(ptr_objSEXP);
         | 
| 20 | 
            -
                rcpp_result_gen = Rcpp::wrap(deserialize_OutlierTree(src, ptr_obj));
         | 
| 21 | 
            -
                return rcpp_result_gen;
         | 
| 22 | 
            -
            END_RCPP
         | 
| 23 | 
            -
            }
         | 
| 24 | 
            -
            // check_null_ptr_model
         | 
| 25 | 
            -
            Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model);
         | 
| 26 | 
            -
            RcppExport SEXP _outliertree_check_null_ptr_model(SEXP ptr_modelSEXP) {
         | 
| 27 | 
            -
            BEGIN_RCPP
         | 
| 28 | 
            -
                Rcpp::RObject rcpp_result_gen;
         | 
| 29 | 
            -
                Rcpp::traits::input_parameter< SEXP >::type ptr_model(ptr_modelSEXP);
         | 
| 30 | 
            -
                rcpp_result_gen = Rcpp::wrap(check_null_ptr_model(ptr_model));
         | 
| 31 | 
            -
                return rcpp_result_gen;
         | 
| 32 | 
            -
            END_RCPP
         | 
| 33 | 
            -
            }
         | 
| 34 13 | 
             
            // fit_OutlierTree
         | 
| 35 14 | 
             
            Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric, Rcpp::IntegerVector arr_cat, size_t ncols_categ, Rcpp::IntegerVector ncat, Rcpp::IntegerVector arr_ord, size_t ncols_ord, Rcpp::IntegerVector ncat_ord, size_t nrows, Rcpp::LogicalVector cols_ignore_r, int nthreads, bool categ_as_bin, bool ord_as_bin, bool cat_bruteforce_subset, bool categ_from_maj, bool take_mid, size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ, double min_gain, bool follow_all, bool gain_as_pct, double z_norm, double z_outlier, bool return_outliers, Rcpp::ListOf<Rcpp::StringVector> cat_levels, Rcpp::ListOf<Rcpp::StringVector> ord_levels, Rcpp::StringVector colnames_num, Rcpp::StringVector colnames_cat, Rcpp::StringVector colnames_ord, Rcpp::NumericVector min_date, Rcpp::NumericVector min_ts);
         | 
| 36 15 | 
             
            RcppExport SEXP _outliertree_fit_OutlierTree(SEXP arr_numSEXP, SEXP ncols_numericSEXP, SEXP arr_catSEXP, SEXP ncols_categSEXP, SEXP ncatSEXP, SEXP arr_ordSEXP, SEXP ncols_ordSEXP, SEXP ncat_ordSEXP, SEXP nrowsSEXP, SEXP cols_ignore_rSEXP, SEXP nthreadsSEXP, SEXP categ_as_binSEXP, SEXP ord_as_binSEXP, SEXP cat_bruteforce_subsetSEXP, SEXP categ_from_majSEXP, SEXP take_midSEXP, SEXP max_depthSEXP, SEXP max_perc_outliersSEXP, SEXP min_size_numericSEXP, SEXP min_size_categSEXP, SEXP min_gainSEXP, SEXP follow_allSEXP, SEXP gain_as_pctSEXP, SEXP z_normSEXP, SEXP z_outlierSEXP, SEXP return_outliersSEXP, SEXP cat_levelsSEXP, SEXP ord_levelsSEXP, SEXP colnames_numSEXP, SEXP colnames_catSEXP, SEXP colnames_ordSEXP, SEXP min_dateSEXP, SEXP min_tsSEXP) {
         | 
| @@ -119,8 +98,6 @@ END_RCPP | |
| 119 98 | 
             
            }
         | 
| 120 99 |  | 
| 121 100 | 
             
            static const R_CallMethodDef CallEntries[] = {
         | 
| 122 | 
            -
                {"_outliertree_deserialize_OutlierTree", (DL_FUNC) &_outliertree_deserialize_OutlierTree, 2},
         | 
| 123 | 
            -
                {"_outliertree_check_null_ptr_model", (DL_FUNC) &_outliertree_check_null_ptr_model, 1},
         | 
| 124 101 | 
             
                {"_outliertree_fit_OutlierTree", (DL_FUNC) &_outliertree_fit_OutlierTree, 33},
         | 
| 125 102 | 
             
                {"_outliertree_predict_OutlierTree", (DL_FUNC) &_outliertree_predict_OutlierTree, 13},
         | 
| 126 103 | 
             
                {"_outliertree_check_few_values", (DL_FUNC) &_outliertree_check_few_values, 4},
         | 
| @@ -128,7 +105,9 @@ static const R_CallMethodDef CallEntries[] = { | |
| 128 105 | 
             
                {NULL, NULL, 0}
         | 
| 129 106 | 
             
            };
         | 
| 130 107 |  | 
| 108 | 
            +
            void init_altrepped_class(DllInfo* dll);
         | 
| 131 109 | 
             
            /* Package initialization routine: registers the CallEntries table, turns off
               dynamic symbol lookup for this DLL, and then sets up the ALTREP class. */
            RcppExport void R_init_outliertree(DllInfo *dll)
            {
                R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
                R_useDynamicSymbols(dll, FALSE);

                /* must run with the same DllInfo so the ALTREP class is tied to this package */
                init_altrepped_class(dll);
            }
         | 
| @@ -2,8 +2,8 @@ | |
| 2 2 |  | 
| 3 3 | 
             
            #include <Rcpp.h>
         | 
| 4 4 | 
             
            #include <Rcpp/unwindProtect.h>
         | 
| 5 | 
            +
            #include <R_ext/Altrep.h>
         | 
| 5 6 | 
             
            // [[Rcpp::plugins(cpp11)]]
         | 
| 6 | 
            -
            // [[Rcpp::plugins(unwindProtect)]]
         | 
| 7 7 |  | 
| 8 8 | 
             
            /* This is to serialize the model objects */
         | 
| 9 9 | 
             
            // [[Rcpp::depends(Rcereal)]]
         | 
| @@ -16,18 +16,26 @@ | |
| 16 16 | 
             
            /* This is the package's header */
         | 
| 17 17 | 
             
            #include "outlier_tree.hpp"
         | 
| 18 18 |  | 
| 19 | 
            +
            /* Cleanup routine registered as the C finalizer of model handles: frees the
               ModelOutputs stored in the external pointer and resets the handle. */
            void delete_model_from_R_ptr(SEXP R_ptr)
            {
                delete static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));

                /* null out the stored address so the freed model cannot be reached again */
                R_SetExternalPtrAddr(R_ptr, nullptr);
                R_ClearExternalPtr(R_ptr);
            }
         | 
| 26 | 
            +
             | 
| 19 27 | 
             
            SEXP alloc_RawVec(void *data)
         | 
| 20 28 | 
             
            {
         | 
| 21 29 | 
             
                size_t vec_size = *(size_t*)data;
         | 
| 22 30 | 
             
                if (vec_size > (size_t)std::numeric_limits<R_xlen_t>::max())
         | 
| 23 31 | 
             
                    Rcpp::stop("Resulting model is too large for R to handle.");
         | 
| 24 | 
            -
                return  | 
| 32 | 
            +
                return Rf_allocVector(RAWSXP, vec_size);
         | 
| 25 33 | 
             
            }
         | 
| 26 34 |  | 
| 27 35 | 
             
            /* for model serialization and re-usage in R */
         | 
| 28 36 | 
             
            /* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
         | 
| 29 37 | 
             
            /* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
         | 
| 30 | 
            -
             | 
| 38 | 
            +
            SEXP serialize_OutlierTree(ModelOutputs *model_outputs)
         | 
| 31 39 | 
             
            {
         | 
| 32 40 | 
             
                std::stringstream ss;
         | 
| 33 41 | 
             
                {
         | 
| @@ -37,35 +45,20 @@ Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs) | |
| 37 45 | 
             
                ss.seekg(0, ss.end);
         | 
| 38 46 | 
             
                std::stringstream::pos_type vec_size = ss.tellg();
         | 
| 39 47 | 
             
                if (vec_size <= 0) {
         | 
| 40 | 
            -
                     | 
| 41 | 
            -
                    return Rcpp::RawVector();
         | 
| 48 | 
            +
                    Rf_error("Error: model is too big to serialize, resulting object will not be usable.\n");
         | 
| 42 49 | 
             
                }
         | 
| 43 50 | 
             
                size_t vec_size_ = (size_t)vec_size;
         | 
| 44 | 
            -
                 | 
| 45 | 
            -
                if (!retval.size())
         | 
| 46 | 
            -
                    return retval;
         | 
| 51 | 
            +
                SEXP retval = PROTECT(Rcpp::unwindProtect(alloc_RawVec, (void*)&vec_size_));
         | 
| 47 52 | 
             
                ss.seekg(0, ss.beg);
         | 
| 48 | 
            -
                ss.read(reinterpret_cast<char*>(RAW(retval)),  | 
| 53 | 
            +
                ss.read(reinterpret_cast<char*>(RAW(retval)), vec_size_);
         | 
| 54 | 
            +
                UNPROTECT(1);
         | 
| 49 55 | 
             
                return retval;
         | 
| 50 56 | 
             
            }
         | 
| 51 57 |  | 
| 52 | 
            -
            SEXP  | 
| 53 | 
            -
            {
         | 
| 54 | 
            -
                return Rcpp::XPtr<ModelOutputs>((ModelOutputs*)model_ptr, true);
         | 
| 55 | 
            -
            }
         | 
| 56 | 
            -
             | 
| 57 | 
            -
            void R_delete_model(SEXP R_ptr)
         | 
| 58 | 
            -
            {
         | 
| 59 | 
            -
                ModelOutputs *model = static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));
         | 
| 60 | 
            -
                delete model;
         | 
| 61 | 
            -
                R_ClearExternalPtr(R_ptr);
         | 
| 62 | 
            -
            }
         | 
| 63 | 
            -
             | 
| 64 | 
            -
            // [[Rcpp::export(rng = false)]]
         | 
| 65 | 
            -
            SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj)
         | 
| 58 | 
            +
            SEXP deserialize_OutlierTree(SEXP src, SEXP ptr_obj)
         | 
| 66 59 | 
             
            {
         | 
| 67 60 | 
             
                std::stringstream ss;
         | 
| 68 | 
            -
                ss.write(reinterpret_cast<char*>(RAW(src)), src | 
| 61 | 
            +
                ss.write(reinterpret_cast<char*>(RAW(src)), Rf_xlength(src));
         | 
| 69 62 | 
             
                ss.seekg(0, ss.beg);
         | 
| 70 63 | 
             
                std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
         | 
| 71 64 | 
             
                {
         | 
| @@ -73,25 +66,134 @@ SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj) | |
| 73 66 | 
             
                    iarchive(*model_outputs);
         | 
| 74 67 | 
             
                }
         | 
| 75 68 | 
             
                R_SetExternalPtrAddr(ptr_obj, model_outputs.get());
         | 
| 76 | 
            -
                R_RegisterCFinalizerEx(ptr_obj,  | 
| 69 | 
            +
                R_RegisterCFinalizerEx(ptr_obj, delete_model_from_R_ptr, TRUE);
         | 
| 77 70 | 
             
                model_outputs.release();
         | 
| 78 71 | 
             
                return R_NilValue;
         | 
| 79 72 | 
             
            }
         | 
| 80 73 |  | 
| 81 | 
            -
             | 
| 74 | 
            +
            static R_altrep_class_t otree_altrepped_pointer_class;
         | 
| 75 | 
            +
             | 
| 76 | 
            +
            R_xlen_t altrepped_pointer_length(SEXP obj)
         | 
| 82 77 | 
             
            {
         | 
| 83 | 
            -
                return  | 
| 78 | 
            +
                return 1;
         | 
| 84 79 | 
             
            }
         | 
| 85 80 |  | 
| 86 | 
            -
            SEXP  | 
| 81 | 
            +
            SEXP get_element_from_altrepped_ptr(SEXP R_altrepped_obj, R_xlen_t idx)
         | 
| 87 82 | 
             
            {
         | 
| 88 | 
            -
                return  | 
| 83 | 
            +
                return R_altrep_data1(R_altrepped_obj);
         | 
| 89 84 | 
             
            }
         | 
| 90 85 |  | 
| 91 | 
            -
             | 
| 92 | 
            -
             | 
| 86 | 
            +
            /* ALTREP 'Inspect' method: prints the address stored in the external pointer
               kept in the object's data1 slot. The remaining arguments are not used.
               Always returns TRUE. */
            Rboolean inspect_altrepped_pointer(SEXP x, int pre, int deep, int pvec, void (*inspect_subtree)(SEXP, int, int, int))
            {
                SEXP held_ptr = R_altrep_data1(x);
                void *address = R_ExternalPtrAddr(held_ptr);
                Rprintf("Altrepped pointer [address:%p]\n", address);
                return TRUE;
            }
         | 
| 91 | 
            +
             | 
| 92 | 
            +
            /* ALTREP 'Duplicate' method for the model handle.
             *
             * altrepped_obj: ALTREP object whose data1 slot holds the external pointer
             *                to a ModelOutputs.
             * deep:          when false, the copy shares the source's external pointer;
             *                when true, the ModelOutputs is copy-assigned into a new heap
             *                object owned by a new external pointer with its own finalizer.
             *
             * Returns a new ALTREP object of the same class carrying names "ptr" and
             * class "otree_altrepped_handle", the same attributes that
             * deserialize_altrepped_pointer and get_altrepped_pointer set.
             * Raises an R error if copying the model throws a std::exception. */
            SEXP duplicate_altrepped_pointer(SEXP altrepped_obj, Rboolean deep)
            {
                SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
                SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
                SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));

                if (!deep) {
                    R_set_altrep_data1(out, R_altrep_data1(altrepped_obj));
                }

                else {

                    SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
                    ModelOutputs *cpp_ptr = static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_altrep_data1(altrepped_obj)));

                    /* A handle whose model was already freed stores a null address; there
                       is nothing to copy then, so the duplicate keeps a null pointer too
                       instead of dereferencing it. */
                    if (cpp_ptr != nullptr) {
                        try {
                            std::unique_ptr<ModelOutputs> new_obj(new ModelOutputs());
                            *new_obj = *cpp_ptr;

                            /* hand ownership to R only after the finalizer is registered */
                            R_SetExternalPtrAddr(R_ptr, new_obj.get());
                            R_RegisterCFinalizerEx(R_ptr, delete_model_from_R_ptr, TRUE);
                            new_obj.release();
                        }

                        catch (const std::exception &ex) {
                            Rf_error("%s\n", ex.what());
                        }
                    }

                    R_set_altrep_data1(out, R_ptr);
                    UNPROTECT(1);
                }

                Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
                /* was R_NamesSymbol: that overwrote the names with the class string and
                   left the duplicate without a class attribute */
                Rf_setAttrib(out, R_ClassSymbol, R_ptr_class);
                UNPROTECT(3);
                return out;
            }
         | 
| 129 | 
            +
             | 
| 130 | 
            +
            SEXP serialize_altrepped_pointer(SEXP altrepped_obj)
         | 
| 93 131 | 
             
            {
         | 
| 94 | 
            -
                return  | 
| 132 | 
            +
                return serialize_OutlierTree((ModelOutputs*)R_ExternalPtrAddr(R_altrep_data1(altrepped_obj)));
         | 
| 133 | 
            +
            }
         | 
| 134 | 
            +
             | 
| 135 | 
            +
            /* ALTREP 'Unserialize' method: rebuilds a model handle from its serialized
               state. R_state is passed to deserialize_OutlierTree together with a new
               external pointer; a std::exception thrown there is reported as an R error.
               The result carries names "ptr" and class "otree_altrepped_handle". */
            SEXP deserialize_altrepped_pointer(SEXP cls, SEXP R_state)
            {
                SEXP names_attr = PROTECT(Rf_mkString("ptr"));
                SEXP class_attr = PROTECT(Rf_mkString("otree_altrepped_handle"));
                SEXP model_handle = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
                SEXP result = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));

                try {
                    deserialize_OutlierTree(R_state, model_handle);
                }
                catch (const std::exception &ex) {
                    Rf_error("%s\n", ex.what());
                }

                /* store the pointer in the wrapper, then label the wrapper */
                R_set_altrep_data1(result, model_handle);
                Rf_setAttrib(result, R_NamesSymbol, names_attr);
                Rf_setAttrib(result, R_ClassSymbol, class_attr);

                UNPROTECT(4);
                return result;
            }
         | 
| 156 | 
            +
             | 
| 157 | 
            +
            /* Wraps a model in an ALTREP handle.
               void_ptr is cast to std::unique_ptr<ModelOutputs>*; the model it holds is
               handed over to a new R external pointer (with delete_model_from_R_ptr as
               finalizer) and released from the unique_ptr.
               The result carries names "ptr" and class "otree_altrepped_handle". */
            SEXP get_altrepped_pointer(void *void_ptr)
            {
                SEXP names_attr = PROTECT(Rf_mkString("ptr"));
                SEXP class_attr = PROTECT(Rf_mkString("otree_altrepped_handle"));
                SEXP model_handle = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
                SEXP result = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));

                auto *owner = static_cast<std::unique_ptr<ModelOutputs>*>(void_ptr);
                R_SetExternalPtrAddr(model_handle, owner->get());
                R_RegisterCFinalizerEx(model_handle, delete_model_from_R_ptr, TRUE);
                /* the finalizer is in place, so the unique_ptr gives up ownership */
                owner->release();

                R_set_altrep_data1(result, model_handle);
                Rf_setAttrib(result, R_NamesSymbol, names_attr);
                Rf_setAttrib(result, R_ClassSymbol, class_attr);

                UNPROTECT(4);
                return result;
            }
         | 
| 176 | 
            +
             | 
| 177 | 
            +
            // [[Rcpp::init]]
         | 
| 178 | 
            +
            void init_altrepped_class(DllInfo* dll)
         | 
| 179 | 
            +
            {
         | 
| 180 | 
            +
                otree_altrepped_pointer_class = R_make_altlist_class("otree_altrepped_pointer_class", "outliertree", dll);
         | 
| 181 | 
            +
                R_set_altrep_Length_method(otree_altrepped_pointer_class, altrepped_pointer_length);
         | 
| 182 | 
            +
                R_set_altrep_Inspect_method(otree_altrepped_pointer_class, inspect_altrepped_pointer);
         | 
| 183 | 
            +
                R_set_altrep_Serialized_state_method(otree_altrepped_pointer_class, serialize_altrepped_pointer);
         | 
| 184 | 
            +
                R_set_altrep_Unserialize_method(otree_altrepped_pointer_class, deserialize_altrepped_pointer);
         | 
| 185 | 
            +
                R_set_altrep_Duplicate_method(otree_altrepped_pointer_class, duplicate_altrepped_pointer);
         | 
| 186 | 
            +
                R_set_altlist_Elt_method(otree_altrepped_pointer_class, get_element_from_altrepped_ptr);
         | 
| 187 | 
            +
            }
         | 
| 188 | 
            +
             | 
| 189 | 
            +
            /* Returns an R integer scalar built from the int that x points to. */
            SEXP safe_int(void *x)
            {
                const int value = *static_cast<int*>(x);
                return Rf_ScalarInteger(value);
            }
         | 
| 193 | 
            +
             | 
| 194 | 
            +
            /* Callback in the 'SEXP f(void*)' shape expected by Rcpp::unwindProtect.
               'x' must point to a 'bool'; its value is handed back as an R logical scalar. */
            SEXP safe_bool(void *x)
            {
                const bool flag = *static_cast<bool*>(x);
                return Rf_ScalarLogical(flag);
            }
         | 
| 96 198 |  | 
| 97 199 | 
             
            double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_t n, int nthreads)
         | 
| @@ -264,7 +366,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs, | |
| 264 366 | 
             
                        } else if (outl_col < (ncols_num + ncols_cat)) {
         | 
| 265 367 | 
             
                            if (outl_col < (ncols_num + ncols_cat_cat)) {
         | 
| 266 368 | 
             
                                tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
         | 
| 267 | 
            -
                                for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
         | 
| 369 | 
            +
                                for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
         | 
| 268 370 | 
             
                                    if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
         | 
| 269 371 | 
             
                                        tmp_bool[cat] = true;
         | 
| 270 372 | 
             
                                        }
         | 
| @@ -308,7 +410,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs, | |
| 308 410 | 
             
                            }
         | 
| 309 411 | 
             
                        } else {
         | 
| 310 412 | 
             
                            tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
         | 
| 311 | 
            -
                            for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
         | 
| 413 | 
            +
                            for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
         | 
| 312 414 | 
             
                                if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
         | 
| 313 415 | 
             
                                    tmp_bool[cat] = true;
         | 
| 314 416 | 
             
                                }
         | 
| @@ -1274,7 +1376,6 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric, | |
| 1274 1376 | 
             
            {
         | 
| 1275 1377 | 
             
                Rcpp::List outp = Rcpp::List::create(
         | 
| 1276 1378 | 
             
                    Rcpp::_["ptr_model"] = R_NilValue,
         | 
| 1277 | 
            -
                    Rcpp::_["serialized_obj"] = R_NilValue,
         | 
| 1278 1379 | 
             
                    Rcpp::_["bounds"] = R_NilValue,
         | 
| 1279 1380 | 
             
                    Rcpp::_["outliers_info"] = R_NilValue,
         | 
| 1280 1381 | 
             
                    Rcpp::_["ntrees"] = R_NilValue,
         | 
| @@ -1295,7 +1396,6 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric, | |
| 1295 1396 | 
             
                double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
         | 
| 1296 1397 |  | 
| 1297 1398 | 
             
                std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
         | 
| 1298 | 
            -
                try {
         | 
| 1299 1399 | 
             
                found_outliers = fit_outliers_models(*model_outputs,
         | 
| 1300 1400 | 
             
                                                     arr_num_C, ncols_numeric,
         | 
| 1301 1401 | 
             
                                                     INTEGER(arr_cat), ncols_categ, INTEGER(ncat),
         | 
| @@ -1313,13 +1413,7 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric, | |
| 1313 1413 | 
             
                    &min_ts
         | 
| 1314 1414 | 
             
                };
         | 
| 1315 1415 | 
             
                outp["bounds"] = Rcpp::unwindProtect(extract_outl_bounds_wrapper, (void*)&temp);
         | 
| 1316 | 
            -
                outp["serialized_obj"] = serialize_OutlierTree(model_outputs.get());
         | 
| 1317 | 
            -
                } catch(std::bad_alloc &e) {
         | 
| 1318 | 
            -
                    Rcpp::stop("Insufficient memory.\n");
         | 
| 1319 | 
            -
                }
         | 
| 1320 1416 |  | 
| 1321 | 
            -
                if (!Rf_xlength(outp["serialized_obj"]))
         | 
| 1322 | 
            -
                    return outp;
         | 
| 1323 1417 | 
             
                if (return_outliers) {
         | 
| 1324 1418 | 
             
                    args_describe_outliers temp = {
         | 
| 1325 1419 | 
             
                        model_outputs.get(),
         | 
| @@ -1350,8 +1444,7 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric, | |
| 1350 1444 | 
             
                outp["nclust"] = Rcpp::unwindProtect(safe_int, (void*)&nclust_int);
         | 
| 1351 1445 | 
             
                outp["found_outliers"] = Rcpp::unwindProtect(safe_bool, (void*)&found_outliers);
         | 
| 1352 1446 |  | 
| 1353 | 
            -
                outp["ptr_model"] = Rcpp::unwindProtect( | 
| 1354 | 
            -
                model_outputs.release();
         | 
| 1447 | 
            +
                outp["ptr_model"] = Rcpp::unwindProtect(get_altrepped_pointer, &model_outputs);
         | 
| 1355 1448 | 
             
                return outp;
         | 
| 1356 1449 | 
             
            }
         | 
| 1357 1450 |  | 
| @@ -11,7 +11,7 @@ | |
| 11 11 | 
             
            *      arXiv preprint arXiv:2001.00636 (2020).
         | 
| 12 12 | 
             
            *    
         | 
| 13 13 | 
             
            *    
         | 
| 14 | 
            -
            *    Copyright 2020 David Cortes.
         | 
| 14 | 
            +
            *    Copyright 2020-2024 David Cortes.
         | 
| 15 15 | 
             
            *    
         | 
| 16 16 | 
             
            *    Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
         | 
| 17 17 | 
             
            *    such as R or Python.
         | 
| @@ -97,6 +97,10 @@ | |
| 97 97 | 
             
            *        Model parameter. Default is 2.67.
         | 
| 98 98 | 
             
            *    - z_outlier (in)
         | 
| 99 99 | 
             
            *        Model parameter. Default is 8.0. Must be greater than z_norm.
         | 
| 100 | 
            +
            *    - check_nonneg_outliers (in)
         | 
| 101 | 
            +
            *        Whether to add an extra check for possible outliers defined as having negative values while all
         | 
| 102 | 
            +
            *        the rest have positive values, regardless of how many standard deviations away they are.
         | 
| 103 | 
            +
            *        This is currently only done on the first cluster (no conditions on any variable).
         | 
| 100 104 | 
             
            *    
         | 
| 101 105 | 
             
            *    Returns:
         | 
| 102 106 | 
             
            *        - Whether there were any outliers detected.
         | 
| @@ -107,7 +111,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 107 111 | 
             
                                          size_t cluster_num, size_t tree_num, size_t tree_depth,
         | 
| 108 112 | 
             
                                          bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
         | 
| 109 113 | 
             
                                          double left_tail, double right_tail, double *restrict orig_x,
         | 
| 110 | 
            -
                                          double max_perc_outliers, double z_norm, double z_outlier | 
| 114 | 
            +
                                          double max_perc_outliers, double z_norm, double z_outlier,
         | 
| 115 | 
            +
                                          bool check_nonneg_outliers)
         | 
| 111 116 | 
             
            {
         | 
| 112 117 |  | 
| 113 118 | 
             
                /*  TODO: this function could try to determine if the distribution is multimodal, and if so,
         | 
| @@ -120,6 +125,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 120 125 | 
             
                /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
         | 
| 121 126 | 
             
                bool has_low_values  = false;
         | 
| 122 127 | 
             
                bool has_high_values = false;
         | 
| 128 | 
            +
                bool has_outlier_neg_values = false;
         | 
| 123 129 | 
             
                long double running_mean = 0;
         | 
| 124 130 | 
             
                long double running_ssq  = 0;
         | 
| 125 131 | 
             
                long double mean_prev    = 0;
         | 
| @@ -127,14 +133,15 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 127 133 | 
             
                double mean;
         | 
| 128 134 | 
             
                double sd;
         | 
| 129 135 | 
             
                size_t cnt;
         | 
| 130 | 
            -
                size_t  | 
| 136 | 
            +
                size_t tot           = end - st + 1;
         | 
| 137 | 
            +
                size_t tail_size     = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
         | 
| 131 138 | 
             
                size_t st_non_tail   = st  + tail_size;
         | 
| 132 139 | 
             
                size_t end_non_tail  = end - tail_size;
         | 
| 133 140 | 
             
                size_t st_normals    = 0;
         | 
| 134 141 | 
             
                size_t end_normals   = 0;
         | 
| 135 142 | 
             
                double min_gap = z_outlier - z_norm;
         | 
| 136 143 |  | 
| 137 | 
            -
                double curr_gap, next_gap,  | 
| 144 | 
            +
                double curr_gap, next_gap, lim_by_orig;
         | 
| 138 145 |  | 
| 139 146 | 
             
                /* Note: there is no good reason and no theory behind these numbers.
         | 
| 140 147 | 
             
                   TODO: find a better way of setting this */
         | 
| @@ -166,9 +173,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 166 173 | 
             
                if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
         | 
| 167 174 | 
             
                    sd *= 0.5;
         | 
| 168 175 | 
             
                }
         | 
| 169 | 
            -
                sd = std::fmax(sd, 1e-15);
         | 
| 170 176 | 
             
                while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
         | 
| 171 | 
            -
                    sd  | 
| 177 | 
            +
                    sd = std::nextafter(sd, std::numeric_limits<double>::infinity());
         | 
| 172 178 | 
             
                cluster.cluster_mean = mean;
         | 
| 173 179 | 
             
                cluster.cluster_sd = sd;
         | 
| 174 180 | 
             
                cnt = end - st + 1;
         | 
| @@ -212,10 +218,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 212 218 | 
             
                            cluster.display_lim_low = orig_x[ix_arr[row + 1]];
         | 
| 213 219 | 
             
                            cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
         | 
| 214 220 |  | 
| 215 | 
            -
                            eps = 1e-15;
         | 
| 216 221 | 
             
                            while (cluster.display_lim_low <= cluster.lower_lim) {
         | 
| 217 | 
            -
                                cluster.lower_lim  | 
| 218 | 
            -
                                eps *= 4;
         | 
| 222 | 
            +
                                cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
         | 
| 219 223 | 
             
                            }
         | 
| 220 224 | 
             
                            break;
         | 
| 221 225 | 
             
                        }
         | 
| @@ -225,6 +229,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 225 229 | 
             
                    if (st_normals == 0) {
         | 
| 226 230 | 
             
                        has_low_values = false;
         | 
| 227 231 | 
             
                    } else {
         | 
| 232 | 
            +
                        assign_low_outliers:
         | 
| 228 233 | 
             
                        for (size_t row = st; row < st_normals; row++) {
         | 
| 229 234 |  | 
| 230 235 | 
             
                            /* assign outlier if it's a better cluster than previously assigned */
         | 
| @@ -254,7 +259,23 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 254 259 | 
             
                        }
         | 
| 255 260 | 
             
                    }
         | 
| 256 261 | 
             
                }
         | 
| 257 | 
            -
                 | 
| 262 | 
            +
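                /* special type of outliers not based on standard deviations: when 'check_nonneg_outliers'
                   is set, no low outliers were found above (st_normals == 0), there are at least 500 rows,
                   the smallest value is negative and the largest is >= 2, then up to 1-3 negative values
                   (depending on the number of rows) are flagged as outliers, provided that the value right
                   after them is positive.
                   NOTE(review): 'st_normals' is assigned 'st + num_neg' below, yet 'display_lim_low' is then
                   read from 'ix_arr[st + st_normals]', which adds 'st' twice. That equals 'ix_arr[st_normals]'
                   only when st == 0 - confirm this always holds on the call path that enables this check. */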
         | 
| 263 | 
            +
                if (check_nonneg_outliers && st_normals == 0 && tot >= 500 && orig_x[ix_arr[st]] < 0. && orig_x[ix_arr[end]] >= 2.) {
         | 
| 264 | 
            +
                    size_t max_neg_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);
         | 
| 265 | 
            +
                    if (orig_x[ix_arr[st + max_neg_outliers]] > 0.) {
         | 
| 266 | 
            +
                        size_t num_neg = 0;
         | 
| 267 | 
            +
                        for (size_t row = st; row < st + max_neg_outliers; row++) {
         | 
| 268 | 
            +
                            num_neg += orig_x[ix_arr[row]] < 0.;
         | 
| 269 | 
            +
                        }
         | 
| 270 | 
            +
                        st_normals = st + num_neg;
         | 
| 271 | 
            +
                        cluster.lower_lim = 0.;
         | 
| 272 | 
            +
                        cluster.display_lim_low = orig_x[ix_arr[st + st_normals]];
         | 
| 273 | 
            +
                        cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
         | 
| 274 | 
            +
                        has_outlier_neg_values = true;
         | 
| 275 | 
            +
                        goto assign_low_outliers;
         | 
| 276 | 
            +
                    }
         | 
| 277 | 
            +
                }
         | 
| 278 | 
            +
                if (!has_low_values && !has_outlier_neg_values) {
         | 
| 258 279 | 
             
                    cluster.perc_above = 1.0;
         | 
| 259 280 | 
             
                    if (!is_log_transf && !is_exp_transf) {
         | 
| 260 281 |  | 
| @@ -271,10 +292,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 271 292 | 
             
                    }
         | 
| 272 293 |  | 
| 273 294 | 
             
                    if (cluster.lower_lim > -HUGE_VAL) {
         | 
| 274 | 
            -
                        eps = 1e-15;
         | 
| 275 295 | 
             
                        while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
         | 
| 276 | 
            -
                            cluster.lower_lim  | 
| 277 | 
            -
                            eps *= 4.;
         | 
| 296 | 
            +
                            cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
         | 
| 278 297 | 
             
                        }
         | 
| 279 298 | 
             
                    }
         | 
| 280 299 |  | 
| @@ -324,10 +343,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 324 343 | 
             
                            cluster.display_lim_high = orig_x[ix_arr[row - 1]];
         | 
| 325 344 | 
             
                            cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
         | 
| 326 345 |  | 
| 327 | 
            -
                            eps = 1e-15;
         | 
| 328 346 | 
             
                            while (cluster.display_lim_high >= cluster.upper_lim) {
         | 
| 329 | 
            -
                                cluster.upper_lim  | 
| 330 | 
            -
                                eps *= 4;
         | 
| 347 | 
            +
                                cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
         | 
| 331 348 | 
             
                            }
         | 
| 332 349 | 
             
                            break;
         | 
| 333 350 | 
             
                        }
         | 
| @@ -384,10 +401,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 384 401 | 
             
                    }
         | 
| 385 402 |  | 
| 386 403 | 
             
                    if (cluster.upper_lim < HUGE_VAL) {
         | 
| 387 | 
            -
                        eps = 1e-15;
         | 
| 388 404 | 
             
                        while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
         | 
| 389 | 
            -
                            cluster.upper_lim  | 
| 390 | 
            -
                            eps *= 4.;
         | 
| 405 | 
            +
                            cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
         | 
| 391 406 | 
             
                        }
         | 
| 392 407 | 
             
                    }
         | 
| 393 408 |  | 
| @@ -406,8 +421,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 406 421 | 
             
                }
         | 
| 407 422 |  | 
| 408 423 | 
             
                /* save displayed statistics for cluster */
         | 
| 409 | 
            -
                if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
         | 
| 410 | 
            -
                    size_t st_disp  = has_low_values?  st_normals  : st;
         | 
| 424 | 
            +
                if (has_high_values || has_low_values || is_log_transf || is_exp_transf || has_outlier_neg_values) {
         | 
| 425 | 
            +
                    size_t st_disp  = (has_low_values || has_outlier_neg_values)?  st_normals  : st;
         | 
| 411 426 | 
             
                    size_t end_disp = has_high_values? end_normals : end;
         | 
| 412 427 | 
             
                    running_mean = 0;
         | 
| 413 428 | 
             
                    running_ssq  = 0;
         | 
| @@ -428,7 +443,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 428 443 | 
             
                }
         | 
| 429 444 |  | 
| 430 445 | 
             
                /* report whether outliers were found or not */
         | 
| 431 | 
            -
                return has_low_values || has_high_values;
         | 
| 446 | 
            +
                return has_low_values || has_high_values || has_outlier_neg_values;
         | 
| 432 447 | 
             
            }
         | 
| 433 448 |  | 
| 434 449 |  | 
| @@ -11,7 +11,7 @@ | |
| 11 11 | 
             
            *      arXiv preprint arXiv:2001.00636 (2020).
         | 
| 12 12 | 
             
            *    
         | 
| 13 13 | 
             
            *    
         | 
| 14 | 
            -
            *    Copyright 2020 David Cortes.
         | 
| 14 | 
            +
            *    Copyright 2020-2024 David Cortes.
         | 
| 15 15 | 
             
            *    
         | 
| 16 16 | 
             
            *    Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
         | 
| 17 17 | 
             
            *    such as R or Python.
         | 
| @@ -190,9 +190,9 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 190 190 | 
             
                model_outputs.start_ix_cat_counts[0] = 0;
         | 
| 191 191 | 
             
                if (tot_cols > ncols_numeric) {
         | 
| 192 192 | 
             
                    input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
         | 
| 193 | 
            -
                                                                       | 
| 193 | 
            +
                                                                      input_data.skip_col.data() + ncols_numeric);
         | 
| 194 194 | 
             
                    input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
         | 
| 195 | 
            -
                                                                       | 
| 195 | 
            +
                                                                      input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
         | 
| 196 196 | 
             
                } else {
         | 
| 197 197 | 
             
                    input_data.max_categ = 0;
         | 
| 198 198 | 
             
                }
         | 
| @@ -209,42 +209,39 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 209 209 |  | 
| 210 210 | 
             
                /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
         | 
| 211 211 | 
             
                if (tot_cols > ncols_numeric) {
         | 
| 212 | 
            -
                    #pragma omp parallel
         | 
| 212 | 
            +
                    #pragma omp parallel sections if(nthreads > 1)
         | 
| 213 213 | 
             
                    {
         | 
| 214 | 
            -
                        #pragma omp sections
         | 
| 215 | 
            -
                        {
         | 
| 216 214 |  | 
| 217 | 
            -
             | 
| 218 | 
            -
             | 
| 219 | 
            -
             | 
| 220 | 
            -
             | 
| 221 | 
            -
             | 
| 222 | 
            -
             | 
| 223 | 
            -
             | 
| 224 | 
            -
             | 
| 225 | 
            -
             | 
| 226 | 
            -
             | 
| 227 | 
            -
             | 
| 228 | 
            -
             | 
| 229 | 
            -
             | 
| 215 | 
            +
                        #pragma omp section
         | 
| 216 | 
            +
                        {
         | 
| 217 | 
            +
                            if (ncols_categ > 0) {
         | 
| 218 | 
            +
                                calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
         | 
| 219 | 
            +
                                                         input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
         | 
| 220 | 
            +
                                                         input_data.has_NA.data() + ncols_numeric, input_data.skip_col.data() + input_data.ncols_numeric,
         | 
| 221 | 
            +
                                                         std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
         | 
| 222 | 
            +
             | 
| 223 | 
            +
                                check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
         | 
| 224 | 
            +
                                                           input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
         | 
| 225 | 
            +
                                                           input_data.skip_col.data() + input_data.ncols_numeric,
         | 
| 226 | 
            +
                                                           std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
         | 
| 227 | 
            +
                            }
         | 
| 230 228 |  | 
| 231 229 |  | 
| 232 | 
            -
             | 
| 230 | 
            +
                        }
         | 
| 233 231 |  | 
| 234 | 
            -
             | 
| 235 | 
            -
             | 
| 236 | 
            -
             | 
| 237 | 
            -
             | 
| 238 | 
            -
             | 
| 239 | 
            -
             | 
| 240 | 
            -
             | 
| 241 | 
            -
             | 
| 242 | 
            -
             | 
| 243 | 
            -
             | 
| 244 | 
            -
             | 
| 245 | 
            -
             | 
| 246 | 
            -
             | 
| 247 | 
            -
                                }
         | 
| 232 | 
            +
                        #pragma omp section
         | 
| 233 | 
            +
                        {
         | 
| 234 | 
            +
                            if (ncols_ord > 0) {
         | 
| 235 | 
            +
                                calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
         | 
| 236 | 
            +
                                                         input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
         | 
| 237 | 
            +
                                                         input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
         | 
| 238 | 
            +
                                                         input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
         | 
| 239 | 
            +
                                                         std::max((int)1, nthreads - (int)input_data.ncols_categ) );
         | 
| 240 | 
            +
             | 
| 241 | 
            +
                                check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
         | 
| 242 | 
            +
                                                           ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
         | 
| 243 | 
            +
                                                           input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
         | 
| 244 | 
            +
                                                           std::max((int)1, nthreads - (int)input_data.ncols_categ));
         | 
| 248 245 | 
             
                            }
         | 
| 249 246 | 
             
                        }
         | 
| 250 247 |  | 
| @@ -260,13 +257,13 @@ bool fit_outliers_models(ModelOutputs &model_outputs, | |
| 260 257 |  | 
| 261 258 | 
             
                /* for numerical columns, check if they have NAs or if total variance is too small */
         | 
| 262 259 | 
             
                check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
         | 
| 263 | 
            -
                                           | 
| 260 | 
            +
                                          input_data.has_NA.data(), input_data.skip_col.data(),
         | 
| 264 261 | 
             
                                          model_outputs.min_decimals_col.data(), nthreads);
         | 
| 265 262 |  | 
| 266 263 | 
             
                /* determine an approximate size for the output clusters, and reserve memory right away */
         | 
| 267 264 | 
             
                model_outputs.all_clusters.resize(tot_cols);
         | 
| 268 265 | 
             
                model_outputs.all_trees.resize(tot_cols);
         | 
| 269 | 
            -
                #pragma omp parallel for shared(model_outputs, input_data, model_params, tot_cols)
         | 
| 266 | 
            +
                #pragma omp parallel for num_threads(nthreads) shared(model_outputs, input_data, model_params, tot_cols)
         | 
| 270 267 | 
             
                for (size_t_for col = 0; col < tot_cols; col++) {
         | 
| 271 268 | 
             
                    if (input_data.skip_col[col]) continue;
         | 
| 272 269 | 
             
                    if (cols_ignore != NULL && cols_ignore[col]) continue;
         | 
| @@ -555,7 +552,8 @@ void process_numeric_col(std::vector<Cluster> &cluster_root, | |
| 555 552 | 
             
                                                                      workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 556 553 | 
             
                                                                      workspace.orig_mean, workspace.orig_sd,
         | 
| 557 554 | 
             
                                                                      workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 558 | 
            -
                                                                      model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 555 | 
            +
                                                                      model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 556 | 
            +
                                                                      true);
         | 
| 559 557 | 
             
                workspace.tree->back().clusters.push_back(0);
         | 
| 560 558 |  | 
| 561 559 | 
             
                /* remove outliers if any were found */
         | 
| @@ -639,7 +637,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 639 637 | 
             
                                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 640 638 | 
             
                                                                              workspace.orig_mean, workspace.orig_sd,
         | 
| 641 639 | 
             
                                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 642 | 
            -
                                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 640 | 
            +
                                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 641 | 
            +
                                                                              false);
         | 
| 643 642 | 
             
                            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 644 643 |  | 
| 645 644 | 
             
                            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -666,7 +665,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 666 665 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 667 666 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| 668 667 | 
             
                                                                          workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 669 | 
            -
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 668 | 
            +
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 669 | 
            +
                                                                          false);
         | 
| 670 670 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 671 671 |  | 
| 672 672 | 
             
                        if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -690,7 +690,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 690 690 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 691 691 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| 692 692 | 
             
                                                                          workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 693 | 
            -
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 693 | 
            +
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 694 | 
            +
                                                                          false);
         | 
| 694 695 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 695 696 |  | 
| 696 697 | 
             
                        if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -753,7 +754,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 753 754 | 
             
                                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 754 755 | 
             
                                                                              workspace.orig_mean, workspace.orig_sd,
         | 
| 755 756 | 
             
                                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 756 | 
            -
                                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 757 | 
            +
                                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 758 | 
            +
                                                                              false);
         | 
| 757 759 | 
             
                            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 758 760 |  | 
| 759 761 | 
             
                            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -780,7 +782,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 780 782 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 781 783 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| 782 784 | 
             
                                                                          workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 783 | 
            -
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 785 | 
            +
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 786 | 
            +
                                                                          false);
         | 
| 784 787 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 785 788 |  | 
| 786 789 | 
             
                        if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -808,7 +811,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 808 811 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 809 812 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| 810 813 | 
             
                                                                          workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 811 | 
            -
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 814 | 
            +
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 815 | 
            +
                                                                          false);
         | 
| 812 816 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 813 817 |  | 
| 814 818 | 
             
                        if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -874,7 +878,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 874 878 | 
             
                                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 875 879 | 
             
                                                                              workspace.orig_mean, workspace.orig_sd,
         | 
| 876 880 | 
             
                                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 877 | 
            -
                                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 881 | 
            +
                                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 882 | 
            +
                                                                              false);
         | 
| 878 883 | 
             
                            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 879 884 |  | 
| 880 885 | 
             
                            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -901,7 +906,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 901 906 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 902 907 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| 903 908 | 
             
                                                                          workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 904 | 
            -
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 909 | 
            +
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 910 | 
            +
                                                                          false);
         | 
| 905 911 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 906 912 |  | 
| 907 913 | 
             
                        if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -931,7 +937,8 @@ void recursive_split_numeric(Workspace &workspace, | |
| 931 937 | 
             
                                                                          workspace.log_transf, workspace.log_minval, workspace.exp_transf,
         | 
| 932 938 | 
             
                                                                          workspace.orig_mean, workspace.orig_sd,
         | 
| 933 939 | 
             
                                                                          workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
         | 
| 934 | 
            -
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier | 
| 940 | 
            +
                                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
         | 
| 941 | 
            +
                                                                          false);
         | 
| 935 942 | 
             
                        workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
         | 
| 936 943 |  | 
| 937 944 | 
             
                        if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
         | 
| @@ -39,7 +39,7 @@ | |
| 39 39 | 
             
            *    at which position will the counts for a given column start. Note that NAs are stored as the last index in each
         | 
| 40 40 | 
             
            *    column, so each one needs one extra category
         | 
| 41 41 | 
             
            */
         | 
| 42 | 
            -
            int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols,  | 
| 42 | 
            +
            int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ)
         | 
| 43 43 | 
             
            {
         | 
| 44 44 | 
             
                for (size_t col = 0; col < ncols; col++) {
         | 
| 45 45 | 
             
                    max_categ = std::max(ncat[col], max_categ);
         | 
| @@ -53,7 +53,7 @@ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t | |
| 53 53 | 
             
            /* Save the counts of each category for each column in the array determined above */
         | 
| 54 54 | 
             
            void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
         | 
| 55 55 | 
             
                                          int categorical_data[], size_t ncols, size_t nrows,
         | 
| 56 | 
            -
                                           | 
| 56 | 
            +
                                          char has_NA[], char skip_col[], int nthreads)
         | 
| 57 57 | 
             
            {
         | 
| 58 58 | 
             
                size_t col_st_offset;
         | 
| 59 59 | 
             
                size_t col_stop;
         | 
| @@ -80,7 +80,7 @@ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], | |
| 80 80 |  | 
| 81 81 | 
             
            /* Check if some column has a large majority that would make any split fail to meet minimum sizes */
         | 
| 82 82 | 
             
            void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
         | 
| 83 | 
            -
                                            size_t ncols, size_t min_conditioned_size, size_t nrows,  | 
| 83 | 
            +
                                            size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads)
         | 
| 84 84 | 
             
            {
         | 
| 85 85 | 
             
                size_t largest_cnt;
         | 
| 86 86 | 
             
                #pragma omp parallel for num_threads(nthreads) private(largest_cnt) shared(ncols, nrows, ncat, cat_counts, start_ix_cat_counts, min_conditioned_size, skip_col)
         | 
| @@ -127,8 +127,8 @@ void calculate_lowerlim_proportion(long double *restrict prop_small, long double | |
| 127 127 |  | 
| 128 128 | 
             
            /* Check if a numerical column has no variance (i.e. will not be splittable).
         | 
| 129 129 | 
             
               Along the way, also record the number of decimals to display for this column. */
         | 
| 130 | 
            -
            void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows,  | 
| 131 | 
            -
                                            | 
| 130 | 
            +
            void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
         | 
| 131 | 
            +
                                           char skip_col[], int min_decimals[], int nthreads)
         | 
| 132 132 | 
             
            {
         | 
| 133 133 | 
             
                long double running_mean;
         | 
| 134 134 | 
             
                long double mean_prev;
         | 
| @@ -680,18 +680,6 @@ int decimals_diff(double val1, double val2) | |
| 680 680 | 
             
                return (int) res;
         | 
| 681 681 | 
             
            }
         | 
| 682 682 |  | 
| 683 | 
            -
             | 
| 684 | 
            -
            /* Reason behind this function: Cython (as of v0.29) will not auto-deallocate
         | 
| 685 | 
            -
               structs which are part of a cdef'd class, which produces a memory leak
         | 
| 686 | 
            -
               but can be force-destructed. Unfortunately, Cython itself doesn't even
         | 
| 687 | 
            -
               allow calling destructors for structs, so it has to be done externally.
         | 
| 688 | 
            -
               This function should otherwise have no reason to exist.
         | 
| 689 | 
            -
            */
         | 
| 690 | 
            -
            void dealloc_ModelOutputs(ModelOutputs &model_outputs)
         | 
| 691 | 
            -
            {
         | 
| 692 | 
            -
                model_outputs.~ModelOutputs();
         | 
| 693 | 
            -
            }
         | 
| 694 | 
            -
             | 
| 695 683 | 
             
            ModelOutputs get_empty_ModelOutputs()
         | 
| 696 684 | 
             
            {
         | 
| 697 685 | 
             
                return ModelOutputs();
         | 
| @@ -733,7 +733,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 733 733 | 
             
                                          size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
         | 
| 734 734 | 
             
                                          bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
         | 
| 735 735 | 
             
                                          double left_tail, double right_tail, double *restrict orig_x,
         | 
| 736 | 
            -
                                          double max_perc_outliers, double z_norm, double z_outlier | 
| 736 | 
            +
                                          double max_perc_outliers, double z_norm, double z_outlier,
         | 
| 737 | 
            +
                                          bool check_nonneg_outliers);
         | 
| 737 738 | 
             
            void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
         | 
| 738 739 | 
             
                                              double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
         | 
| 739 740 | 
             
                                              size_t *restrict outlier_depth, Cluster &cluster,
         | 
| @@ -802,17 +803,17 @@ typedef struct { | |
| 802 803 | 
             
            } RecursionState;
         | 
| 803 804 |  | 
| 804 805 |  | 
| 805 | 
            -
            int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols,  | 
| 806 | 
            +
            int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ = 0);
         | 
| 806 807 | 
             
            void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
         | 
| 807 808 | 
             
                                          int categorical_data[], size_t ncols, size_t nrows,
         | 
| 808 | 
            -
                                           | 
| 809 | 
            +
                                          char has_NA[], char skip_col[], int nthreads);
         | 
| 809 810 | 
             
            void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
         | 
| 810 | 
            -
                                            size_t ncols, size_t min_conditioned_size, size_t nrows,  | 
| 811 | 
            +
                                            size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads);
         | 
| 811 812 | 
             
            void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
         | 
| 812 813 | 
             
                                               size_t start_ix_cat_counts[], size_t cat_counts[],
         | 
| 813 814 | 
             
                                               size_t ncols, size_t nrows, double z_norm, double z_tail);
         | 
| 814 | 
            -
            void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows,  | 
| 815 | 
            -
                                            | 
| 815 | 
            +
            void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
         | 
| 816 | 
            +
                                           char skip_col[], int min_decimals[], int nthreads);
         | 
| 816 817 | 
             
            void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central);
         | 
| 817 818 | 
             
            void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
         | 
| 818 819 | 
             
                                 double z_norm, double max_perc_outliers,
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: outliertree
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0. | 
| 4 | 
            +
              version: 0.4.0
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - Andrew Kane
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: bin
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date:  | 
| 11 | 
            +
            date: 2024-06-12 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: rice
         | 
| @@ -16,14 +16,14 @@ dependencies: | |
| 16 16 | 
             
                requirements:
         | 
| 17 17 | 
             
                - - ">="
         | 
| 18 18 | 
             
                  - !ruby/object:Gem::Version
         | 
| 19 | 
            -
                    version: 4. | 
| 19 | 
            +
                    version: '4.3'
         | 
| 20 20 | 
             
              type: :runtime
         | 
| 21 21 | 
             
              prerelease: false
         | 
| 22 22 | 
             
              version_requirements: !ruby/object:Gem::Requirement
         | 
| 23 23 | 
             
                requirements:
         | 
| 24 24 | 
             
                - - ">="
         | 
| 25 25 | 
             
                  - !ruby/object:Gem::Version
         | 
| 26 | 
            -
                    version: 4. | 
| 26 | 
            +
                    version: '4.3'
         | 
| 27 27 | 
             
            description:
         | 
| 28 28 | 
             
            email: andrew@ankane.org
         | 
| 29 29 | 
             
            executables: []
         | 
| @@ -68,14 +68,14 @@ required_ruby_version: !ruby/object:Gem::Requirement | |
| 68 68 | 
             
              requirements:
         | 
| 69 69 | 
             
              - - ">="
         | 
| 70 70 | 
             
                - !ruby/object:Gem::Version
         | 
| 71 | 
            -
                  version: ' | 
| 71 | 
            +
                  version: '3.1'
         | 
| 72 72 | 
             
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 73 73 | 
             
              requirements:
         | 
| 74 74 | 
             
              - - ">="
         | 
| 75 75 | 
             
                - !ruby/object:Gem::Version
         | 
| 76 76 | 
             
                  version: '0'
         | 
| 77 77 | 
             
            requirements: []
         | 
| 78 | 
            -
            rubygems_version: 3. | 
| 78 | 
            +
            rubygems_version: 3.5.9
         | 
| 79 79 | 
             
            signing_key:
         | 
| 80 80 | 
             
            specification_version: 4
         | 
| 81 81 | 
             
            summary: Explainable outlier/anomaly detection for Ruby
         |