outliertree 0.3.0 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2851b4b56b23141bc9f1ef5b3c448fb75d785ce0e7b38580113001898ce18e2e
4
- data.tar.gz: 817325392325bc61f1dea1363096678fe9fd578ec6026301e173447edc522752
3
+ metadata.gz: 107a39daf1b8743880c65c0c9bd20f6b2430687a843aa3394e4f57ba38b58766
4
+ data.tar.gz: 81e5e13612dd119624a6ec12652b048002c0c2103ee6389709682fb6bcb27e5e
5
5
  SHA512:
6
- metadata.gz: e1bc84c131959bb7260100b4aa1e345ae09330600252299084efe5d40fbff8d8d9d9aadf78c6384a0b6a158a1b3bbcdeff9de3ce71fca90ce938003125b37898
7
- data.tar.gz: cad988456c492f101bc71334997217a48d6588c4e2e5feee333bc074f3b14b62557d0e64429fca787705244b701eca7d685aa17f090dad94578fce172812e5a0
6
+ metadata.gz: 2a8c6276389a465d548b7b06e7933e64094059960301b4393015bd906dd8deed361887876c152017bc2427fe54b81271e076de24f3e1df801f8f0c330a6c0f76
7
+ data.tar.gz: 27b9eb4c42adc7abf6c905ec3c787f6947aae6475ecb37283c9b00e560ebb49a8a6bd7ebacfce2c636ba289f014b6dd87821d65311cd3a8640700a4dae44464d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
1
+ ## 0.4.0 (2024-06-11)
2
+
3
+ - Updated OutlierTree to 1.9.0
4
+ - Dropped support for Ruby < 3.1
5
+
6
+ ## 0.3.1 (2023-12-19)
7
+
8
+ - Updated OutlierTree to 1.8.2
9
+ - Fixed error with Rice 4.1
10
+
1
11
  ## 0.3.0 (2022-06-13)
2
12
 
3
13
  - Updated OutlierTree to 1.8.1
data/README.md CHANGED
@@ -10,7 +10,7 @@ Price (2.50) looks low given Department is Books and Sale is false
10
10
 
11
11
  :evergreen_tree: Check out [IsoTree](https://github.com/ankane/isotree-ruby) for an alternative approach that uses Isolation Forest
12
12
 
13
- [![Build Status](https://github.com/ankane/outliertree-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/outliertree-ruby/actions)
13
+ [![Build Status](https://github.com/ankane/outliertree-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/outliertree-ruby/actions)
14
14
 
15
15
  ## Installation
16
16
 
@@ -30,6 +30,29 @@ namespace Rice::detail
30
30
  }
31
31
  };
32
32
 
33
+ template<>
34
+ class To_Ruby<std::vector<signed char>>
35
+ {
36
+ public:
37
+ VALUE convert(std::vector<signed char> const & x)
38
+ {
39
+ auto a = rb_ary_new2(x.size());
40
+ for (const auto& v : x) {
41
+ rb_ary_push(a, To_Ruby<signed char>().convert(v));
42
+ }
43
+ return a;
44
+ }
45
+ };
46
+
47
+ template<>
48
+ struct Type<std::vector<signed char>>
49
+ {
50
+ static bool verify()
51
+ {
52
+ return true;
53
+ }
54
+ };
55
+
33
56
  template<>
34
57
  struct Type<ColType>
35
58
  {
@@ -1,3 +1,3 @@
1
1
  module OutlierTree
2
- VERSION = "0.3.0"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/outliertree.rb CHANGED
@@ -5,10 +5,10 @@ require "outliertree/ext"
5
5
  require "etc"
6
6
 
7
7
  # modules
8
- require "outliertree/dataset"
9
- require "outliertree/model"
10
- require "outliertree/result"
11
- require "outliertree/version"
8
+ require_relative "outliertree/dataset"
9
+ require_relative "outliertree/model"
10
+ require_relative "outliertree/result"
11
+ require_relative "outliertree/version"
12
12
 
13
13
  module OutlierTree
14
14
  def self.new(**options)
@@ -58,12 +58,18 @@ Procedure is described in more detail in [Explainable outlier detection through
58
58
  # Installation
59
59
 
60
60
  * For R:
61
+
62
+ **Note:** This package benefits from extra optimizations that aren't enabled by default for R packages. See [this guide](https://github.com/david-cortes/installing-optimized-libraries) for instructions on how to enable them.
63
+
61
64
  ```r
62
65
  install.packages("outliertree")
63
66
  ```
64
67
 
65
68
 
66
69
  * For Python:
70
+
71
+ **Note:** requires C/C++ compilers configured for Python. See [this guide](https://github.com/david-cortes/installing-optimized-libraries) for instructions.
72
+
67
73
  ```
68
74
  pip install outliertree
69
75
  ```
@@ -77,22 +83,21 @@ pip install --no-use-pep517 outliertree
77
83
  ```
78
84
  brew install libomp
79
85
  ```
80
- And then reinstall this package: `pip install --force-reinstall outliertree`.
86
+ And then reinstall this package: `pip install --upgrade --no-deps --force-reinstall outliertree`.
81
87
 
82
88
  ** *
83
- **IMPORTANT:** the setup script will try to add compilation flag `-march=native`. This instructs the compiler to tune the package for the CPU in which it is being installed, but the result might not be usable in other computers. If building a binary wheel of this package or putting it into a docker image which will be used in different machines, this can be overriden by manually supplying compilation `CFLAGS` and `CXXFLAGS` as environment variables with something related to architecture. For maximum compatibility (but slowest speed), assuming `x86-64` computers, it's possible to do something like this:
89
+ **IMPORTANT:** the setup script will try to add compilation flag `-march=native`. This instructs the compiler to tune the package for the CPU in which it is being installed (by e.g. using AVX instructions if available), but the result might not be usable in other computers. If building a binary wheel of this package or putting it into a docker image which will be used in different machines, this can be overridden either by (a) defining an environment variable `DONT_SET_MARCH=1`, or by (b) manually supplying compilation `CFLAGS` as an environment variable with something related to architecture. For maximum compatibility (but slowest speed), it's possible to do something like this:
84
90
 
85
91
  ```
86
- export CFLAGS="-march=x86-64"
87
- export CXXFLAGS="-march=x86-64"
92
+ export DONT_SET_MARCH=1
88
93
  pip install outliertree
89
94
  ```
90
95
 
91
- or for creating wheels:
96
+ or, by specifying some compilation flag for architecture:
92
97
  ```
93
98
  export CFLAGS="-march=x86-64"
94
99
  export CXXFLAGS="-march=x86-64"
95
- python setup.py bwheel
100
+ pip install outliertree
96
101
  ```
97
102
  ** *
98
103
 
@@ -134,7 +139,7 @@ summary(new_outliers)
134
139
  ```
135
140
  (see documentation for more examples)
136
141
 
137
- Example [RMarkdown](http://htmlpreview.github.io/?https://github.com/david-cortes/outliertree/blob/master/example/titanic_outliertree_r.html) using the Titanic dataset.
142
+ Example [RMarkdown](http://htmlpreview.github.io/?https://github.com/david-cortes/outliertree/blob/master/vignettes/Explainable_Outlier_Detection_in_Titanic_dataset.html) using the Titanic dataset.
138
143
 
139
144
 
140
145
  * For Python:
@@ -1,4 +1,3 @@
1
- PKG_CPPFLAGS = -D_FOR_R @SUPPORTS_RESTRICT@
1
+ PKG_CPPFLAGS = -DRCPP_USE_UNWIND_PROTECT -D_FOR_R @SUPPORTS_RESTRICT@
2
2
  PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) @FNE_FLAG@ @FNTP_FLAG@ $(CXX_VISIBILITY)
3
3
  PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)
4
- CXX_STD = CXX11
@@ -1,4 +1,3 @@
1
- PKG_CPPFLAGS = -D_FOR_R
1
+ PKG_CPPFLAGS = -DRCPP_USE_UNWIND_PROTECT -D_FOR_R
2
2
  PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS) -fno-trapping-math -fno-math-errno
3
3
  PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)
4
- CXX_STD = CXX11
@@ -10,27 +10,6 @@ Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
10
10
  Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
11
11
  #endif
12
12
 
13
- // deserialize_OutlierTree
14
- SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj);
15
- RcppExport SEXP _outliertree_deserialize_OutlierTree(SEXP srcSEXP, SEXP ptr_objSEXP) {
16
- BEGIN_RCPP
17
- Rcpp::RObject rcpp_result_gen;
18
- Rcpp::traits::input_parameter< Rcpp::RawVector >::type src(srcSEXP);
19
- Rcpp::traits::input_parameter< SEXP >::type ptr_obj(ptr_objSEXP);
20
- rcpp_result_gen = Rcpp::wrap(deserialize_OutlierTree(src, ptr_obj));
21
- return rcpp_result_gen;
22
- END_RCPP
23
- }
24
- // check_null_ptr_model
25
- Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model);
26
- RcppExport SEXP _outliertree_check_null_ptr_model(SEXP ptr_modelSEXP) {
27
- BEGIN_RCPP
28
- Rcpp::RObject rcpp_result_gen;
29
- Rcpp::traits::input_parameter< SEXP >::type ptr_model(ptr_modelSEXP);
30
- rcpp_result_gen = Rcpp::wrap(check_null_ptr_model(ptr_model));
31
- return rcpp_result_gen;
32
- END_RCPP
33
- }
34
13
  // fit_OutlierTree
35
14
  Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric, Rcpp::IntegerVector arr_cat, size_t ncols_categ, Rcpp::IntegerVector ncat, Rcpp::IntegerVector arr_ord, size_t ncols_ord, Rcpp::IntegerVector ncat_ord, size_t nrows, Rcpp::LogicalVector cols_ignore_r, int nthreads, bool categ_as_bin, bool ord_as_bin, bool cat_bruteforce_subset, bool categ_from_maj, bool take_mid, size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ, double min_gain, bool follow_all, bool gain_as_pct, double z_norm, double z_outlier, bool return_outliers, Rcpp::ListOf<Rcpp::StringVector> cat_levels, Rcpp::ListOf<Rcpp::StringVector> ord_levels, Rcpp::StringVector colnames_num, Rcpp::StringVector colnames_cat, Rcpp::StringVector colnames_ord, Rcpp::NumericVector min_date, Rcpp::NumericVector min_ts);
36
15
  RcppExport SEXP _outliertree_fit_OutlierTree(SEXP arr_numSEXP, SEXP ncols_numericSEXP, SEXP arr_catSEXP, SEXP ncols_categSEXP, SEXP ncatSEXP, SEXP arr_ordSEXP, SEXP ncols_ordSEXP, SEXP ncat_ordSEXP, SEXP nrowsSEXP, SEXP cols_ignore_rSEXP, SEXP nthreadsSEXP, SEXP categ_as_binSEXP, SEXP ord_as_binSEXP, SEXP cat_bruteforce_subsetSEXP, SEXP categ_from_majSEXP, SEXP take_midSEXP, SEXP max_depthSEXP, SEXP max_perc_outliersSEXP, SEXP min_size_numericSEXP, SEXP min_size_categSEXP, SEXP min_gainSEXP, SEXP follow_allSEXP, SEXP gain_as_pctSEXP, SEXP z_normSEXP, SEXP z_outlierSEXP, SEXP return_outliersSEXP, SEXP cat_levelsSEXP, SEXP ord_levelsSEXP, SEXP colnames_numSEXP, SEXP colnames_catSEXP, SEXP colnames_ordSEXP, SEXP min_dateSEXP, SEXP min_tsSEXP) {
@@ -119,8 +98,6 @@ END_RCPP
119
98
  }
120
99
 
121
100
  static const R_CallMethodDef CallEntries[] = {
122
- {"_outliertree_deserialize_OutlierTree", (DL_FUNC) &_outliertree_deserialize_OutlierTree, 2},
123
- {"_outliertree_check_null_ptr_model", (DL_FUNC) &_outliertree_check_null_ptr_model, 1},
124
101
  {"_outliertree_fit_OutlierTree", (DL_FUNC) &_outliertree_fit_OutlierTree, 33},
125
102
  {"_outliertree_predict_OutlierTree", (DL_FUNC) &_outliertree_predict_OutlierTree, 13},
126
103
  {"_outliertree_check_few_values", (DL_FUNC) &_outliertree_check_few_values, 4},
@@ -128,7 +105,9 @@ static const R_CallMethodDef CallEntries[] = {
128
105
  {NULL, NULL, 0}
129
106
  };
130
107
 
108
+ void init_altrepped_class(DllInfo* dll);
131
109
  RcppExport void R_init_outliertree(DllInfo *dll) {
132
110
  R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
133
111
  R_useDynamicSymbols(dll, FALSE);
112
+ init_altrepped_class(dll);
134
113
  }
@@ -2,8 +2,8 @@
2
2
 
3
3
  #include <Rcpp.h>
4
4
  #include <Rcpp/unwindProtect.h>
5
+ #include <R_ext/Altrep.h>
5
6
  // [[Rcpp::plugins(cpp11)]]
6
- // [[Rcpp::plugins(unwindProtect)]]
7
7
 
8
8
  /* This is to serialize the model objects */
9
9
  // [[Rcpp::depends(Rcereal)]]
@@ -16,18 +16,26 @@
16
16
  /* This is the package's header */
17
17
  #include "outlier_tree.hpp"
18
18
 
19
+ void delete_model_from_R_ptr(SEXP R_ptr)
20
+ {
21
+ ModelOutputs *model = static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));
22
+ delete model;
23
+ R_SetExternalPtrAddr(R_ptr, nullptr);
24
+ R_ClearExternalPtr(R_ptr);
25
+ }
26
+
19
27
  SEXP alloc_RawVec(void *data)
20
28
  {
21
29
  size_t vec_size = *(size_t*)data;
22
30
  if (vec_size > (size_t)std::numeric_limits<R_xlen_t>::max())
23
31
  Rcpp::stop("Resulting model is too large for R to handle.");
24
- return Rcpp::RawVector((R_xlen_t)vec_size);
32
+ return Rf_allocVector(RAWSXP, vec_size);
25
33
  }
26
34
 
27
35
  /* for model serialization and re-usage in R */
28
36
  /* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
29
37
  /* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
30
- Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
38
+ SEXP serialize_OutlierTree(ModelOutputs *model_outputs)
31
39
  {
32
40
  std::stringstream ss;
33
41
  {
@@ -37,35 +45,20 @@ Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
37
45
  ss.seekg(0, ss.end);
38
46
  std::stringstream::pos_type vec_size = ss.tellg();
39
47
  if (vec_size <= 0) {
40
- Rcpp::Rcerr << "Error: model is too big to serialize, resulting object will not be usable.\n" << std::endl;
41
- return Rcpp::RawVector();
48
+ Rf_error("Error: model is too big to serialize, resulting object will not be usable.\n");
42
49
  }
43
50
  size_t vec_size_ = (size_t)vec_size;
44
- Rcpp::RawVector retval = Rcpp::unwindProtect(alloc_RawVec, (void*)&vec_size_);
45
- if (!retval.size())
46
- return retval;
51
+ SEXP retval = PROTECT(Rcpp::unwindProtect(alloc_RawVec, (void*)&vec_size_));
47
52
  ss.seekg(0, ss.beg);
48
- ss.read(reinterpret_cast<char*>(RAW(retval)), retval.size());
53
+ ss.read(reinterpret_cast<char*>(RAW(retval)), vec_size_);
54
+ UNPROTECT(1);
49
55
  return retval;
50
56
  }
51
57
 
52
- SEXP safe_XPtr(void *model_ptr)
53
- {
54
- return Rcpp::XPtr<ModelOutputs>((ModelOutputs*)model_ptr, true);
55
- }
56
-
57
- void R_delete_model(SEXP R_ptr)
58
- {
59
- ModelOutputs *model = static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));
60
- delete model;
61
- R_ClearExternalPtr(R_ptr);
62
- }
63
-
64
- // [[Rcpp::export(rng = false)]]
65
- SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj)
58
+ SEXP deserialize_OutlierTree(SEXP src, SEXP ptr_obj)
66
59
  {
67
60
  std::stringstream ss;
68
- ss.write(reinterpret_cast<char*>(RAW(src)), src.size());
61
+ ss.write(reinterpret_cast<char*>(RAW(src)), Rf_xlength(src));
69
62
  ss.seekg(0, ss.beg);
70
63
  std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
71
64
  {
@@ -73,25 +66,134 @@ SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj)
73
66
  iarchive(*model_outputs);
74
67
  }
75
68
  R_SetExternalPtrAddr(ptr_obj, model_outputs.get());
76
- R_RegisterCFinalizerEx(ptr_obj, R_delete_model, TRUE);
69
+ R_RegisterCFinalizerEx(ptr_obj, delete_model_from_R_ptr, TRUE);
77
70
  model_outputs.release();
78
71
  return R_NilValue;
79
72
  }
80
73
 
81
- SEXP safe_int(void *x)
74
+ static R_altrep_class_t otree_altrepped_pointer_class;
75
+
76
+ R_xlen_t altrepped_pointer_length(SEXP obj)
82
77
  {
83
- return Rcpp::wrap(*(int*)x);
78
+ return 1;
84
79
  }
85
80
 
86
- SEXP safe_bool(void *x)
81
+ SEXP get_element_from_altrepped_ptr(SEXP R_altrepped_obj, R_xlen_t idx)
87
82
  {
88
- return Rcpp::wrap(*(bool*)x);
83
+ return R_altrep_data1(R_altrepped_obj);
89
84
  }
90
85
 
91
- // [[Rcpp::export(rng = false)]]
92
- Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
86
+ Rboolean inspect_altrepped_pointer(SEXP x, int pre, int deep, int pvec, void (*inspect_subtree)(SEXP, int, int, int))
87
+ {
88
+ Rprintf("Altrepped pointer [address:%p]\n", R_ExternalPtrAddr(R_altrep_data1(x)));
89
+ return TRUE;
90
+ }
91
+
92
+ SEXP duplicate_altrepped_pointer(SEXP altrepped_obj, Rboolean deep)
93
+ {
94
+ SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
95
+ SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
96
+ SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
97
+
98
+ if (!deep) {
99
+ R_set_altrep_data1(out, R_altrep_data1(altrepped_obj));
100
+ }
101
+
102
+ else {
103
+
104
+ SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
105
+
106
+ try {
107
+ std::unique_ptr<ModelOutputs> new_obj(new ModelOutputs());
108
+ ModelOutputs *cpp_ptr = (ModelOutputs*)R_ExternalPtrAddr(R_altrep_data1(altrepped_obj));
109
+ *new_obj = *cpp_ptr;
110
+
111
+ R_SetExternalPtrAddr(R_ptr, new_obj.get());
112
+ R_RegisterCFinalizerEx(R_ptr, delete_model_from_R_ptr, TRUE);
113
+ new_obj.release();
114
+ }
115
+
116
+ catch (const std::exception &ex) {
117
+ Rf_error("%s\n", ex.what());
118
+ }
119
+
120
+ R_set_altrep_data1(out, R_ptr);
121
+ UNPROTECT(1);
122
+ }
123
+
124
+ Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
125
+ Rf_setAttrib(out, R_NamesSymbol, R_ptr_class);
126
+ UNPROTECT(3);
127
+ return out;
128
+ }
129
+
130
+ SEXP serialize_altrepped_pointer(SEXP altrepped_obj)
93
131
  {
94
- return Rcpp::LogicalVector(R_ExternalPtrAddr(ptr_model) == NULL);
132
+ return serialize_OutlierTree((ModelOutputs*)R_ExternalPtrAddr(R_altrep_data1(altrepped_obj)));
133
+ }
134
+
135
+ SEXP deserialize_altrepped_pointer(SEXP cls, SEXP R_state)
136
+ {
137
+ SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
138
+ SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
139
+ SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
140
+ SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
141
+
142
+ try {
143
+ deserialize_OutlierTree(R_state, R_ptr);
144
+ }
145
+ catch (const std::exception &ex) {
146
+ Rf_error("%s\n", ex.what());
147
+ }
148
+
149
+ R_set_altrep_data1(out, R_ptr);
150
+ Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
151
+ Rf_setAttrib(out, R_ClassSymbol, R_ptr_class);
152
+
153
+ UNPROTECT(4);
154
+ return out;
155
+ }
156
+
157
+ SEXP get_altrepped_pointer(void *void_ptr)
158
+ {
159
+ SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
160
+ SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
161
+ SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
162
+ SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
163
+
164
+ std::unique_ptr<ModelOutputs> *ptr = (std::unique_ptr<ModelOutputs>*)void_ptr;
165
+ R_SetExternalPtrAddr(R_ptr, ptr->get());
166
+ R_RegisterCFinalizerEx(R_ptr, delete_model_from_R_ptr, TRUE);
167
+ ptr->release();
168
+
169
+ R_set_altrep_data1(out, R_ptr);
170
+ Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
171
+ Rf_setAttrib(out, R_ClassSymbol, R_ptr_class);
172
+
173
+ UNPROTECT(4);
174
+ return out;
175
+ }
176
+
177
+ // [[Rcpp::init]]
178
+ void init_altrepped_class(DllInfo* dll)
179
+ {
180
+ otree_altrepped_pointer_class = R_make_altlist_class("otree_altrepped_pointer_class", "outliertree", dll);
181
+ R_set_altrep_Length_method(otree_altrepped_pointer_class, altrepped_pointer_length);
182
+ R_set_altrep_Inspect_method(otree_altrepped_pointer_class, inspect_altrepped_pointer);
183
+ R_set_altrep_Serialized_state_method(otree_altrepped_pointer_class, serialize_altrepped_pointer);
184
+ R_set_altrep_Unserialize_method(otree_altrepped_pointer_class, deserialize_altrepped_pointer);
185
+ R_set_altrep_Duplicate_method(otree_altrepped_pointer_class, duplicate_altrepped_pointer);
186
+ R_set_altlist_Elt_method(otree_altrepped_pointer_class, get_element_from_altrepped_ptr);
187
+ }
188
+
189
+ SEXP safe_int(void *x)
190
+ {
191
+ return Rf_ScalarInteger(*(int*)x);
192
+ }
193
+
194
+ SEXP safe_bool(void *x)
195
+ {
196
+ return Rf_ScalarLogical(*(bool*)x);
95
197
  }
96
198
 
97
199
  double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_t n, int nthreads)
@@ -264,7 +366,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
264
366
  } else if (outl_col < (ncols_num + ncols_cat)) {
265
367
  if (outl_col < (ncols_num + ncols_cat_cat)) {
266
368
  tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
267
- for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
369
+ for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
268
370
  if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
269
371
  tmp_bool[cat] = true;
270
372
  }
@@ -308,7 +410,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
308
410
  }
309
411
  } else {
310
412
  tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
311
- for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
413
+ for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
312
414
  if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
313
415
  tmp_bool[cat] = true;
314
416
  }
@@ -1274,7 +1376,6 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
1274
1376
  {
1275
1377
  Rcpp::List outp = Rcpp::List::create(
1276
1378
  Rcpp::_["ptr_model"] = R_NilValue,
1277
- Rcpp::_["serialized_obj"] = R_NilValue,
1278
1379
  Rcpp::_["bounds"] = R_NilValue,
1279
1380
  Rcpp::_["outliers_info"] = R_NilValue,
1280
1381
  Rcpp::_["ntrees"] = R_NilValue,
@@ -1295,7 +1396,6 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
1295
1396
  double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
1296
1397
 
1297
1398
  std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
1298
- try {
1299
1399
  found_outliers = fit_outliers_models(*model_outputs,
1300
1400
  arr_num_C, ncols_numeric,
1301
1401
  INTEGER(arr_cat), ncols_categ, INTEGER(ncat),
@@ -1313,13 +1413,7 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
1313
1413
  &min_ts
1314
1414
  };
1315
1415
  outp["bounds"] = Rcpp::unwindProtect(extract_outl_bounds_wrapper, (void*)&temp);
1316
- outp["serialized_obj"] = serialize_OutlierTree(model_outputs.get());
1317
- } catch(std::bad_alloc &e) {
1318
- Rcpp::stop("Insufficient memory.\n");
1319
- }
1320
1416
 
1321
- if (!Rf_xlength(outp["serialized_obj"]))
1322
- return outp;
1323
1417
  if (return_outliers) {
1324
1418
  args_describe_outliers temp = {
1325
1419
  model_outputs.get(),
@@ -1350,8 +1444,7 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
1350
1444
  outp["nclust"] = Rcpp::unwindProtect(safe_int, (void*)&nclust_int);
1351
1445
  outp["found_outliers"] = Rcpp::unwindProtect(safe_bool, (void*)&found_outliers);
1352
1446
 
1353
- outp["ptr_model"] = Rcpp::unwindProtect(safe_XPtr, model_outputs.get());
1354
- model_outputs.release();
1447
+ outp["ptr_model"] = Rcpp::unwindProtect(get_altrepped_pointer, &model_outputs);
1355
1448
  return outp;
1356
1449
  }
1357
1450
 
@@ -11,7 +11,7 @@
11
11
  * arXiv preprint arXiv:2001.00636 (2020).
12
12
  *
13
13
  *
14
- * Copyright 2020 David Cortes.
14
+ * Copyright 2020-2024 David Cortes.
15
15
  *
16
16
  * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
17
  * such as R or Python.
@@ -97,6 +97,10 @@
97
97
  * Model parameter. Default is 2.67.
98
98
  * - z_outlier (in)
99
99
  * Model parameter. Default is 8.0. Must be greater than z_norm.
100
+ * - check_nonneg_outliers (in)
101
+ * Whether to add an extra check for possible outliers defined as having negative values while all
102
+ * the rest have positive values, regardless of how many standard deviations away they are.
103
+ * This is currently only done on the first cluster (no conditions on any variable).
100
104
  *
101
105
  * Returns:
102
106
  * - Whether there were any outliers detected.
@@ -107,7 +111,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
107
111
  size_t cluster_num, size_t tree_num, size_t tree_depth,
108
112
  bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
109
113
  double left_tail, double right_tail, double *restrict orig_x,
110
- double max_perc_outliers, double z_norm, double z_outlier)
114
+ double max_perc_outliers, double z_norm, double z_outlier,
115
+ bool check_nonneg_outliers)
111
116
  {
112
117
 
113
118
  /* TODO: this function could try to determine if the distribution is multimodal, and if so,
@@ -120,6 +125,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
120
125
  /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
121
126
  bool has_low_values = false;
122
127
  bool has_high_values = false;
128
+ bool has_outlier_neg_values = false;
123
129
  long double running_mean = 0;
124
130
  long double running_ssq = 0;
125
131
  long double mean_prev = 0;
@@ -127,14 +133,15 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
127
133
  double mean;
128
134
  double sd;
129
135
  size_t cnt;
130
- size_t tail_size = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
136
+ size_t tot = end - st + 1;
137
+ size_t tail_size = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
131
138
  size_t st_non_tail = st + tail_size;
132
139
  size_t end_non_tail = end - tail_size;
133
140
  size_t st_normals = 0;
134
141
  size_t end_normals = 0;
135
142
  double min_gap = z_outlier - z_norm;
136
143
 
137
- double curr_gap, next_gap, eps, lim_by_orig;
144
+ double curr_gap, next_gap, lim_by_orig;
138
145
 
139
146
  /* Note: there is no good reason and no theory behind these numbers.
140
147
  TODO: find a better way of setting this */
@@ -166,9 +173,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
166
173
  if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
167
174
  sd *= 0.5;
168
175
  }
169
- sd = std::fmax(sd, 1e-15);
170
176
  while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
171
- sd *= 4;
177
+ sd = std::nextafter(sd, std::numeric_limits<double>::infinity());
172
178
  cluster.cluster_mean = mean;
173
179
  cluster.cluster_sd = sd;
174
180
  cnt = end - st + 1;
@@ -212,10 +218,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
212
218
  cluster.display_lim_low = orig_x[ix_arr[row + 1]];
213
219
  cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
214
220
 
215
- eps = 1e-15;
216
221
  while (cluster.display_lim_low <= cluster.lower_lim) {
217
- cluster.lower_lim -= eps;
218
- eps *= 4;
222
+ cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
219
223
  }
220
224
  break;
221
225
  }
@@ -225,6 +229,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
225
229
  if (st_normals == 0) {
226
230
  has_low_values = false;
227
231
  } else {
232
+ assign_low_outliers:
228
233
  for (size_t row = st; row < st_normals; row++) {
229
234
 
230
235
  /* assign outlier if it's a better cluster than previously assigned */
@@ -254,7 +259,23 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
254
259
  }
255
260
  }
256
261
  }
257
- if (!has_low_values) {
262
+ /* special type of outliers not based on standard deviations */
263
+ if (check_nonneg_outliers && st_normals == 0 && tot >= 500 && orig_x[ix_arr[st]] < 0. && orig_x[ix_arr[end]] >= 2.) {
264
+ size_t max_neg_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);
265
+ if (orig_x[ix_arr[st + max_neg_outliers]] > 0.) {
266
+ size_t num_neg = 0;
267
+ for (size_t row = st; row < st + max_neg_outliers; row++) {
268
+ num_neg += orig_x[ix_arr[row]] < 0.;
269
+ }
270
+ st_normals = st + num_neg;
271
+ cluster.lower_lim = 0.;
272
+ cluster.display_lim_low = orig_x[ix_arr[st + st_normals]];
273
+ cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
274
+ has_outlier_neg_values = true;
275
+ goto assign_low_outliers;
276
+ }
277
+ }
278
+ if (!has_low_values && !has_outlier_neg_values) {
258
279
  cluster.perc_above = 1.0;
259
280
  if (!is_log_transf && !is_exp_transf) {
260
281
 
@@ -271,10 +292,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
271
292
  }
272
293
 
273
294
  if (cluster.lower_lim > -HUGE_VAL) {
274
- eps = 1e-15;
275
295
  while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
276
- cluster.lower_lim -= eps;
277
- eps *= 4.;
296
+ cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
278
297
  }
279
298
  }
280
299
 
@@ -324,10 +343,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
324
343
  cluster.display_lim_high = orig_x[ix_arr[row - 1]];
325
344
  cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
326
345
 
327
- eps = 1e-15;
328
346
  while (cluster.display_lim_high >= cluster.upper_lim) {
329
- cluster.upper_lim += eps;
330
- eps *= 4;
347
+ cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
331
348
  }
332
349
  break;
333
350
  }
@@ -384,10 +401,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
384
401
  }
385
402
 
386
403
  if (cluster.upper_lim < HUGE_VAL) {
387
- eps = 1e-15;
388
404
  while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
389
- cluster.upper_lim += eps;
390
- eps *= 4.;
405
+ cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
391
406
  }
392
407
  }
393
408
 
@@ -406,8 +421,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
406
421
  }
407
422
 
408
423
  /* save displayed statistics for cluster */
409
- if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
410
- size_t st_disp = has_low_values? st_normals : st;
424
+ if (has_high_values || has_low_values || is_log_transf || is_exp_transf || has_outlier_neg_values) {
425
+ size_t st_disp = (has_low_values || has_outlier_neg_values)? st_normals : st;
411
426
  size_t end_disp = has_high_values? end_normals : end;
412
427
  running_mean = 0;
413
428
  running_ssq = 0;
@@ -428,7 +443,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
428
443
  }
429
444
 
430
445
  /* report whether outliers were found or not */
431
- return has_low_values || has_high_values;
446
+ return has_low_values || has_high_values || has_outlier_neg_values;
432
447
  }
433
448
 
434
449
 
@@ -11,7 +11,7 @@
11
11
  * arXiv preprint arXiv:2001.00636 (2020).
12
12
  *
13
13
  *
14
- * Copyright 2020 David Cortes.
14
+ * Copyright 2020-2024 David Cortes.
15
15
  *
16
16
  * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
17
  * such as R or Python.
@@ -190,9 +190,9 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
190
190
  model_outputs.start_ix_cat_counts[0] = 0;
191
191
  if (tot_cols > ncols_numeric) {
192
192
  input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
193
- (bool*) input_data.skip_col.data() + ncols_numeric);
193
+ input_data.skip_col.data() + ncols_numeric);
194
194
  input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
195
- (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
195
+ input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
196
196
  } else {
197
197
  input_data.max_categ = 0;
198
198
  }
@@ -209,42 +209,39 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
209
209
 
210
210
  /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
211
211
  if (tot_cols > ncols_numeric) {
212
- #pragma omp parallel
212
+ #pragma omp parallel sections if(nthreads > 1)
213
213
  {
214
- #pragma omp sections
215
- {
216
214
 
217
- #pragma omp section
218
- {
219
- if (ncols_categ > 0) {
220
- calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
221
- input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
222
- (bool*) input_data.has_NA.data() + ncols_numeric, (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
223
- std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
224
-
225
- check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
226
- input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
227
- (bool*) input_data.skip_col.data() + input_data.ncols_numeric,
228
- std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
229
- }
215
+ #pragma omp section
216
+ {
217
+ if (ncols_categ > 0) {
218
+ calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
219
+ input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
220
+ input_data.has_NA.data() + ncols_numeric, input_data.skip_col.data() + input_data.ncols_numeric,
221
+ std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
222
+
223
+ check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
224
+ input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
225
+ input_data.skip_col.data() + input_data.ncols_numeric,
226
+ std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
227
+ }
230
228
 
231
229
 
232
- }
230
+ }
233
231
 
234
- #pragma omp section
235
- {
236
- if (ncols_ord > 0) {
237
- calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
238
- input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
239
- (bool*) input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
240
- (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
241
- std::max((int)1, nthreads - (int)input_data.ncols_categ) );
242
-
243
- check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
244
- ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
245
- (bool*) input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
246
- std::max((int)1, nthreads - (int)input_data.ncols_categ));
247
- }
232
+ #pragma omp section
233
+ {
234
+ if (ncols_ord > 0) {
235
+ calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
236
+ input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
237
+ input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
238
+ input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
239
+ std::max((int)1, nthreads - (int)input_data.ncols_categ) );
240
+
241
+ check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
242
+ ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
243
+ input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
244
+ std::max((int)1, nthreads - (int)input_data.ncols_categ));
248
245
  }
249
246
  }
250
247
 
@@ -260,13 +257,13 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
260
257
 
261
258
  /* for numerical columns, check if they have NAs or if total variance is too small */
262
259
  check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
263
- (bool*) input_data.has_NA.data(), (bool*) input_data.skip_col.data(),
260
+ input_data.has_NA.data(), input_data.skip_col.data(),
264
261
  model_outputs.min_decimals_col.data(), nthreads);
265
262
 
266
263
  /* determine an approximate size for the output clusters, and reserve memory right away */
267
264
  model_outputs.all_clusters.resize(tot_cols);
268
265
  model_outputs.all_trees.resize(tot_cols);
269
- #pragma omp parallel for shared(model_outputs, input_data, model_params, tot_cols)
266
+ #pragma omp parallel for num_threads(nthreads) shared(model_outputs, input_data, model_params, tot_cols)
270
267
  for (size_t_for col = 0; col < tot_cols; col++) {
271
268
  if (input_data.skip_col[col]) continue;
272
269
  if (cols_ignore != NULL && cols_ignore[col]) continue;
@@ -555,7 +552,8 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
555
552
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
556
553
  workspace.orig_mean, workspace.orig_sd,
557
554
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
558
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
555
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
556
+ true);
559
557
  workspace.tree->back().clusters.push_back(0);
560
558
 
561
559
  /* remove outliers if any were found */
@@ -639,7 +637,8 @@ void recursive_split_numeric(Workspace &workspace,
639
637
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
640
638
  workspace.orig_mean, workspace.orig_sd,
641
639
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
642
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
640
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
641
+ false);
643
642
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
644
643
 
645
644
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -666,7 +665,8 @@ void recursive_split_numeric(Workspace &workspace,
666
665
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
667
666
  workspace.orig_mean, workspace.orig_sd,
668
667
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
669
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
668
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
669
+ false);
670
670
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
671
671
 
672
672
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -690,7 +690,8 @@ void recursive_split_numeric(Workspace &workspace,
690
690
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
691
691
  workspace.orig_mean, workspace.orig_sd,
692
692
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
693
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
693
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
694
+ false);
694
695
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
695
696
 
696
697
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -753,7 +754,8 @@ void recursive_split_numeric(Workspace &workspace,
753
754
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
754
755
  workspace.orig_mean, workspace.orig_sd,
755
756
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
756
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
757
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
758
+ false);
757
759
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
758
760
 
759
761
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -780,7 +782,8 @@ void recursive_split_numeric(Workspace &workspace,
780
782
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
781
783
  workspace.orig_mean, workspace.orig_sd,
782
784
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
783
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
785
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
786
+ false);
784
787
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
785
788
 
786
789
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -808,7 +811,8 @@ void recursive_split_numeric(Workspace &workspace,
808
811
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
809
812
  workspace.orig_mean, workspace.orig_sd,
810
813
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
811
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
814
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
815
+ false);
812
816
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
813
817
 
814
818
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -874,7 +878,8 @@ void recursive_split_numeric(Workspace &workspace,
874
878
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
875
879
  workspace.orig_mean, workspace.orig_sd,
876
880
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
877
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
881
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
882
+ false);
878
883
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
879
884
 
880
885
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -901,7 +906,8 @@ void recursive_split_numeric(Workspace &workspace,
901
906
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
902
907
  workspace.orig_mean, workspace.orig_sd,
903
908
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
904
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
909
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
910
+ false);
905
911
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
906
912
 
907
913
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -931,7 +937,8 @@ void recursive_split_numeric(Workspace &workspace,
931
937
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
932
938
  workspace.orig_mean, workspace.orig_sd,
933
939
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
934
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
940
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
941
+ false);
935
942
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
936
943
 
937
944
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -39,7 +39,7 @@
39
39
  * at which position will the counts for a given column start. Note that NAs are stored as the last index in each
40
40
  * column, so each one needs one extra category
41
41
  */
42
- int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ)
42
+ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ)
43
43
  {
44
44
  for (size_t col = 0; col < ncols; col++) {
45
45
  max_categ = std::max(ncat[col], max_categ);
@@ -53,7 +53,7 @@ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t
53
53
  /* Save the counts of each category for each column in the array determined above */
54
54
  void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
55
55
  int categorical_data[], size_t ncols, size_t nrows,
56
- bool has_NA[], bool skip_col[], int nthreads)
56
+ char has_NA[], char skip_col[], int nthreads)
57
57
  {
58
58
  size_t col_st_offset;
59
59
  size_t col_stop;
@@ -80,7 +80,7 @@ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[],
80
80
 
81
81
  /* Check if some column has a large majority that would make any split fail to meet minimum sizes */
82
82
  void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
83
- size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads)
83
+ size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads)
84
84
  {
85
85
  size_t largest_cnt;
86
86
  #pragma omp parallel for num_threads(nthreads) private(largest_cnt) shared(ncols, nrows, ncat, cat_counts, start_ix_cat_counts, min_conditioned_size, skip_col)
@@ -127,8 +127,8 @@ void calculate_lowerlim_proportion(long double *restrict prop_small, long double
127
127
 
128
128
  /* Check if a numerical column has no variance (i.e. will not be splittable).
129
129
  Along the way, also record the number of decimals to display for this column. */
130
- void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
131
- bool skip_col[], int min_decimals[], int nthreads)
130
+ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
131
+ char skip_col[], int min_decimals[], int nthreads)
132
132
  {
133
133
  long double running_mean;
134
134
  long double mean_prev;
@@ -680,18 +680,6 @@ int decimals_diff(double val1, double val2)
680
680
  return (int) res;
681
681
  }
682
682
 
683
-
684
- /* Reason behind this function: Cython (as of v0.29) will not auto-deallocate
685
- structs which are part of a cdef'd class, which produces a memory leak
686
- but can be force-destructed. Unfortunately, Cython itself doesn't even
687
- allow calling destructors for structs, so it has to be done externally.
688
- This function should otherwise have no reason to exist.
689
- */
690
- void dealloc_ModelOutputs(ModelOutputs &model_outputs)
691
- {
692
- model_outputs.~ModelOutputs();
693
- }
694
-
695
683
  ModelOutputs get_empty_ModelOutputs()
696
684
  {
697
685
  return ModelOutputs();
@@ -733,7 +733,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
733
733
  size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
734
734
  bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
735
735
  double left_tail, double right_tail, double *restrict orig_x,
736
- double max_perc_outliers, double z_norm, double z_outlier);
736
+ double max_perc_outliers, double z_norm, double z_outlier,
737
+ bool check_nonneg_outliers);
737
738
  void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
738
739
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
739
740
  size_t *restrict outlier_depth, Cluster &cluster,
@@ -802,17 +803,17 @@ typedef struct {
802
803
  } RecursionState;
803
804
 
804
805
 
805
- int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ = 0);
806
+ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ = 0);
806
807
  void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
807
808
  int categorical_data[], size_t ncols, size_t nrows,
808
- bool has_NA[], bool skip_col[], int nthreads);
809
+ char has_NA[], char skip_col[], int nthreads);
809
810
  void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
810
- size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads);
811
+ size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads);
811
812
  void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
812
813
  size_t start_ix_cat_counts[], size_t cat_counts[],
813
814
  size_t ncols, size_t nrows, double z_norm, double z_tail);
814
- void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
815
- bool skip_col[], int min_decimals[], int nthreads);
815
+ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
816
+ char skip_col[], int min_decimals[], int nthreads);
816
817
  void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central);
817
818
  void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
818
819
  double z_norm, double max_perc_outliers,
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: outliertree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.0
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-06-13 00:00:00.000000000 Z
11
+ date: 2024-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 4.0.2
19
+ version: '4.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 4.0.2
26
+ version: '4.3'
27
27
  description:
28
28
  email: andrew@ankane.org
29
29
  executables: []
@@ -68,14 +68,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
68
68
  requirements:
69
69
  - - ">="
70
70
  - !ruby/object:Gem::Version
71
- version: '2.7'
71
+ version: '3.1'
72
72
  required_rubygems_version: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - ">="
75
75
  - !ruby/object:Gem::Version
76
76
  version: '0'
77
77
  requirements: []
78
- rubygems_version: 3.3.7
78
+ rubygems_version: 3.5.9
79
79
  signing_key:
80
80
  specification_version: 4
81
81
  summary: Explainable outlier/anomaly detection for Ruby