outliertree 0.3.0 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/ext/outliertree/ext.cpp +23 -0
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +12 -7
- data/vendor/outliertree/src/Makevars.in +1 -2
- data/vendor/outliertree/src/Makevars.win +1 -2
- data/vendor/outliertree/src/RcppExports.cpp +2 -23
- data/vendor/outliertree/src/Rwrapper.cpp +137 -44
- data/vendor/outliertree/src/fit_model.cpp +32 -35
- data/vendor/outliertree/src/misc.cpp +5 -17
- data/vendor/outliertree/src/outlier_tree.hpp +5 -5
- metadata +3 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 8ed2332f581bd9cf68d32fe19bdb89c58f268c6ea6feb30e34e8422595920dc3
|
4
|
+
data.tar.gz: d7cc844317fc023bee7d461838ae3fad3567268845013d2fc4e761f325934534
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4afaa3d661d2d225dc55d708ba8d263b0a017fe2a5388d77cb881d5765f9e9cc9ef682b52ef541b706cf00fc0cc7d3834f537a7900c30d3783df1821ef7d432e
|
7
|
+
data.tar.gz: 6c0feea6f531277847d84c76cbf146ca3d3821f9dd9635c8f7bf1fe592b1905f02db2d288027600310609d817d891129b510f5dad377946e735364705d0949cd
|
data/CHANGELOG.md
CHANGED
data/ext/outliertree/ext.cpp
CHANGED
@@ -30,6 +30,29 @@ namespace Rice::detail
|
|
30
30
|
}
|
31
31
|
};
|
32
32
|
|
33
|
+
template<>
|
34
|
+
class To_Ruby<std::vector<signed char>>
|
35
|
+
{
|
36
|
+
public:
|
37
|
+
VALUE convert(std::vector<signed char> const & x)
|
38
|
+
{
|
39
|
+
auto a = rb_ary_new2(x.size());
|
40
|
+
for (const auto& v : x) {
|
41
|
+
rb_ary_push(a, To_Ruby<signed char>().convert(v));
|
42
|
+
}
|
43
|
+
return a;
|
44
|
+
}
|
45
|
+
};
|
46
|
+
|
47
|
+
template<>
|
48
|
+
struct Type<std::vector<signed char>>
|
49
|
+
{
|
50
|
+
static bool verify()
|
51
|
+
{
|
52
|
+
return true;
|
53
|
+
}
|
54
|
+
};
|
55
|
+
|
33
56
|
template<>
|
34
57
|
struct Type<ColType>
|
35
58
|
{
|
data/lib/outliertree/version.rb
CHANGED
@@ -58,12 +58,18 @@ Procedure is described in more detail in [Explainable outlier detection through
|
|
58
58
|
# Installation
|
59
59
|
|
60
60
|
* For R:
|
61
|
+
|
62
|
+
**Note:** This package benefits from extra optimizations that aren't enabled by default for R packages. See [this guide](https://github.com/david-cortes/installing-optimized-libraries) for instructions on how to enable them.
|
63
|
+
|
61
64
|
```r
|
62
65
|
install.packages("outliertree")
|
63
66
|
```
|
64
67
|
|
65
68
|
|
66
69
|
* For Python:
|
70
|
+
|
71
|
+
**Note:** requires C/C++ compilers configured for Python. See [this guide](https://github.com/david-cortes/installing-optimized-libraries) for instructions.
|
72
|
+
|
67
73
|
```
|
68
74
|
pip install outliertree
|
69
75
|
```
|
@@ -77,22 +83,21 @@ pip install --no-use-pep517 outliertree
|
|
77
83
|
```
|
78
84
|
brew install libomp
|
79
85
|
```
|
80
|
-
And then reinstall this package: `pip install --force-reinstall outliertree`.
|
86
|
+
And then reinstall this package: `pip install --upgrade --no-deps --force-reinstall outliertree`.
|
81
87
|
|
82
88
|
** *
|
83
|
-
**IMPORTANT:** the setup script will try to add compilation flag `-march=native`. This instructs the compiler to tune the package for the CPU in which it is being installed, but the result might not be usable in other computers. If building a binary wheel of this package or putting it into a docker image which will be used in different machines, this can be overriden by manually supplying compilation `CFLAGS`
|
89
|
+
**IMPORTANT:** the setup script will try to add compilation flag `-march=native`. This instructs the compiler to tune the package for the CPU in which it is being installed (by e.g. using AVX instructions if available), but the result might not be usable in other computers. If building a binary wheel of this package or putting it into a docker image which will be used in different machines, this can be overridden either by (a) defining an environment variable `DONT_SET_MARCH=1`, or by (b) manually supplying compilation `CFLAGS` as an environment variable with something related to architecture. For maximum compatibility (but slowest speed), it's possible to do something like this:
|
84
90
|
|
85
91
|
```
|
86
|
-
export
|
87
|
-
export CXXFLAGS="-march=x86-64"
|
92
|
+
export DONT_SET_MARCH=1
|
88
93
|
pip install outliertree
|
89
94
|
```
|
90
95
|
|
91
|
-
or for
|
96
|
+
or, by specifying some compilation flag for architecture:
|
92
97
|
```
|
93
98
|
export CFLAGS="-march=x86-64"
|
94
99
|
export CXXFLAGS="-march=x86-64"
|
95
|
-
|
100
|
+
pip install outliertree
|
96
101
|
```
|
97
102
|
** *
|
98
103
|
|
@@ -134,7 +139,7 @@ summary(new_outliers)
|
|
134
139
|
```
|
135
140
|
(see documentation for more examples)
|
136
141
|
|
137
|
-
Example [RMarkdown](http://htmlpreview.github.io/?https://github.com/david-cortes/outliertree/blob/master/
|
142
|
+
Example [RMarkdown](http://htmlpreview.github.io/?https://github.com/david-cortes/outliertree/blob/master/vignettes/Explainable_Outlier_Detection_in_Titanic_dataset.html) using the Titanic dataset.
|
138
143
|
|
139
144
|
|
140
145
|
* For Python:
|
@@ -10,27 +10,6 @@ Rcpp::Rostream<true>& Rcpp::Rcout = Rcpp::Rcpp_cout_get();
|
|
10
10
|
Rcpp::Rostream<false>& Rcpp::Rcerr = Rcpp::Rcpp_cerr_get();
|
11
11
|
#endif
|
12
12
|
|
13
|
-
// deserialize_OutlierTree
|
14
|
-
SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj);
|
15
|
-
RcppExport SEXP _outliertree_deserialize_OutlierTree(SEXP srcSEXP, SEXP ptr_objSEXP) {
|
16
|
-
BEGIN_RCPP
|
17
|
-
Rcpp::RObject rcpp_result_gen;
|
18
|
-
Rcpp::traits::input_parameter< Rcpp::RawVector >::type src(srcSEXP);
|
19
|
-
Rcpp::traits::input_parameter< SEXP >::type ptr_obj(ptr_objSEXP);
|
20
|
-
rcpp_result_gen = Rcpp::wrap(deserialize_OutlierTree(src, ptr_obj));
|
21
|
-
return rcpp_result_gen;
|
22
|
-
END_RCPP
|
23
|
-
}
|
24
|
-
// check_null_ptr_model
|
25
|
-
Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model);
|
26
|
-
RcppExport SEXP _outliertree_check_null_ptr_model(SEXP ptr_modelSEXP) {
|
27
|
-
BEGIN_RCPP
|
28
|
-
Rcpp::RObject rcpp_result_gen;
|
29
|
-
Rcpp::traits::input_parameter< SEXP >::type ptr_model(ptr_modelSEXP);
|
30
|
-
rcpp_result_gen = Rcpp::wrap(check_null_ptr_model(ptr_model));
|
31
|
-
return rcpp_result_gen;
|
32
|
-
END_RCPP
|
33
|
-
}
|
34
13
|
// fit_OutlierTree
|
35
14
|
Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric, Rcpp::IntegerVector arr_cat, size_t ncols_categ, Rcpp::IntegerVector ncat, Rcpp::IntegerVector arr_ord, size_t ncols_ord, Rcpp::IntegerVector ncat_ord, size_t nrows, Rcpp::LogicalVector cols_ignore_r, int nthreads, bool categ_as_bin, bool ord_as_bin, bool cat_bruteforce_subset, bool categ_from_maj, bool take_mid, size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ, double min_gain, bool follow_all, bool gain_as_pct, double z_norm, double z_outlier, bool return_outliers, Rcpp::ListOf<Rcpp::StringVector> cat_levels, Rcpp::ListOf<Rcpp::StringVector> ord_levels, Rcpp::StringVector colnames_num, Rcpp::StringVector colnames_cat, Rcpp::StringVector colnames_ord, Rcpp::NumericVector min_date, Rcpp::NumericVector min_ts);
|
36
15
|
RcppExport SEXP _outliertree_fit_OutlierTree(SEXP arr_numSEXP, SEXP ncols_numericSEXP, SEXP arr_catSEXP, SEXP ncols_categSEXP, SEXP ncatSEXP, SEXP arr_ordSEXP, SEXP ncols_ordSEXP, SEXP ncat_ordSEXP, SEXP nrowsSEXP, SEXP cols_ignore_rSEXP, SEXP nthreadsSEXP, SEXP categ_as_binSEXP, SEXP ord_as_binSEXP, SEXP cat_bruteforce_subsetSEXP, SEXP categ_from_majSEXP, SEXP take_midSEXP, SEXP max_depthSEXP, SEXP max_perc_outliersSEXP, SEXP min_size_numericSEXP, SEXP min_size_categSEXP, SEXP min_gainSEXP, SEXP follow_allSEXP, SEXP gain_as_pctSEXP, SEXP z_normSEXP, SEXP z_outlierSEXP, SEXP return_outliersSEXP, SEXP cat_levelsSEXP, SEXP ord_levelsSEXP, SEXP colnames_numSEXP, SEXP colnames_catSEXP, SEXP colnames_ordSEXP, SEXP min_dateSEXP, SEXP min_tsSEXP) {
|
@@ -119,8 +98,6 @@ END_RCPP
|
|
119
98
|
}
|
120
99
|
|
121
100
|
static const R_CallMethodDef CallEntries[] = {
|
122
|
-
{"_outliertree_deserialize_OutlierTree", (DL_FUNC) &_outliertree_deserialize_OutlierTree, 2},
|
123
|
-
{"_outliertree_check_null_ptr_model", (DL_FUNC) &_outliertree_check_null_ptr_model, 1},
|
124
101
|
{"_outliertree_fit_OutlierTree", (DL_FUNC) &_outliertree_fit_OutlierTree, 33},
|
125
102
|
{"_outliertree_predict_OutlierTree", (DL_FUNC) &_outliertree_predict_OutlierTree, 13},
|
126
103
|
{"_outliertree_check_few_values", (DL_FUNC) &_outliertree_check_few_values, 4},
|
@@ -128,7 +105,9 @@ static const R_CallMethodDef CallEntries[] = {
|
|
128
105
|
{NULL, NULL, 0}
|
129
106
|
};
|
130
107
|
|
108
|
+
void init_altrepped_class(DllInfo* dll);
|
131
109
|
// Library initialization entry point: registers the .Call routines listed in
// CallEntries, turns off dynamic symbol lookup for this DLL, and then sets up
// the package's ALTREP class through init_altrepped_class().
RcppExport void R_init_outliertree(DllInfo *dll)
{
    R_registerRoutines(dll, NULL, CallEntries, NULL, NULL);
    R_useDynamicSymbols(dll, FALSE);

    // must receive the same DllInfo so the ALTREP class is tied to this library
    init_altrepped_class(dll);
}
|
@@ -2,8 +2,8 @@
|
|
2
2
|
|
3
3
|
#include <Rcpp.h>
|
4
4
|
#include <Rcpp/unwindProtect.h>
|
5
|
+
#include <R_ext/Altrep.h>
|
5
6
|
// [[Rcpp::plugins(cpp11)]]
|
6
|
-
// [[Rcpp::plugins(unwindProtect)]]
|
7
7
|
|
8
8
|
/* This is to serialize the model objects */
|
9
9
|
// [[Rcpp::depends(Rcereal)]]
|
@@ -16,18 +16,26 @@
|
|
16
16
|
/* This is the package's header */
|
17
17
|
#include "outlier_tree.hpp"
|
18
18
|
|
19
|
+
/* Finalizer for an R external pointer that holds a ModelOutputs object:
   deletes the C++ object and then resets/clears the pointer so that no
   dangling address is left behind in the R object. */
void delete_model_from_R_ptr(SEXP R_ptr)
{
    delete static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));
    R_SetExternalPtrAddr(R_ptr, nullptr);
    R_ClearExternalPtr(R_ptr);
}
|
26
|
+
|
19
27
|
/* Allocates an R raw vector whose length is read from `data`, which must point
   to a size_t. The void* signature lets it be used as a callback for
   Rcpp::unwindProtect. Stops with an error if the requested length exceeds
   what R_xlen_t can represent. */
SEXP alloc_RawVec(void *data)
{
    const size_t n_bytes = *(size_t*)data;
    const size_t max_len = (size_t)std::numeric_limits<R_xlen_t>::max();
    if (n_bytes > max_len) {
        Rcpp::stop("Resulting model is too large for R to handle.");
    }
    return Rf_allocVector(RAWSXP, n_bytes);
}
|
26
34
|
|
27
35
|
/* for model serialization and re-usage in R */
|
28
36
|
/* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
|
29
37
|
/* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
|
30
|
-
|
38
|
+
SEXP serialize_OutlierTree(ModelOutputs *model_outputs)
|
31
39
|
{
|
32
40
|
std::stringstream ss;
|
33
41
|
{
|
@@ -37,35 +45,20 @@ Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
|
37
45
|
ss.seekg(0, ss.end);
|
38
46
|
std::stringstream::pos_type vec_size = ss.tellg();
|
39
47
|
if (vec_size <= 0) {
|
40
|
-
|
41
|
-
return Rcpp::RawVector();
|
48
|
+
Rf_error("Error: model is too big to serialize, resulting object will not be usable.\n");
|
42
49
|
}
|
43
50
|
size_t vec_size_ = (size_t)vec_size;
|
44
|
-
|
45
|
-
if (!retval.size())
|
46
|
-
return retval;
|
51
|
+
SEXP retval = PROTECT(Rcpp::unwindProtect(alloc_RawVec, (void*)&vec_size_));
|
47
52
|
ss.seekg(0, ss.beg);
|
48
|
-
ss.read(reinterpret_cast<char*>(RAW(retval)),
|
53
|
+
ss.read(reinterpret_cast<char*>(RAW(retval)), vec_size_);
|
54
|
+
UNPROTECT(1);
|
49
55
|
return retval;
|
50
56
|
}
|
51
57
|
|
52
|
-
SEXP
|
53
|
-
{
|
54
|
-
return Rcpp::XPtr<ModelOutputs>((ModelOutputs*)model_ptr, true);
|
55
|
-
}
|
56
|
-
|
57
|
-
void R_delete_model(SEXP R_ptr)
|
58
|
-
{
|
59
|
-
ModelOutputs *model = static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));
|
60
|
-
delete model;
|
61
|
-
R_ClearExternalPtr(R_ptr);
|
62
|
-
}
|
63
|
-
|
64
|
-
// [[Rcpp::export(rng = false)]]
|
65
|
-
SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj)
|
58
|
+
SEXP deserialize_OutlierTree(SEXP src, SEXP ptr_obj)
|
66
59
|
{
|
67
60
|
std::stringstream ss;
|
68
|
-
ss.write(reinterpret_cast<char*>(RAW(src)), src
|
61
|
+
ss.write(reinterpret_cast<char*>(RAW(src)), Rf_xlength(src));
|
69
62
|
ss.seekg(0, ss.beg);
|
70
63
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
71
64
|
{
|
@@ -73,25 +66,134 @@ SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj)
|
|
73
66
|
iarchive(*model_outputs);
|
74
67
|
}
|
75
68
|
R_SetExternalPtrAddr(ptr_obj, model_outputs.get());
|
76
|
-
R_RegisterCFinalizerEx(ptr_obj,
|
69
|
+
R_RegisterCFinalizerEx(ptr_obj, delete_model_from_R_ptr, TRUE);
|
77
70
|
model_outputs.release();
|
78
71
|
return R_NilValue;
|
79
72
|
}
|
80
73
|
|
81
|
-
|
74
|
+
static R_altrep_class_t otree_altrepped_pointer_class;
|
75
|
+
|
76
|
+
/* ALTREP Length method for the model handle: the reported length is always one,
   regardless of the object passed in. */
R_xlen_t altrepped_pointer_length(SEXP obj)
{
    const R_xlen_t handle_length = 1;
    return handle_length;
}
|
85
80
|
|
86
|
-
SEXP
|
81
|
+
/* ALTLIST element accessor for the model handle: returns the object stored in
   the ALTREP 'data1' slot. The index argument is not consulted. */
SEXP get_element_from_altrepped_ptr(SEXP R_altrepped_obj, R_xlen_t idx)
{
    SEXP held_ptr = R_altrep_data1(R_altrepped_obj);
    return held_ptr;
}
|
90
85
|
|
91
|
-
|
92
|
-
|
86
|
+
/* ALTREP Inspect method for the model handle: prints the address held by the
   external pointer in the 'data1' slot and returns TRUE. The remaining
   arguments are not used. */
Rboolean inspect_altrepped_pointer(SEXP x, int pre, int deep, int pvec, void (*inspect_subtree)(SEXP, int, int, int))
{
    void *address = R_ExternalPtrAddr(R_altrep_data1(x));
    Rprintf("Altrepped pointer [address:%p]\n", address);
    return TRUE;
}
|
91
|
+
|
92
|
+
/* ALTREP Duplicate method for the model handle.

   altrepped_obj : ALTREP object whose 'data1' slot holds an external pointer
                   to a ModelOutputs.
   deep          : when false, the copy shares the same external pointer as the
                   source; when true, the ModelOutputs object is copied into a
                   new external pointer with its own finalizer.

   Returns a new ALTREP object of the same class carrying names "ptr" and class
   "otree_altrepped_handle", matching what get_altrepped_pointer() and
   deserialize_altrepped_pointer() produce. C++ exceptions raised while copying
   are converted into R errors. */
SEXP duplicate_altrepped_pointer(SEXP altrepped_obj, Rboolean deep)
{
    SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
    SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
    SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));

    if (!deep) {
        /* shallow copy: both handles refer to the same external pointer */
        R_set_altrep_data1(out, R_altrep_data1(altrepped_obj));
    }

    else {

        SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));

        try {
            ModelOutputs *cpp_ptr = (ModelOutputs*)R_ExternalPtrAddr(R_altrep_data1(altrepped_obj));

            /* A handle with a null address has nothing to copy: keep the new
               pointer null instead of dereferencing a null model. */
            if (cpp_ptr != nullptr) {
                std::unique_ptr<ModelOutputs> new_obj(new ModelOutputs());
                *new_obj = *cpp_ptr;

                /* Ownership is handed to R only after the finalizer is registered,
                   so the unique_ptr still frees the copy if anything above fails. */
                R_SetExternalPtrAddr(R_ptr, new_obj.get());
                R_RegisterCFinalizerEx(R_ptr, delete_model_from_R_ptr, TRUE);
                new_obj.release();
            }
        }

        catch (const std::exception &ex) {
            Rf_error("%s\n", ex.what());
        }

        R_set_altrep_data1(out, R_ptr);
        UNPROTECT(1);
    }

    Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
    /* fix: the class string belongs in the class attribute; it was previously
       written to the names attribute, overwriting "ptr" and leaving no class */
    Rf_setAttrib(out, R_ClassSymbol, R_ptr_class);
    UNPROTECT(3);
    return out;
}
|
129
|
+
|
130
|
+
/* ALTREP Serialized_state method for the model handle: takes the ModelOutputs
   address from the external pointer in the 'data1' slot and returns whatever
   serialize_OutlierTree() produces for it. */
SEXP serialize_altrepped_pointer(SEXP altrepped_obj)
{
    SEXP R_ptr = R_altrep_data1(altrepped_obj);
    ModelOutputs *model = (ModelOutputs*)R_ExternalPtrAddr(R_ptr);
    return serialize_OutlierTree(model);
}
|
134
|
+
|
135
|
+
/* ALTREP Unserialize method for the model handle.

   cls     : ALTREP class object (not used here).
   R_state : serialized state, passed on to deserialize_OutlierTree().

   Builds a fresh external pointer, lets deserialize_OutlierTree() populate it
   from `R_state`, and wraps it in a new ALTREP object with names "ptr" and
   class "otree_altrepped_handle". C++ exceptions become R errors. */
SEXP deserialize_altrepped_pointer(SEXP cls, SEXP R_state)
{
    SEXP names_attr = PROTECT(Rf_mkString("ptr"));
    SEXP class_attr = PROTECT(Rf_mkString("otree_altrepped_handle"));
    SEXP ext_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
    SEXP handle = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));

    try {
        deserialize_OutlierTree(R_state, ext_ptr);
    }
    catch (const std::exception &ex) {
        Rf_error("%s\n", ex.what());
    }

    R_set_altrep_data1(handle, ext_ptr);
    Rf_setAttrib(handle, R_NamesSymbol, names_attr);
    Rf_setAttrib(handle, R_ClassSymbol, class_attr);

    /* four objects were protected above */
    UNPROTECT(4);
    return handle;
}
|
156
|
+
|
157
|
+
/* Wraps a fitted model into an ALTREP handle.

   void_ptr : must point to a std::unique_ptr<ModelOutputs>; the void* signature
              lets this be used as a callback for Rcpp::unwindProtect.

   The model's address is stored in a new external pointer with
   delete_model_from_R_ptr as finalizer, and only then is ownership released
   from the unique_ptr. The result carries names "ptr" and class
   "otree_altrepped_handle". */
SEXP get_altrepped_pointer(void *void_ptr)
{
    SEXP names_attr = PROTECT(Rf_mkString("ptr"));
    SEXP class_attr = PROTECT(Rf_mkString("otree_altrepped_handle"));
    SEXP ext_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
    SEXP handle = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));

    std::unique_ptr<ModelOutputs> *owner = (std::unique_ptr<ModelOutputs>*)void_ptr;
    R_SetExternalPtrAddr(ext_ptr, owner->get());
    R_RegisterCFinalizerEx(ext_ptr, delete_model_from_R_ptr, TRUE);
    /* release last: until here the unique_ptr is still responsible for the model */
    owner->release();

    R_set_altrep_data1(handle, ext_ptr);
    Rf_setAttrib(handle, R_NamesSymbol, names_attr);
    Rf_setAttrib(handle, R_ClassSymbol, class_attr);

    /* four objects were protected above */
    UNPROTECT(4);
    return handle;
}
|
176
|
+
|
177
|
+
// [[Rcpp::init]]
|
178
|
+
void init_altrepped_class(DllInfo* dll)
{
    /* Create the ALTLIST class used for model handles; it must exist before
       any method can be attached to it. */
    otree_altrepped_pointer_class = R_make_altlist_class("otree_altrepped_pointer_class", "outliertree", dll);

    /* element access and length */
    R_set_altlist_Elt_method(otree_altrepped_pointer_class, get_element_from_altrepped_ptr);
    R_set_altrep_Length_method(otree_altrepped_pointer_class, altrepped_pointer_length);

    /* copying and (de)serialization */
    R_set_altrep_Duplicate_method(otree_altrepped_pointer_class, duplicate_altrepped_pointer);
    R_set_altrep_Serialized_state_method(otree_altrepped_pointer_class, serialize_altrepped_pointer);
    R_set_altrep_Unserialize_method(otree_altrepped_pointer_class, deserialize_altrepped_pointer);

    /* debugging output */
    R_set_altrep_Inspect_method(otree_altrepped_pointer_class, inspect_altrepped_pointer);
}
|
188
|
+
|
189
|
+
/* Builds a length-one R integer from the int that `x` points to. The void*
   signature lets it be used as a callback for Rcpp::unwindProtect. */
SEXP safe_int(void *x)
{
    const int value = *(int*)x;
    return Rf_ScalarInteger(value);
}
|
193
|
+
|
194
|
+
/* Builds a length-one R logical from the bool that `x` points to. The void*
   signature lets it be used as a callback for Rcpp::unwindProtect. */
SEXP safe_bool(void *x)
{
    const bool flag = *(bool*)x;
    return Rf_ScalarLogical(flag);
}
|
96
198
|
|
97
199
|
double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_t n, int nthreads)
|
@@ -264,7 +366,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
264
366
|
} else if (outl_col < (ncols_num + ncols_cat)) {
|
265
367
|
if (outl_col < (ncols_num + ncols_cat_cat)) {
|
266
368
|
tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
|
267
|
-
for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
|
369
|
+
for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
|
268
370
|
if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
|
269
371
|
tmp_bool[cat] = true;
|
270
372
|
}
|
@@ -308,7 +410,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
308
410
|
}
|
309
411
|
} else {
|
310
412
|
tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
|
311
|
-
for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
|
413
|
+
for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
|
312
414
|
if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
|
313
415
|
tmp_bool[cat] = true;
|
314
416
|
}
|
@@ -1274,7 +1376,6 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1274
1376
|
{
|
1275
1377
|
Rcpp::List outp = Rcpp::List::create(
|
1276
1378
|
Rcpp::_["ptr_model"] = R_NilValue,
|
1277
|
-
Rcpp::_["serialized_obj"] = R_NilValue,
|
1278
1379
|
Rcpp::_["bounds"] = R_NilValue,
|
1279
1380
|
Rcpp::_["outliers_info"] = R_NilValue,
|
1280
1381
|
Rcpp::_["ntrees"] = R_NilValue,
|
@@ -1295,7 +1396,6 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1295
1396
|
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1296
1397
|
|
1297
1398
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
1298
|
-
try {
|
1299
1399
|
found_outliers = fit_outliers_models(*model_outputs,
|
1300
1400
|
arr_num_C, ncols_numeric,
|
1301
1401
|
INTEGER(arr_cat), ncols_categ, INTEGER(ncat),
|
@@ -1313,13 +1413,7 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1313
1413
|
&min_ts
|
1314
1414
|
};
|
1315
1415
|
outp["bounds"] = Rcpp::unwindProtect(extract_outl_bounds_wrapper, (void*)&temp);
|
1316
|
-
outp["serialized_obj"] = serialize_OutlierTree(model_outputs.get());
|
1317
|
-
} catch(std::bad_alloc &e) {
|
1318
|
-
Rcpp::stop("Insufficient memory.\n");
|
1319
|
-
}
|
1320
1416
|
|
1321
|
-
if (!Rf_xlength(outp["serialized_obj"]))
|
1322
|
-
return outp;
|
1323
1417
|
if (return_outliers) {
|
1324
1418
|
args_describe_outliers temp = {
|
1325
1419
|
model_outputs.get(),
|
@@ -1350,8 +1444,7 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1350
1444
|
outp["nclust"] = Rcpp::unwindProtect(safe_int, (void*)&nclust_int);
|
1351
1445
|
outp["found_outliers"] = Rcpp::unwindProtect(safe_bool, (void*)&found_outliers);
|
1352
1446
|
|
1353
|
-
outp["ptr_model"] = Rcpp::unwindProtect(
|
1354
|
-
model_outputs.release();
|
1447
|
+
outp["ptr_model"] = Rcpp::unwindProtect(get_altrepped_pointer, &model_outputs);
|
1355
1448
|
return outp;
|
1356
1449
|
}
|
1357
1450
|
|
@@ -190,9 +190,9 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
190
190
|
model_outputs.start_ix_cat_counts[0] = 0;
|
191
191
|
if (tot_cols > ncols_numeric) {
|
192
192
|
input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
|
193
|
-
|
193
|
+
input_data.skip_col.data() + ncols_numeric);
|
194
194
|
input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
|
195
|
-
|
195
|
+
input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
|
196
196
|
} else {
|
197
197
|
input_data.max_categ = 0;
|
198
198
|
}
|
@@ -209,42 +209,39 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
209
209
|
|
210
210
|
/* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
|
211
211
|
if (tot_cols > ncols_numeric) {
|
212
|
-
#pragma omp parallel
|
212
|
+
#pragma omp parallel sections if(nthreads > 1)
|
213
213
|
{
|
214
|
-
#pragma omp sections
|
215
|
-
{
|
216
214
|
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
224
|
-
|
225
|
-
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
215
|
+
#pragma omp section
|
216
|
+
{
|
217
|
+
if (ncols_categ > 0) {
|
218
|
+
calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
|
219
|
+
input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
|
220
|
+
input_data.has_NA.data() + ncols_numeric, input_data.skip_col.data() + input_data.ncols_numeric,
|
221
|
+
std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
|
222
|
+
|
223
|
+
check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
|
224
|
+
input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
|
225
|
+
input_data.skip_col.data() + input_data.ncols_numeric,
|
226
|
+
std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
|
227
|
+
}
|
230
228
|
|
231
229
|
|
232
|
-
|
230
|
+
}
|
233
231
|
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
|
242
|
-
|
243
|
-
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
}
|
232
|
+
#pragma omp section
|
233
|
+
{
|
234
|
+
if (ncols_ord > 0) {
|
235
|
+
calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
|
236
|
+
input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
|
237
|
+
input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
238
|
+
input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
239
|
+
std::max((int)1, nthreads - (int)input_data.ncols_categ) );
|
240
|
+
|
241
|
+
check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
|
242
|
+
ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
|
243
|
+
input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
244
|
+
std::max((int)1, nthreads - (int)input_data.ncols_categ));
|
248
245
|
}
|
249
246
|
}
|
250
247
|
|
@@ -260,13 +257,13 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
260
257
|
|
261
258
|
/* for numerical columns, check if they have NAs or if total variance is too small */
|
262
259
|
check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
|
263
|
-
|
260
|
+
input_data.has_NA.data(), input_data.skip_col.data(),
|
264
261
|
model_outputs.min_decimals_col.data(), nthreads);
|
265
262
|
|
266
263
|
/* determine an approximate size for the output clusters, and reserve memory right away */
|
267
264
|
model_outputs.all_clusters.resize(tot_cols);
|
268
265
|
model_outputs.all_trees.resize(tot_cols);
|
269
|
-
#pragma omp parallel for shared(model_outputs, input_data, model_params, tot_cols)
|
266
|
+
#pragma omp parallel for num_threads(nthreads) shared(model_outputs, input_data, model_params, tot_cols)
|
270
267
|
for (size_t_for col = 0; col < tot_cols; col++) {
|
271
268
|
if (input_data.skip_col[col]) continue;
|
272
269
|
if (cols_ignore != NULL && cols_ignore[col]) continue;
|
@@ -39,7 +39,7 @@
|
|
39
39
|
* at which position will the counts for a given column start. Note that NAs are stored as the last index in each
|
40
40
|
* column, so each one needs one extra category
|
41
41
|
*/
|
42
|
-
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols,
|
42
|
+
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ)
|
43
43
|
{
|
44
44
|
for (size_t col = 0; col < ncols; col++) {
|
45
45
|
max_categ = std::max(ncat[col], max_categ);
|
@@ -53,7 +53,7 @@ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t
|
|
53
53
|
/* Save the counts of each category for each column in the array determined above */
|
54
54
|
void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
55
55
|
int categorical_data[], size_t ncols, size_t nrows,
|
56
|
-
|
56
|
+
char has_NA[], char skip_col[], int nthreads)
|
57
57
|
{
|
58
58
|
size_t col_st_offset;
|
59
59
|
size_t col_stop;
|
@@ -80,7 +80,7 @@ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[],
|
|
80
80
|
|
81
81
|
/* Check if some column has a large majority that would make any split fail to meet minimum sizes */
|
82
82
|
void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
83
|
-
size_t ncols, size_t min_conditioned_size, size_t nrows,
|
83
|
+
size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads)
|
84
84
|
{
|
85
85
|
size_t largest_cnt;
|
86
86
|
#pragma omp parallel for num_threads(nthreads) private(largest_cnt) shared(ncols, nrows, ncat, cat_counts, start_ix_cat_counts, min_conditioned_size, skip_col)
|
@@ -127,8 +127,8 @@ void calculate_lowerlim_proportion(long double *restrict prop_small, long double
|
|
127
127
|
|
128
128
|
/* Check if a numerical column has no variance (i.e. will not be splittable).
|
129
129
|
Along the way, also record the number of decimals to display for this column. */
|
130
|
-
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows,
|
131
|
-
|
130
|
+
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
|
131
|
+
char skip_col[], int min_decimals[], int nthreads)
|
132
132
|
{
|
133
133
|
long double running_mean;
|
134
134
|
long double mean_prev;
|
@@ -680,18 +680,6 @@ int decimals_diff(double val1, double val2)
|
|
680
680
|
return (int) res;
|
681
681
|
}
|
682
682
|
|
683
|
-
|
684
|
-
/* Reason behind this function: Cython (as of v0.29) will not auto-deallocate
|
685
|
-
structs which are part of a cdef'd class, which produces a memory leak
|
686
|
-
but can be force-destructed. Unfortunately, Cython itself doesn't even
|
687
|
-
allow calling destructors for structs, so it has to be done externally.
|
688
|
-
This function should otherwise have no reason to exist.
|
689
|
-
*/
|
690
|
-
void dealloc_ModelOutputs(ModelOutputs &model_outputs)
|
691
|
-
{
|
692
|
-
model_outputs.~ModelOutputs();
|
693
|
-
}
|
694
|
-
|
695
683
|
ModelOutputs get_empty_ModelOutputs()
|
696
684
|
{
|
697
685
|
return ModelOutputs();
|
@@ -802,17 +802,17 @@ typedef struct {
|
|
802
802
|
} RecursionState;
|
803
803
|
|
804
804
|
|
805
|
-
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols,
|
805
|
+
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ = 0);
|
806
806
|
void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
807
807
|
int categorical_data[], size_t ncols, size_t nrows,
|
808
|
-
|
808
|
+
char has_NA[], char skip_col[], int nthreads);
|
809
809
|
void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
810
|
-
size_t ncols, size_t min_conditioned_size, size_t nrows,
|
810
|
+
size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads);
|
811
811
|
void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
|
812
812
|
size_t start_ix_cat_counts[], size_t cat_counts[],
|
813
813
|
size_t ncols, size_t nrows, double z_norm, double z_tail);
|
814
|
-
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows,
|
815
|
-
|
814
|
+
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
|
815
|
+
char skip_col[], int min_decimals[], int nthreads);
|
816
816
|
void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central);
|
817
817
|
void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
|
818
818
|
double z_norm, double max_perc_outliers,
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: outliertree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-12-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -75,7 +75,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
75
75
|
- !ruby/object:Gem::Version
|
76
76
|
version: '0'
|
77
77
|
requirements: []
|
78
|
-
rubygems_version: 3.
|
78
|
+
rubygems_version: 3.4.10
|
79
79
|
signing_key:
|
80
80
|
specification_version: 4
|
81
81
|
summary: Explainable outlier/anomaly detection for Ruby
|