outliertree 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +23 -0
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +83 -41
- data/vendor/outliertree/src/Makevars.in +3 -0
- data/vendor/outliertree/src/Makevars.win +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +17 -27
- data/vendor/outliertree/src/Rwrapper.cpp +354 -62
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +525 -331
- data/vendor/outliertree/src/misc.cpp +166 -17
- data/vendor/outliertree/src/outlier_tree.hpp +164 -56
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -1,4 +1,8 @@
|
|
1
|
+
#ifdef _FOR_R
|
2
|
+
|
1
3
|
#include <Rcpp.h>
|
4
|
+
#include <Rcpp/unwindProtect.h>
|
5
|
+
#include <R_ext/Altrep.h>
|
2
6
|
// [[Rcpp::plugins(cpp11)]]
|
3
7
|
|
4
8
|
/* This is to serialize the model objects */
|
@@ -7,15 +11,31 @@
|
|
7
11
|
#include <cereal/types/vector.hpp>
|
8
12
|
#include <sstream>
|
9
13
|
#include <string>
|
14
|
+
#include <limits>
|
10
15
|
|
11
16
|
/* This is the package's header */
|
12
17
|
#include "outlier_tree.hpp"
|
13
18
|
|
19
|
+
void delete_model_from_R_ptr(SEXP R_ptr)
|
20
|
+
{
|
21
|
+
ModelOutputs *model = static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));
|
22
|
+
delete model;
|
23
|
+
R_SetExternalPtrAddr(R_ptr, nullptr);
|
24
|
+
R_ClearExternalPtr(R_ptr);
|
25
|
+
}
|
26
|
+
|
27
|
+
SEXP alloc_RawVec(void *data)
|
28
|
+
{
|
29
|
+
size_t vec_size = *(size_t*)data;
|
30
|
+
if (vec_size > (size_t)std::numeric_limits<R_xlen_t>::max())
|
31
|
+
Rcpp::stop("Resulting model is too large for R to handle.");
|
32
|
+
return Rf_allocVector(RAWSXP, vec_size);
|
33
|
+
}
|
34
|
+
|
14
35
|
/* for model serialization and re-usage in R */
|
15
36
|
/* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
|
16
37
|
/* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
|
17
|
-
|
18
|
-
Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
38
|
+
SEXP serialize_OutlierTree(ModelOutputs *model_outputs)
|
19
39
|
{
|
20
40
|
std::stringstream ss;
|
21
41
|
{
|
@@ -23,30 +43,157 @@ Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
|
23
43
|
oarchive(*model_outputs);
|
24
44
|
}
|
25
45
|
ss.seekg(0, ss.end);
|
26
|
-
|
46
|
+
std::stringstream::pos_type vec_size = ss.tellg();
|
47
|
+
if (vec_size <= 0) {
|
48
|
+
Rf_error("Error: model is too big to serialize, resulting object will not be usable.\n");
|
49
|
+
}
|
50
|
+
size_t vec_size_ = (size_t)vec_size;
|
51
|
+
SEXP retval = PROTECT(Rcpp::unwindProtect(alloc_RawVec, (void*)&vec_size_));
|
27
52
|
ss.seekg(0, ss.beg);
|
28
|
-
ss.read(reinterpret_cast<char*>(
|
53
|
+
ss.read(reinterpret_cast<char*>(RAW(retval)), vec_size_);
|
54
|
+
UNPROTECT(1);
|
29
55
|
return retval;
|
30
56
|
}
|
31
57
|
|
32
|
-
|
33
|
-
SEXP deserialize_OutlierTree(Rcpp::RawVector src)
|
58
|
+
SEXP deserialize_OutlierTree(SEXP src, SEXP ptr_obj)
|
34
59
|
{
|
35
60
|
std::stringstream ss;
|
36
|
-
ss.write(reinterpret_cast<char*>(
|
61
|
+
ss.write(reinterpret_cast<char*>(RAW(src)), Rf_xlength(src));
|
37
62
|
ss.seekg(0, ss.beg);
|
38
63
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
39
64
|
{
|
40
65
|
cereal::BinaryInputArchive iarchive(ss);
|
41
66
|
iarchive(*model_outputs);
|
42
67
|
}
|
43
|
-
|
68
|
+
R_SetExternalPtrAddr(ptr_obj, model_outputs.get());
|
69
|
+
R_RegisterCFinalizerEx(ptr_obj, delete_model_from_R_ptr, TRUE);
|
70
|
+
model_outputs.release();
|
71
|
+
return R_NilValue;
|
72
|
+
}
|
73
|
+
|
74
|
+
static R_altrep_class_t otree_altrepped_pointer_class;
|
75
|
+
|
76
|
+
R_xlen_t altrepped_pointer_length(SEXP obj)
|
77
|
+
{
|
78
|
+
return 1;
|
79
|
+
}
|
80
|
+
|
81
|
+
SEXP get_element_from_altrepped_ptr(SEXP R_altrepped_obj, R_xlen_t idx)
|
82
|
+
{
|
83
|
+
return R_altrep_data1(R_altrepped_obj);
|
84
|
+
}
|
85
|
+
|
86
|
+
Rboolean inspect_altrepped_pointer(SEXP x, int pre, int deep, int pvec, void (*inspect_subtree)(SEXP, int, int, int))
|
87
|
+
{
|
88
|
+
Rprintf("Altrepped pointer [address:%p]\n", R_ExternalPtrAddr(R_altrep_data1(x)));
|
89
|
+
return TRUE;
|
90
|
+
}
|
91
|
+
|
92
|
+
SEXP duplicate_altrepped_pointer(SEXP altrepped_obj, Rboolean deep)
|
93
|
+
{
|
94
|
+
SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
|
95
|
+
SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
|
96
|
+
SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
|
97
|
+
|
98
|
+
if (!deep) {
|
99
|
+
R_set_altrep_data1(out, R_altrep_data1(altrepped_obj));
|
100
|
+
}
|
101
|
+
|
102
|
+
else {
|
103
|
+
|
104
|
+
SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
105
|
+
|
106
|
+
try {
|
107
|
+
std::unique_ptr<ModelOutputs> new_obj(new ModelOutputs());
|
108
|
+
ModelOutputs *cpp_ptr = (ModelOutputs*)R_ExternalPtrAddr(R_altrep_data1(altrepped_obj));
|
109
|
+
*new_obj = *cpp_ptr;
|
110
|
+
|
111
|
+
R_SetExternalPtrAddr(R_ptr, new_obj.get());
|
112
|
+
R_RegisterCFinalizerEx(R_ptr, delete_model_from_R_ptr, TRUE);
|
113
|
+
new_obj.release();
|
114
|
+
}
|
115
|
+
|
116
|
+
catch (const std::exception &ex) {
|
117
|
+
Rf_error("%s\n", ex.what());
|
118
|
+
}
|
119
|
+
|
120
|
+
R_set_altrep_data1(out, R_ptr);
|
121
|
+
UNPROTECT(1);
|
122
|
+
}
|
123
|
+
|
124
|
+
Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
|
125
|
+
Rf_setAttrib(out, R_NamesSymbol, R_ptr_class);
|
126
|
+
UNPROTECT(3);
|
127
|
+
return out;
|
44
128
|
}
|
45
129
|
|
46
|
-
|
47
|
-
Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
|
130
|
+
SEXP serialize_altrepped_pointer(SEXP altrepped_obj)
|
48
131
|
{
|
49
|
-
return
|
132
|
+
return serialize_OutlierTree((ModelOutputs*)R_ExternalPtrAddr(R_altrep_data1(altrepped_obj)));
|
133
|
+
}
|
134
|
+
|
135
|
+
SEXP deserialize_altrepped_pointer(SEXP cls, SEXP R_state)
|
136
|
+
{
|
137
|
+
SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
|
138
|
+
SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
|
139
|
+
SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
140
|
+
SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
|
141
|
+
|
142
|
+
try {
|
143
|
+
deserialize_OutlierTree(R_state, R_ptr);
|
144
|
+
}
|
145
|
+
catch (const std::exception &ex) {
|
146
|
+
Rf_error("%s\n", ex.what());
|
147
|
+
}
|
148
|
+
|
149
|
+
R_set_altrep_data1(out, R_ptr);
|
150
|
+
Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
|
151
|
+
Rf_setAttrib(out, R_ClassSymbol, R_ptr_class);
|
152
|
+
|
153
|
+
UNPROTECT(4);
|
154
|
+
return out;
|
155
|
+
}
|
156
|
+
|
157
|
+
SEXP get_altrepped_pointer(void *void_ptr)
|
158
|
+
{
|
159
|
+
SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
|
160
|
+
SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
|
161
|
+
SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
162
|
+
SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
|
163
|
+
|
164
|
+
std::unique_ptr<ModelOutputs> *ptr = (std::unique_ptr<ModelOutputs>*)void_ptr;
|
165
|
+
R_SetExternalPtrAddr(R_ptr, ptr->get());
|
166
|
+
R_RegisterCFinalizerEx(R_ptr, delete_model_from_R_ptr, TRUE);
|
167
|
+
ptr->release();
|
168
|
+
|
169
|
+
R_set_altrep_data1(out, R_ptr);
|
170
|
+
Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
|
171
|
+
Rf_setAttrib(out, R_ClassSymbol, R_ptr_class);
|
172
|
+
|
173
|
+
UNPROTECT(4);
|
174
|
+
return out;
|
175
|
+
}
|
176
|
+
|
177
|
+
// [[Rcpp::init]]
|
178
|
+
void init_altrepped_class(DllInfo* dll)
|
179
|
+
{
|
180
|
+
otree_altrepped_pointer_class = R_make_altlist_class("otree_altrepped_pointer_class", "outliertree", dll);
|
181
|
+
R_set_altrep_Length_method(otree_altrepped_pointer_class, altrepped_pointer_length);
|
182
|
+
R_set_altrep_Inspect_method(otree_altrepped_pointer_class, inspect_altrepped_pointer);
|
183
|
+
R_set_altrep_Serialized_state_method(otree_altrepped_pointer_class, serialize_altrepped_pointer);
|
184
|
+
R_set_altrep_Unserialize_method(otree_altrepped_pointer_class, deserialize_altrepped_pointer);
|
185
|
+
R_set_altrep_Duplicate_method(otree_altrepped_pointer_class, duplicate_altrepped_pointer);
|
186
|
+
R_set_altlist_Elt_method(otree_altrepped_pointer_class, get_element_from_altrepped_ptr);
|
187
|
+
}
|
188
|
+
|
189
|
+
SEXP safe_int(void *x)
|
190
|
+
{
|
191
|
+
return Rf_ScalarInteger(*(int*)x);
|
192
|
+
}
|
193
|
+
|
194
|
+
SEXP safe_bool(void *x)
|
195
|
+
{
|
196
|
+
return Rf_ScalarLogical(*(bool*)x);
|
50
197
|
}
|
51
198
|
|
52
199
|
double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_t n, int nthreads)
|
@@ -59,12 +206,22 @@ double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_
|
|
59
206
|
return x_C.data();
|
60
207
|
}
|
61
208
|
|
209
|
+
double* set_R_nan_as_C_nan(double *restrict x_R, Rcpp::NumericVector &x_C, size_t n, int nthreads)
|
210
|
+
{
|
211
|
+
x_C = Rcpp::NumericVector(x_R, x_R + n);
|
212
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(x_R, x_C, n)
|
213
|
+
for (size_t_for i = 0; i < n; i++)
|
214
|
+
if (isnan(x_R[i]) || Rcpp::NumericVector::is_na(x_R[i]) || Rcpp::traits::is_nan<REALSXP>(x_R[i]))
|
215
|
+
x_C[i] = NAN;
|
216
|
+
return REAL(x_C);
|
217
|
+
}
|
218
|
+
|
62
219
|
|
63
220
|
/* for predicting outliers */
|
64
221
|
Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
65
|
-
double *arr_num,
|
66
|
-
int *arr_cat,
|
67
|
-
int *arr_ord,
|
222
|
+
double *restrict arr_num,
|
223
|
+
int *restrict arr_cat,
|
224
|
+
int *restrict arr_ord,
|
68
225
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
69
226
|
Rcpp::ListOf<Rcpp::StringVector> ord_levels,
|
70
227
|
Rcpp::StringVector colnames_num,
|
@@ -209,7 +366,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
209
366
|
} else if (outl_col < (ncols_num + ncols_cat)) {
|
210
367
|
if (outl_col < (ncols_num + ncols_cat_cat)) {
|
211
368
|
tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
|
212
|
-
for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
|
369
|
+
for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
|
213
370
|
if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
|
214
371
|
tmp_bool[cat] = true;
|
215
372
|
}
|
@@ -253,7 +410,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
253
410
|
}
|
254
411
|
} else {
|
255
412
|
tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
|
256
|
-
for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
|
413
|
+
for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
|
257
414
|
if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
|
258
415
|
tmp_bool[cat] = true;
|
259
416
|
}
|
@@ -345,6 +502,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
345
502
|
}
|
346
503
|
break;
|
347
504
|
}
|
505
|
+
|
506
|
+
default:
|
507
|
+
{
|
508
|
+
assert(0);
|
509
|
+
break;
|
510
|
+
}
|
348
511
|
}
|
349
512
|
|
350
513
|
/* add the comparison point */
|
@@ -377,6 +540,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
377
540
|
cond_clust["value_comp"] = Rcpp::as<Rcpp::CharacterVector>(NA_STRING);
|
378
541
|
break;
|
379
542
|
}
|
543
|
+
|
544
|
+
default:
|
545
|
+
{
|
546
|
+
unexpected_error();
|
547
|
+
}
|
380
548
|
}
|
381
549
|
break;
|
382
550
|
}
|
@@ -492,6 +660,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
492
660
|
}
|
493
661
|
break;
|
494
662
|
}
|
663
|
+
|
664
|
+
default:
|
665
|
+
{
|
666
|
+
assert(0);
|
667
|
+
break;
|
668
|
+
}
|
495
669
|
|
496
670
|
}
|
497
671
|
lst_cond[row] = Rcpp::List::create(Rcpp::clone(cond_clust));
|
@@ -528,6 +702,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
528
702
|
cond_clust["column"] = Rcpp::as<Rcpp::CharacterVector>(colnames_ord[model_outputs.all_trees[outl_col][curr_tree].col_num]);
|
529
703
|
break;
|
530
704
|
}
|
705
|
+
|
706
|
+
default:
|
707
|
+
{
|
708
|
+
assert(0);
|
709
|
+
break;
|
710
|
+
}
|
531
711
|
}
|
532
712
|
|
533
713
|
/* add conditions from tree */
|
@@ -599,6 +779,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
599
779
|
break;
|
600
780
|
}
|
601
781
|
|
782
|
+
default: {}
|
602
783
|
}
|
603
784
|
break;
|
604
785
|
}
|
@@ -696,6 +877,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
696
877
|
break;
|
697
878
|
}
|
698
879
|
|
880
|
+
default: {}
|
699
881
|
}
|
700
882
|
break;
|
701
883
|
}
|
@@ -758,10 +940,16 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
758
940
|
break;
|
759
941
|
}
|
760
942
|
|
943
|
+
default: {}
|
761
944
|
}
|
762
945
|
break;
|
763
946
|
}
|
764
947
|
|
948
|
+
default:
|
949
|
+
{
|
950
|
+
assert(0);
|
951
|
+
break;
|
952
|
+
}
|
765
953
|
}
|
766
954
|
}
|
767
955
|
|
@@ -796,6 +984,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
796
984
|
cond_clust["column"] = Rcpp::as<Rcpp::CharacterVector>(colnames_ord[model_outputs.all_trees[outl_col][parent_tree].col_num]);
|
797
985
|
break;
|
798
986
|
}
|
987
|
+
|
988
|
+
default:
|
989
|
+
{
|
990
|
+
assert(0);
|
991
|
+
break;
|
992
|
+
}
|
799
993
|
}
|
800
994
|
|
801
995
|
|
@@ -835,6 +1029,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
835
1029
|
cond_clust["value_comp"] = Rcpp::as<Rcpp::CharacterVector>(NA_STRING);
|
836
1030
|
break;
|
837
1031
|
}
|
1032
|
+
|
1033
|
+
default:
|
1034
|
+
{
|
1035
|
+
unexpected_error();
|
1036
|
+
}
|
838
1037
|
}
|
839
1038
|
break;
|
840
1039
|
}
|
@@ -1011,6 +1210,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
1011
1210
|
break;
|
1012
1211
|
}
|
1013
1212
|
|
1213
|
+
default:
|
1214
|
+
{
|
1215
|
+
assert(0);
|
1216
|
+
break;
|
1217
|
+
}
|
1014
1218
|
}
|
1015
1219
|
|
1016
1220
|
|
@@ -1038,6 +1242,37 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
1038
1242
|
return outp;
|
1039
1243
|
}
|
1040
1244
|
|
1245
|
+
struct args_describe_outliers {
|
1246
|
+
ModelOutputs *model_outputs;
|
1247
|
+
double *arr_num;
|
1248
|
+
int *arr_cat;
|
1249
|
+
int *arr_ord;
|
1250
|
+
Rcpp::ListOf<Rcpp::StringVector> *cat_levels;
|
1251
|
+
Rcpp::ListOf<Rcpp::StringVector> *ord_levels;
|
1252
|
+
Rcpp::StringVector *colnames_num;
|
1253
|
+
Rcpp::StringVector *colnames_cat;
|
1254
|
+
Rcpp::StringVector *colnames_ord;
|
1255
|
+
Rcpp::NumericVector *min_date;
|
1256
|
+
Rcpp::NumericVector *min_ts;
|
1257
|
+
};
|
1258
|
+
|
1259
|
+
SEXP describe_outliers_wrapper(void *args_)
|
1260
|
+
{
|
1261
|
+
args_describe_outliers *args = (args_describe_outliers*)args_;
|
1262
|
+
return describe_outliers(*(args->model_outputs),
|
1263
|
+
args->arr_num,
|
1264
|
+
args->arr_cat,
|
1265
|
+
args->arr_ord,
|
1266
|
+
*(args->cat_levels),
|
1267
|
+
*(args->ord_levels),
|
1268
|
+
*(args->colnames_num),
|
1269
|
+
*(args->colnames_cat),
|
1270
|
+
*(args->colnames_ord),
|
1271
|
+
*(args->min_date),
|
1272
|
+
*(args->min_ts));
|
1273
|
+
}
|
1274
|
+
|
1275
|
+
|
1041
1276
|
/* for extracting info about flaggable outliers */
|
1042
1277
|
Rcpp::List extract_outl_bounds(ModelOutputs &model_outputs,
|
1043
1278
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
@@ -1102,9 +1337,27 @@ Rcpp::List extract_outl_bounds(ModelOutputs &model_outputs,
|
|
1102
1337
|
return outp;
|
1103
1338
|
}
|
1104
1339
|
|
1340
|
+
struct args_extract_outl_bounds {
|
1341
|
+
ModelOutputs *model_outputs;
|
1342
|
+
Rcpp::ListOf<Rcpp::StringVector> *cat_levels;
|
1343
|
+
Rcpp::ListOf<Rcpp::StringVector> *ord_levels;
|
1344
|
+
Rcpp::NumericVector *min_date;
|
1345
|
+
Rcpp::NumericVector *min_ts;
|
1346
|
+
};
|
1347
|
+
|
1348
|
+
SEXP extract_outl_bounds_wrapper(void *args_)
|
1349
|
+
{
|
1350
|
+
args_extract_outl_bounds *args = (args_extract_outl_bounds*)args_;
|
1351
|
+
return extract_outl_bounds(*(args->model_outputs),
|
1352
|
+
*(args->cat_levels),
|
1353
|
+
*(args->ord_levels),
|
1354
|
+
*(args->min_date),
|
1355
|
+
*(args->min_ts));
|
1356
|
+
}
|
1357
|
+
|
1105
1358
|
|
1106
1359
|
/* external functions for fitting the model and predicting outliers */
|
1107
|
-
// [[Rcpp::export]]
|
1360
|
+
// [[Rcpp::export(rng = false)]]
|
1108
1361
|
Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
1109
1362
|
Rcpp::IntegerVector arr_cat, size_t ncols_categ, Rcpp::IntegerVector ncat,
|
1110
1363
|
Rcpp::IntegerVector arr_ord, size_t ncols_ord, Rcpp::IntegerVector ncat_ord,
|
@@ -1121,8 +1374,16 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1121
1374
|
Rcpp::NumericVector min_date,
|
1122
1375
|
Rcpp::NumericVector min_ts)
|
1123
1376
|
{
|
1377
|
+
Rcpp::List outp = Rcpp::List::create(
|
1378
|
+
Rcpp::_["ptr_model"] = R_NilValue,
|
1379
|
+
Rcpp::_["bounds"] = R_NilValue,
|
1380
|
+
Rcpp::_["outliers_info"] = R_NilValue,
|
1381
|
+
Rcpp::_["ntrees"] = R_NilValue,
|
1382
|
+
Rcpp::_["nclust"] = R_NilValue,
|
1383
|
+
Rcpp::_["found_outliers"] = R_NilValue
|
1384
|
+
);
|
1385
|
+
|
1124
1386
|
bool found_outliers;
|
1125
|
-
Rcpp::List outp;
|
1126
1387
|
size_t tot_cols = ncols_numeric + ncols_categ + ncols_ord;
|
1127
1388
|
std::vector<char> cols_ignore;
|
1128
1389
|
char *cols_ignore_ptr = NULL;
|
@@ -1132,54 +1393,62 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1132
1393
|
cols_ignore_ptr = &cols_ignore[0];
|
1133
1394
|
}
|
1134
1395
|
std::vector<double> Xcpp;
|
1135
|
-
double *arr_num_C = set_R_nan_as_C_nan(
|
1396
|
+
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1136
1397
|
|
1137
1398
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
1138
1399
|
found_outliers = fit_outliers_models(*model_outputs,
|
1139
1400
|
arr_num_C, ncols_numeric,
|
1140
|
-
|
1141
|
-
|
1401
|
+
INTEGER(arr_cat), ncols_categ, INTEGER(ncat),
|
1402
|
+
INTEGER(arr_ord), ncols_ord, INTEGER(ncat_ord),
|
1142
1403
|
nrows, cols_ignore_ptr, nthreads,
|
1143
1404
|
categ_as_bin, ord_as_bin, cat_bruteforce_subset, categ_from_maj, take_mid,
|
1144
1405
|
max_depth, max_perc_outliers, min_size_numeric, min_size_categ,
|
1145
1406
|
min_gain, gain_as_pct, follow_all, z_norm, z_outlier);
|
1146
1407
|
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1408
|
+
args_extract_outl_bounds temp = {
|
1409
|
+
model_outputs.get(),
|
1410
|
+
&cat_levels,
|
1411
|
+
&ord_levels,
|
1412
|
+
&min_date,
|
1413
|
+
&min_ts
|
1414
|
+
};
|
1415
|
+
outp["bounds"] = Rcpp::unwindProtect(extract_outl_bounds_wrapper, (void*)&temp);
|
1152
1416
|
|
1153
|
-
outp["serialized_obj"] = serialize_OutlierTree(model_outputs.get());
|
1154
1417
|
if (return_outliers) {
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1418
|
+
args_describe_outliers temp = {
|
1419
|
+
model_outputs.get(),
|
1420
|
+
arr_num_C,
|
1421
|
+
INTEGER(arr_cat),
|
1422
|
+
INTEGER(arr_ord),
|
1423
|
+
&cat_levels,
|
1424
|
+
&ord_levels,
|
1425
|
+
&colnames_num,
|
1426
|
+
&colnames_cat,
|
1427
|
+
&colnames_ord,
|
1428
|
+
&min_date,
|
1429
|
+
&min_ts
|
1430
|
+
};
|
1431
|
+
outp["outliers_info"] = Rcpp::unwindProtect(describe_outliers_wrapper, (void*)&temp);
|
1166
1432
|
}
|
1433
|
+
forget_row_outputs(*model_outputs);
|
1434
|
+
|
1167
1435
|
/* add number of trees and clusters */
|
1168
1436
|
size_t ntrees = 0, nclust = 0;
|
1169
1437
|
for (size_t col = 0; col < model_outputs->all_trees.size(); col++) {
|
1170
1438
|
ntrees += model_outputs->all_trees[col].size();
|
1171
1439
|
nclust += model_outputs->all_clusters[col].size();
|
1172
1440
|
}
|
1173
|
-
|
1174
|
-
|
1175
|
-
outp["
|
1441
|
+
int ntrees_int = (int)ntrees;
|
1442
|
+
int nclust_int = (int)nclust;
|
1443
|
+
outp["ntrees"] = Rcpp::unwindProtect(safe_int, (void*)&ntrees_int);
|
1444
|
+
outp["nclust"] = Rcpp::unwindProtect(safe_int, (void*)&nclust_int);
|
1445
|
+
outp["found_outliers"] = Rcpp::unwindProtect(safe_bool, (void*)&found_outliers);
|
1176
1446
|
|
1177
|
-
|
1178
|
-
outp["ptr_model"] = Rcpp::XPtr<ModelOutputs>(model_outputs.release(), true);
|
1447
|
+
outp["ptr_model"] = Rcpp::unwindProtect(get_altrepped_pointer, &model_outputs);
|
1179
1448
|
return outp;
|
1180
1449
|
}
|
1181
1450
|
|
1182
|
-
// [[Rcpp::export]]
|
1451
|
+
// [[Rcpp::export(rng = false)]]
|
1183
1452
|
Rcpp::List predict_OutlierTree(SEXP ptr_model, size_t nrows, int nthreads,
|
1184
1453
|
Rcpp::NumericVector arr_num, Rcpp::IntegerVector arr_cat, Rcpp::IntegerVector arr_ord,
|
1185
1454
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
@@ -1190,36 +1459,59 @@ Rcpp::List predict_OutlierTree(SEXP ptr_model, size_t nrows, int nthreads,
|
|
1190
1459
|
Rcpp::NumericVector min_date,
|
1191
1460
|
Rcpp::NumericVector min_ts)
|
1192
1461
|
{
|
1193
|
-
|
1194
|
-
double *arr_num_C = set_R_nan_as_C_nan(
|
1462
|
+
Rcpp::NumericVector Xcpp;
|
1463
|
+
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1195
1464
|
|
1196
1465
|
ModelOutputs *model_outputs = static_cast<ModelOutputs*>(R_ExternalPtrAddr(ptr_model));
|
1197
|
-
bool found_outliers = find_new_outliers(
|
1466
|
+
bool found_outliers = find_new_outliers(arr_num_C, INTEGER(arr_cat), INTEGER(arr_ord),
|
1198
1467
|
nrows, nthreads, *model_outputs);
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1468
|
+
args_describe_outliers temp = {
|
1469
|
+
model_outputs,
|
1470
|
+
arr_num_C,
|
1471
|
+
INTEGER(arr_cat),
|
1472
|
+
INTEGER(arr_ord),
|
1473
|
+
&cat_levels,
|
1474
|
+
&ord_levels,
|
1475
|
+
&colnames_num,
|
1476
|
+
&colnames_cat,
|
1477
|
+
&colnames_ord,
|
1478
|
+
&min_date,
|
1479
|
+
&min_ts
|
1480
|
+
};
|
1481
|
+
|
1482
|
+
Rcpp::List outp;
|
1483
|
+
try {
|
1484
|
+
outp = Rcpp::unwindProtect(describe_outliers_wrapper, (void*)&temp);
|
1485
|
+
} catch(...) {
|
1486
|
+
forget_row_outputs(*model_outputs);
|
1487
|
+
throw;
|
1488
|
+
}
|
1211
1489
|
forget_row_outputs(*model_outputs);
|
1490
|
+
outp["found_outliers"] = Rcpp::LogicalVector(found_outliers);
|
1212
1491
|
return outp;
|
1213
1492
|
}
|
1214
1493
|
|
1215
|
-
// [[Rcpp::export]]
|
1494
|
+
// [[Rcpp::export(rng = false)]]
|
1216
1495
|
Rcpp::LogicalVector check_few_values(Rcpp::NumericVector arr_num, size_t nrows, size_t ncols, int nthreads)
|
1217
1496
|
{
|
1218
|
-
std::vector<char> too_few_vals(ncols, 0);
|
1219
|
-
check_more_two_values(&arr_num[0], nrows, ncols, nthreads, too_few_vals.data());
|
1220
1497
|
Rcpp::LogicalVector outp(ncols);
|
1498
|
+
std::vector<char> too_few_vals(ncols, 0);
|
1499
|
+
check_more_two_values(REAL(arr_num), nrows, ncols, nthreads, too_few_vals.data());
|
1221
1500
|
for (size_t col = 0; col < ncols; col++) {
|
1222
1501
|
outp[col] = (bool) too_few_vals[col];
|
1223
1502
|
}
|
1224
1503
|
return outp;
|
1225
1504
|
}
|
1505
|
+
|
1506
|
+
|
1507
|
+
// [[Rcpp::export(rng = false)]]
|
1508
|
+
bool R_has_openmp()
|
1509
|
+
{
|
1510
|
+
#ifdef _OPENMP
|
1511
|
+
return true;
|
1512
|
+
#else
|
1513
|
+
return false;
|
1514
|
+
#endif
|
1515
|
+
}
|
1516
|
+
|
1517
|
+
#endif /* _FOR_R */
|
@@ -74,7 +74,7 @@
|
|
74
74
|
*/
|
75
75
|
void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
76
76
|
long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
|
77
|
-
double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
77
|
+
double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
78
78
|
double *next_most_comm)
|
79
79
|
{
|
80
80
|
//TODO: must also establish bounds for new, unseen categories
|
@@ -90,7 +90,7 @@ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, d
|
|
90
90
|
size_t size_tail = 0;
|
91
91
|
|
92
92
|
/* reset the temporary arrays and fill them */
|
93
|
-
memset(is_outlier, 0, ncateg * sizeof(char));
|
93
|
+
memset(is_outlier, 0, ncateg * sizeof(signed char));
|
94
94
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
95
95
|
buffer_ix[cat] = cat;
|
96
96
|
buffer_perc[cat] = (categ_counts[cat] > 0)? ((long double)categ_counts[cat] / tot_dbl) : 0;
|
@@ -225,13 +225,13 @@ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, d
|
|
225
225
|
* Category to which the majority of the observations belong.
|
226
226
|
*/
|
227
227
|
void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
228
|
-
long double prior_prob[], double z_outlier, char is_outlier[],
|
228
|
+
long double prior_prob[], double z_outlier, signed char is_outlier[],
|
229
229
|
bool *found_outliers, bool *new_is_outlier, int *categ_maj)
|
230
230
|
{
|
231
231
|
/* initialize parameters as needed */
|
232
232
|
*found_outliers = false;
|
233
233
|
*new_is_outlier = false;
|
234
|
-
memset(is_outlier, 0, ncateg * sizeof(char));
|
234
|
+
memset(is_outlier, 0, ncateg * sizeof(signed char));
|
235
235
|
size_t max_outliers = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
|
236
236
|
long double tot_dbl = (long double) (tot + 1);
|
237
237
|
size_t n_non_maj;
|
@@ -283,7 +283,7 @@ void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t
|
|
283
283
|
* Proportion of the least common non-outlier category.
|
284
284
|
*/
|
285
285
|
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
|
286
|
-
char is_outlier[], double *next_most_comm)
|
286
|
+
signed char is_outlier[], double *next_most_comm)
|
287
287
|
{
|
288
288
|
/* if sample is too small, don't flag any as outliers */
|
289
289
|
if (tot < 1000) return false;
|
@@ -296,7 +296,7 @@ bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_
|
|
296
296
|
|
297
297
|
/* look if there's any category meeting the first condition and none meeting the second one */
|
298
298
|
bool has_outlier_cat = false;
|
299
|
-
memset(is_outlier, 0, sizeof(char) * ncateg);
|
299
|
+
memset(is_outlier, 0, sizeof(signed char) * ncateg);
|
300
300
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
301
301
|
if (categ_counts[cat] > max_outliers && categ_counts[cat] < max_next_most_comm) {
|
302
302
|
has_outlier_cat = false;
|