outliertree 0.2.1 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +23 -0
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +83 -41
- data/vendor/outliertree/src/Makevars.in +3 -0
- data/vendor/outliertree/src/Makevars.win +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +17 -27
- data/vendor/outliertree/src/Rwrapper.cpp +354 -62
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +525 -331
- data/vendor/outliertree/src/misc.cpp +166 -17
- data/vendor/outliertree/src/outlier_tree.hpp +164 -56
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -1,4 +1,8 @@
|
|
1
|
+
#ifdef _FOR_R
|
2
|
+
|
1
3
|
#include <Rcpp.h>
|
4
|
+
#include <Rcpp/unwindProtect.h>
|
5
|
+
#include <R_ext/Altrep.h>
|
2
6
|
// [[Rcpp::plugins(cpp11)]]
|
3
7
|
|
4
8
|
/* This is to serialize the model objects */
|
@@ -7,15 +11,31 @@
|
|
7
11
|
#include <cereal/types/vector.hpp>
|
8
12
|
#include <sstream>
|
9
13
|
#include <string>
|
14
|
+
#include <limits>
|
10
15
|
|
11
16
|
/* This is the package's header */
|
12
17
|
#include "outlier_tree.hpp"
|
13
18
|
|
19
|
+
void delete_model_from_R_ptr(SEXP R_ptr)
|
20
|
+
{
|
21
|
+
ModelOutputs *model = static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));
|
22
|
+
delete model;
|
23
|
+
R_SetExternalPtrAddr(R_ptr, nullptr);
|
24
|
+
R_ClearExternalPtr(R_ptr);
|
25
|
+
}
|
26
|
+
|
27
|
+
SEXP alloc_RawVec(void *data)
|
28
|
+
{
|
29
|
+
size_t vec_size = *(size_t*)data;
|
30
|
+
if (vec_size > (size_t)std::numeric_limits<R_xlen_t>::max())
|
31
|
+
Rcpp::stop("Resulting model is too large for R to handle.");
|
32
|
+
return Rf_allocVector(RAWSXP, vec_size);
|
33
|
+
}
|
34
|
+
|
14
35
|
/* for model serialization and re-usage in R */
|
15
36
|
/* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
|
16
37
|
/* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
|
17
|
-
|
18
|
-
Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
38
|
+
SEXP serialize_OutlierTree(ModelOutputs *model_outputs)
|
19
39
|
{
|
20
40
|
std::stringstream ss;
|
21
41
|
{
|
@@ -23,30 +43,157 @@ Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
|
23
43
|
oarchive(*model_outputs);
|
24
44
|
}
|
25
45
|
ss.seekg(0, ss.end);
|
26
|
-
|
46
|
+
std::stringstream::pos_type vec_size = ss.tellg();
|
47
|
+
if (vec_size <= 0) {
|
48
|
+
Rf_error("Error: model is too big to serialize, resulting object will not be usable.\n");
|
49
|
+
}
|
50
|
+
size_t vec_size_ = (size_t)vec_size;
|
51
|
+
SEXP retval = PROTECT(Rcpp::unwindProtect(alloc_RawVec, (void*)&vec_size_));
|
27
52
|
ss.seekg(0, ss.beg);
|
28
|
-
ss.read(reinterpret_cast<char*>(
|
53
|
+
ss.read(reinterpret_cast<char*>(RAW(retval)), vec_size_);
|
54
|
+
UNPROTECT(1);
|
29
55
|
return retval;
|
30
56
|
}
|
31
57
|
|
32
|
-
|
33
|
-
SEXP deserialize_OutlierTree(Rcpp::RawVector src)
|
58
|
+
SEXP deserialize_OutlierTree(SEXP src, SEXP ptr_obj)
|
34
59
|
{
|
35
60
|
std::stringstream ss;
|
36
|
-
ss.write(reinterpret_cast<char*>(
|
61
|
+
ss.write(reinterpret_cast<char*>(RAW(src)), Rf_xlength(src));
|
37
62
|
ss.seekg(0, ss.beg);
|
38
63
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
39
64
|
{
|
40
65
|
cereal::BinaryInputArchive iarchive(ss);
|
41
66
|
iarchive(*model_outputs);
|
42
67
|
}
|
43
|
-
|
68
|
+
R_SetExternalPtrAddr(ptr_obj, model_outputs.get());
|
69
|
+
R_RegisterCFinalizerEx(ptr_obj, delete_model_from_R_ptr, TRUE);
|
70
|
+
model_outputs.release();
|
71
|
+
return R_NilValue;
|
72
|
+
}
|
73
|
+
|
74
|
+
static R_altrep_class_t otree_altrepped_pointer_class;
|
75
|
+
|
76
|
+
R_xlen_t altrepped_pointer_length(SEXP obj)
|
77
|
+
{
|
78
|
+
return 1;
|
79
|
+
}
|
80
|
+
|
81
|
+
SEXP get_element_from_altrepped_ptr(SEXP R_altrepped_obj, R_xlen_t idx)
|
82
|
+
{
|
83
|
+
return R_altrep_data1(R_altrepped_obj);
|
84
|
+
}
|
85
|
+
|
86
|
+
Rboolean inspect_altrepped_pointer(SEXP x, int pre, int deep, int pvec, void (*inspect_subtree)(SEXP, int, int, int))
|
87
|
+
{
|
88
|
+
Rprintf("Altrepped pointer [address:%p]\n", R_ExternalPtrAddr(R_altrep_data1(x)));
|
89
|
+
return TRUE;
|
90
|
+
}
|
91
|
+
|
92
|
+
SEXP duplicate_altrepped_pointer(SEXP altrepped_obj, Rboolean deep)
|
93
|
+
{
|
94
|
+
SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
|
95
|
+
SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
|
96
|
+
SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
|
97
|
+
|
98
|
+
if (!deep) {
|
99
|
+
R_set_altrep_data1(out, R_altrep_data1(altrepped_obj));
|
100
|
+
}
|
101
|
+
|
102
|
+
else {
|
103
|
+
|
104
|
+
SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
105
|
+
|
106
|
+
try {
|
107
|
+
std::unique_ptr<ModelOutputs> new_obj(new ModelOutputs());
|
108
|
+
ModelOutputs *cpp_ptr = (ModelOutputs*)R_ExternalPtrAddr(R_altrep_data1(altrepped_obj));
|
109
|
+
*new_obj = *cpp_ptr;
|
110
|
+
|
111
|
+
R_SetExternalPtrAddr(R_ptr, new_obj.get());
|
112
|
+
R_RegisterCFinalizerEx(R_ptr, delete_model_from_R_ptr, TRUE);
|
113
|
+
new_obj.release();
|
114
|
+
}
|
115
|
+
|
116
|
+
catch (const std::exception &ex) {
|
117
|
+
Rf_error("%s\n", ex.what());
|
118
|
+
}
|
119
|
+
|
120
|
+
R_set_altrep_data1(out, R_ptr);
|
121
|
+
UNPROTECT(1);
|
122
|
+
}
|
123
|
+
|
124
|
+
Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
|
125
|
+
Rf_setAttrib(out, R_NamesSymbol, R_ptr_class);
|
126
|
+
UNPROTECT(3);
|
127
|
+
return out;
|
44
128
|
}
|
45
129
|
|
46
|
-
|
47
|
-
Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
|
130
|
+
SEXP serialize_altrepped_pointer(SEXP altrepped_obj)
|
48
131
|
{
|
49
|
-
return
|
132
|
+
return serialize_OutlierTree((ModelOutputs*)R_ExternalPtrAddr(R_altrep_data1(altrepped_obj)));
|
133
|
+
}
|
134
|
+
|
135
|
+
SEXP deserialize_altrepped_pointer(SEXP cls, SEXP R_state)
|
136
|
+
{
|
137
|
+
SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
|
138
|
+
SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
|
139
|
+
SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
140
|
+
SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
|
141
|
+
|
142
|
+
try {
|
143
|
+
deserialize_OutlierTree(R_state, R_ptr);
|
144
|
+
}
|
145
|
+
catch (const std::exception &ex) {
|
146
|
+
Rf_error("%s\n", ex.what());
|
147
|
+
}
|
148
|
+
|
149
|
+
R_set_altrep_data1(out, R_ptr);
|
150
|
+
Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
|
151
|
+
Rf_setAttrib(out, R_ClassSymbol, R_ptr_class);
|
152
|
+
|
153
|
+
UNPROTECT(4);
|
154
|
+
return out;
|
155
|
+
}
|
156
|
+
|
157
|
+
SEXP get_altrepped_pointer(void *void_ptr)
|
158
|
+
{
|
159
|
+
SEXP R_ptr_name = PROTECT(Rf_mkString("ptr"));
|
160
|
+
SEXP R_ptr_class = PROTECT(Rf_mkString("otree_altrepped_handle"));
|
161
|
+
SEXP R_ptr = PROTECT(R_MakeExternalPtr(nullptr, R_NilValue, R_NilValue));
|
162
|
+
SEXP out = PROTECT(R_new_altrep(otree_altrepped_pointer_class, R_NilValue, R_NilValue));
|
163
|
+
|
164
|
+
std::unique_ptr<ModelOutputs> *ptr = (std::unique_ptr<ModelOutputs>*)void_ptr;
|
165
|
+
R_SetExternalPtrAddr(R_ptr, ptr->get());
|
166
|
+
R_RegisterCFinalizerEx(R_ptr, delete_model_from_R_ptr, TRUE);
|
167
|
+
ptr->release();
|
168
|
+
|
169
|
+
R_set_altrep_data1(out, R_ptr);
|
170
|
+
Rf_setAttrib(out, R_NamesSymbol, R_ptr_name);
|
171
|
+
Rf_setAttrib(out, R_ClassSymbol, R_ptr_class);
|
172
|
+
|
173
|
+
UNPROTECT(4);
|
174
|
+
return out;
|
175
|
+
}
|
176
|
+
|
177
|
+
// [[Rcpp::init]]
|
178
|
+
void init_altrepped_class(DllInfo* dll)
|
179
|
+
{
|
180
|
+
otree_altrepped_pointer_class = R_make_altlist_class("otree_altrepped_pointer_class", "outliertree", dll);
|
181
|
+
R_set_altrep_Length_method(otree_altrepped_pointer_class, altrepped_pointer_length);
|
182
|
+
R_set_altrep_Inspect_method(otree_altrepped_pointer_class, inspect_altrepped_pointer);
|
183
|
+
R_set_altrep_Serialized_state_method(otree_altrepped_pointer_class, serialize_altrepped_pointer);
|
184
|
+
R_set_altrep_Unserialize_method(otree_altrepped_pointer_class, deserialize_altrepped_pointer);
|
185
|
+
R_set_altrep_Duplicate_method(otree_altrepped_pointer_class, duplicate_altrepped_pointer);
|
186
|
+
R_set_altlist_Elt_method(otree_altrepped_pointer_class, get_element_from_altrepped_ptr);
|
187
|
+
}
|
188
|
+
|
189
|
+
SEXP safe_int(void *x)
|
190
|
+
{
|
191
|
+
return Rf_ScalarInteger(*(int*)x);
|
192
|
+
}
|
193
|
+
|
194
|
+
SEXP safe_bool(void *x)
|
195
|
+
{
|
196
|
+
return Rf_ScalarLogical(*(bool*)x);
|
50
197
|
}
|
51
198
|
|
52
199
|
double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_t n, int nthreads)
|
@@ -59,12 +206,22 @@ double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_
|
|
59
206
|
return x_C.data();
|
60
207
|
}
|
61
208
|
|
209
|
+
double* set_R_nan_as_C_nan(double *restrict x_R, Rcpp::NumericVector &x_C, size_t n, int nthreads)
|
210
|
+
{
|
211
|
+
x_C = Rcpp::NumericVector(x_R, x_R + n);
|
212
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(x_R, x_C, n)
|
213
|
+
for (size_t_for i = 0; i < n; i++)
|
214
|
+
if (isnan(x_R[i]) || Rcpp::NumericVector::is_na(x_R[i]) || Rcpp::traits::is_nan<REALSXP>(x_R[i]))
|
215
|
+
x_C[i] = NAN;
|
216
|
+
return REAL(x_C);
|
217
|
+
}
|
218
|
+
|
62
219
|
|
63
220
|
/* for predicting outliers */
|
64
221
|
Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
65
|
-
double *arr_num,
|
66
|
-
int *arr_cat,
|
67
|
-
int *arr_ord,
|
222
|
+
double *restrict arr_num,
|
223
|
+
int *restrict arr_cat,
|
224
|
+
int *restrict arr_ord,
|
68
225
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
69
226
|
Rcpp::ListOf<Rcpp::StringVector> ord_levels,
|
70
227
|
Rcpp::StringVector colnames_num,
|
@@ -209,7 +366,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
209
366
|
} else if (outl_col < (ncols_num + ncols_cat)) {
|
210
367
|
if (outl_col < (ncols_num + ncols_cat_cat)) {
|
211
368
|
tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
|
212
|
-
for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
|
369
|
+
for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
|
213
370
|
if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
|
214
371
|
tmp_bool[cat] = true;
|
215
372
|
}
|
@@ -253,7 +410,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
253
410
|
}
|
254
411
|
} else {
|
255
412
|
tmp_bool = Rcpp::LogicalVector(model_outputs.all_clusters[outl_col][outl_clust].subset_common.size(), false);
|
256
|
-
for (size_t cat = 0; cat < tmp_bool.size(); cat++) {
|
413
|
+
for (size_t cat = 0; cat < (size_t)tmp_bool.size(); cat++) {
|
257
414
|
if (model_outputs.all_clusters[outl_col][outl_clust].subset_common[cat] == 0) {
|
258
415
|
tmp_bool[cat] = true;
|
259
416
|
}
|
@@ -345,6 +502,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
345
502
|
}
|
346
503
|
break;
|
347
504
|
}
|
505
|
+
|
506
|
+
default:
|
507
|
+
{
|
508
|
+
assert(0);
|
509
|
+
break;
|
510
|
+
}
|
348
511
|
}
|
349
512
|
|
350
513
|
/* add the comparison point */
|
@@ -377,6 +540,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
377
540
|
cond_clust["value_comp"] = Rcpp::as<Rcpp::CharacterVector>(NA_STRING);
|
378
541
|
break;
|
379
542
|
}
|
543
|
+
|
544
|
+
default:
|
545
|
+
{
|
546
|
+
unexpected_error();
|
547
|
+
}
|
380
548
|
}
|
381
549
|
break;
|
382
550
|
}
|
@@ -492,6 +660,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
492
660
|
}
|
493
661
|
break;
|
494
662
|
}
|
663
|
+
|
664
|
+
default:
|
665
|
+
{
|
666
|
+
assert(0);
|
667
|
+
break;
|
668
|
+
}
|
495
669
|
|
496
670
|
}
|
497
671
|
lst_cond[row] = Rcpp::List::create(Rcpp::clone(cond_clust));
|
@@ -528,6 +702,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
528
702
|
cond_clust["column"] = Rcpp::as<Rcpp::CharacterVector>(colnames_ord[model_outputs.all_trees[outl_col][curr_tree].col_num]);
|
529
703
|
break;
|
530
704
|
}
|
705
|
+
|
706
|
+
default:
|
707
|
+
{
|
708
|
+
assert(0);
|
709
|
+
break;
|
710
|
+
}
|
531
711
|
}
|
532
712
|
|
533
713
|
/* add conditions from tree */
|
@@ -599,6 +779,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
599
779
|
break;
|
600
780
|
}
|
601
781
|
|
782
|
+
default: {}
|
602
783
|
}
|
603
784
|
break;
|
604
785
|
}
|
@@ -696,6 +877,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
696
877
|
break;
|
697
878
|
}
|
698
879
|
|
880
|
+
default: {}
|
699
881
|
}
|
700
882
|
break;
|
701
883
|
}
|
@@ -758,10 +940,16 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
758
940
|
break;
|
759
941
|
}
|
760
942
|
|
943
|
+
default: {}
|
761
944
|
}
|
762
945
|
break;
|
763
946
|
}
|
764
947
|
|
948
|
+
default:
|
949
|
+
{
|
950
|
+
assert(0);
|
951
|
+
break;
|
952
|
+
}
|
765
953
|
}
|
766
954
|
}
|
767
955
|
|
@@ -796,6 +984,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
796
984
|
cond_clust["column"] = Rcpp::as<Rcpp::CharacterVector>(colnames_ord[model_outputs.all_trees[outl_col][parent_tree].col_num]);
|
797
985
|
break;
|
798
986
|
}
|
987
|
+
|
988
|
+
default:
|
989
|
+
{
|
990
|
+
assert(0);
|
991
|
+
break;
|
992
|
+
}
|
799
993
|
}
|
800
994
|
|
801
995
|
|
@@ -835,6 +1029,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
835
1029
|
cond_clust["value_comp"] = Rcpp::as<Rcpp::CharacterVector>(NA_STRING);
|
836
1030
|
break;
|
837
1031
|
}
|
1032
|
+
|
1033
|
+
default:
|
1034
|
+
{
|
1035
|
+
unexpected_error();
|
1036
|
+
}
|
838
1037
|
}
|
839
1038
|
break;
|
840
1039
|
}
|
@@ -1011,6 +1210,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
1011
1210
|
break;
|
1012
1211
|
}
|
1013
1212
|
|
1213
|
+
default:
|
1214
|
+
{
|
1215
|
+
assert(0);
|
1216
|
+
break;
|
1217
|
+
}
|
1014
1218
|
}
|
1015
1219
|
|
1016
1220
|
|
@@ -1038,6 +1242,37 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
1038
1242
|
return outp;
|
1039
1243
|
}
|
1040
1244
|
|
1245
|
+
struct args_describe_outliers {
|
1246
|
+
ModelOutputs *model_outputs;
|
1247
|
+
double *arr_num;
|
1248
|
+
int *arr_cat;
|
1249
|
+
int *arr_ord;
|
1250
|
+
Rcpp::ListOf<Rcpp::StringVector> *cat_levels;
|
1251
|
+
Rcpp::ListOf<Rcpp::StringVector> *ord_levels;
|
1252
|
+
Rcpp::StringVector *colnames_num;
|
1253
|
+
Rcpp::StringVector *colnames_cat;
|
1254
|
+
Rcpp::StringVector *colnames_ord;
|
1255
|
+
Rcpp::NumericVector *min_date;
|
1256
|
+
Rcpp::NumericVector *min_ts;
|
1257
|
+
};
|
1258
|
+
|
1259
|
+
SEXP describe_outliers_wrapper(void *args_)
|
1260
|
+
{
|
1261
|
+
args_describe_outliers *args = (args_describe_outliers*)args_;
|
1262
|
+
return describe_outliers(*(args->model_outputs),
|
1263
|
+
args->arr_num,
|
1264
|
+
args->arr_cat,
|
1265
|
+
args->arr_ord,
|
1266
|
+
*(args->cat_levels),
|
1267
|
+
*(args->ord_levels),
|
1268
|
+
*(args->colnames_num),
|
1269
|
+
*(args->colnames_cat),
|
1270
|
+
*(args->colnames_ord),
|
1271
|
+
*(args->min_date),
|
1272
|
+
*(args->min_ts));
|
1273
|
+
}
|
1274
|
+
|
1275
|
+
|
1041
1276
|
/* for extracting info about flaggable outliers */
|
1042
1277
|
Rcpp::List extract_outl_bounds(ModelOutputs &model_outputs,
|
1043
1278
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
@@ -1102,9 +1337,27 @@ Rcpp::List extract_outl_bounds(ModelOutputs &model_outputs,
|
|
1102
1337
|
return outp;
|
1103
1338
|
}
|
1104
1339
|
|
1340
|
+
struct args_extract_outl_bounds {
|
1341
|
+
ModelOutputs *model_outputs;
|
1342
|
+
Rcpp::ListOf<Rcpp::StringVector> *cat_levels;
|
1343
|
+
Rcpp::ListOf<Rcpp::StringVector> *ord_levels;
|
1344
|
+
Rcpp::NumericVector *min_date;
|
1345
|
+
Rcpp::NumericVector *min_ts;
|
1346
|
+
};
|
1347
|
+
|
1348
|
+
SEXP extract_outl_bounds_wrapper(void *args_)
|
1349
|
+
{
|
1350
|
+
args_extract_outl_bounds *args = (args_extract_outl_bounds*)args_;
|
1351
|
+
return extract_outl_bounds(*(args->model_outputs),
|
1352
|
+
*(args->cat_levels),
|
1353
|
+
*(args->ord_levels),
|
1354
|
+
*(args->min_date),
|
1355
|
+
*(args->min_ts));
|
1356
|
+
}
|
1357
|
+
|
1105
1358
|
|
1106
1359
|
/* external functions for fitting the model and predicting outliers */
|
1107
|
-
// [[Rcpp::export]]
|
1360
|
+
// [[Rcpp::export(rng = false)]]
|
1108
1361
|
Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
1109
1362
|
Rcpp::IntegerVector arr_cat, size_t ncols_categ, Rcpp::IntegerVector ncat,
|
1110
1363
|
Rcpp::IntegerVector arr_ord, size_t ncols_ord, Rcpp::IntegerVector ncat_ord,
|
@@ -1121,8 +1374,16 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1121
1374
|
Rcpp::NumericVector min_date,
|
1122
1375
|
Rcpp::NumericVector min_ts)
|
1123
1376
|
{
|
1377
|
+
Rcpp::List outp = Rcpp::List::create(
|
1378
|
+
Rcpp::_["ptr_model"] = R_NilValue,
|
1379
|
+
Rcpp::_["bounds"] = R_NilValue,
|
1380
|
+
Rcpp::_["outliers_info"] = R_NilValue,
|
1381
|
+
Rcpp::_["ntrees"] = R_NilValue,
|
1382
|
+
Rcpp::_["nclust"] = R_NilValue,
|
1383
|
+
Rcpp::_["found_outliers"] = R_NilValue
|
1384
|
+
);
|
1385
|
+
|
1124
1386
|
bool found_outliers;
|
1125
|
-
Rcpp::List outp;
|
1126
1387
|
size_t tot_cols = ncols_numeric + ncols_categ + ncols_ord;
|
1127
1388
|
std::vector<char> cols_ignore;
|
1128
1389
|
char *cols_ignore_ptr = NULL;
|
@@ -1132,54 +1393,62 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1132
1393
|
cols_ignore_ptr = &cols_ignore[0];
|
1133
1394
|
}
|
1134
1395
|
std::vector<double> Xcpp;
|
1135
|
-
double *arr_num_C = set_R_nan_as_C_nan(
|
1396
|
+
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1136
1397
|
|
1137
1398
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
1138
1399
|
found_outliers = fit_outliers_models(*model_outputs,
|
1139
1400
|
arr_num_C, ncols_numeric,
|
1140
|
-
|
1141
|
-
|
1401
|
+
INTEGER(arr_cat), ncols_categ, INTEGER(ncat),
|
1402
|
+
INTEGER(arr_ord), ncols_ord, INTEGER(ncat_ord),
|
1142
1403
|
nrows, cols_ignore_ptr, nthreads,
|
1143
1404
|
categ_as_bin, ord_as_bin, cat_bruteforce_subset, categ_from_maj, take_mid,
|
1144
1405
|
max_depth, max_perc_outliers, min_size_numeric, min_size_categ,
|
1145
1406
|
min_gain, gain_as_pct, follow_all, z_norm, z_outlier);
|
1146
1407
|
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1408
|
+
args_extract_outl_bounds temp = {
|
1409
|
+
model_outputs.get(),
|
1410
|
+
&cat_levels,
|
1411
|
+
&ord_levels,
|
1412
|
+
&min_date,
|
1413
|
+
&min_ts
|
1414
|
+
};
|
1415
|
+
outp["bounds"] = Rcpp::unwindProtect(extract_outl_bounds_wrapper, (void*)&temp);
|
1152
1416
|
|
1153
|
-
outp["serialized_obj"] = serialize_OutlierTree(model_outputs.get());
|
1154
1417
|
if (return_outliers) {
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1418
|
+
args_describe_outliers temp = {
|
1419
|
+
model_outputs.get(),
|
1420
|
+
arr_num_C,
|
1421
|
+
INTEGER(arr_cat),
|
1422
|
+
INTEGER(arr_ord),
|
1423
|
+
&cat_levels,
|
1424
|
+
&ord_levels,
|
1425
|
+
&colnames_num,
|
1426
|
+
&colnames_cat,
|
1427
|
+
&colnames_ord,
|
1428
|
+
&min_date,
|
1429
|
+
&min_ts
|
1430
|
+
};
|
1431
|
+
outp["outliers_info"] = Rcpp::unwindProtect(describe_outliers_wrapper, (void*)&temp);
|
1166
1432
|
}
|
1433
|
+
forget_row_outputs(*model_outputs);
|
1434
|
+
|
1167
1435
|
/* add number of trees and clusters */
|
1168
1436
|
size_t ntrees = 0, nclust = 0;
|
1169
1437
|
for (size_t col = 0; col < model_outputs->all_trees.size(); col++) {
|
1170
1438
|
ntrees += model_outputs->all_trees[col].size();
|
1171
1439
|
nclust += model_outputs->all_clusters[col].size();
|
1172
1440
|
}
|
1173
|
-
|
1174
|
-
|
1175
|
-
outp["
|
1441
|
+
int ntrees_int = (int)ntrees;
|
1442
|
+
int nclust_int = (int)nclust;
|
1443
|
+
outp["ntrees"] = Rcpp::unwindProtect(safe_int, (void*)&ntrees_int);
|
1444
|
+
outp["nclust"] = Rcpp::unwindProtect(safe_int, (void*)&nclust_int);
|
1445
|
+
outp["found_outliers"] = Rcpp::unwindProtect(safe_bool, (void*)&found_outliers);
|
1176
1446
|
|
1177
|
-
|
1178
|
-
outp["ptr_model"] = Rcpp::XPtr<ModelOutputs>(model_outputs.release(), true);
|
1447
|
+
outp["ptr_model"] = Rcpp::unwindProtect(get_altrepped_pointer, &model_outputs);
|
1179
1448
|
return outp;
|
1180
1449
|
}
|
1181
1450
|
|
1182
|
-
// [[Rcpp::export]]
|
1451
|
+
// [[Rcpp::export(rng = false)]]
|
1183
1452
|
Rcpp::List predict_OutlierTree(SEXP ptr_model, size_t nrows, int nthreads,
|
1184
1453
|
Rcpp::NumericVector arr_num, Rcpp::IntegerVector arr_cat, Rcpp::IntegerVector arr_ord,
|
1185
1454
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
@@ -1190,36 +1459,59 @@ Rcpp::List predict_OutlierTree(SEXP ptr_model, size_t nrows, int nthreads,
|
|
1190
1459
|
Rcpp::NumericVector min_date,
|
1191
1460
|
Rcpp::NumericVector min_ts)
|
1192
1461
|
{
|
1193
|
-
|
1194
|
-
double *arr_num_C = set_R_nan_as_C_nan(
|
1462
|
+
Rcpp::NumericVector Xcpp;
|
1463
|
+
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1195
1464
|
|
1196
1465
|
ModelOutputs *model_outputs = static_cast<ModelOutputs*>(R_ExternalPtrAddr(ptr_model));
|
1197
|
-
bool found_outliers = find_new_outliers(
|
1466
|
+
bool found_outliers = find_new_outliers(arr_num_C, INTEGER(arr_cat), INTEGER(arr_ord),
|
1198
1467
|
nrows, nthreads, *model_outputs);
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1468
|
+
args_describe_outliers temp = {
|
1469
|
+
model_outputs,
|
1470
|
+
arr_num_C,
|
1471
|
+
INTEGER(arr_cat),
|
1472
|
+
INTEGER(arr_ord),
|
1473
|
+
&cat_levels,
|
1474
|
+
&ord_levels,
|
1475
|
+
&colnames_num,
|
1476
|
+
&colnames_cat,
|
1477
|
+
&colnames_ord,
|
1478
|
+
&min_date,
|
1479
|
+
&min_ts
|
1480
|
+
};
|
1481
|
+
|
1482
|
+
Rcpp::List outp;
|
1483
|
+
try {
|
1484
|
+
outp = Rcpp::unwindProtect(describe_outliers_wrapper, (void*)&temp);
|
1485
|
+
} catch(...) {
|
1486
|
+
forget_row_outputs(*model_outputs);
|
1487
|
+
throw;
|
1488
|
+
}
|
1211
1489
|
forget_row_outputs(*model_outputs);
|
1490
|
+
outp["found_outliers"] = Rcpp::LogicalVector(found_outliers);
|
1212
1491
|
return outp;
|
1213
1492
|
}
|
1214
1493
|
|
1215
|
-
// [[Rcpp::export]]
|
1494
|
+
// [[Rcpp::export(rng = false)]]
|
1216
1495
|
Rcpp::LogicalVector check_few_values(Rcpp::NumericVector arr_num, size_t nrows, size_t ncols, int nthreads)
|
1217
1496
|
{
|
1218
|
-
std::vector<char> too_few_vals(ncols, 0);
|
1219
|
-
check_more_two_values(&arr_num[0], nrows, ncols, nthreads, too_few_vals.data());
|
1220
1497
|
Rcpp::LogicalVector outp(ncols);
|
1498
|
+
std::vector<char> too_few_vals(ncols, 0);
|
1499
|
+
check_more_two_values(REAL(arr_num), nrows, ncols, nthreads, too_few_vals.data());
|
1221
1500
|
for (size_t col = 0; col < ncols; col++) {
|
1222
1501
|
outp[col] = (bool) too_few_vals[col];
|
1223
1502
|
}
|
1224
1503
|
return outp;
|
1225
1504
|
}
|
1505
|
+
|
1506
|
+
|
1507
|
+
// [[Rcpp::export(rng = false)]]
|
1508
|
+
bool R_has_openmp()
|
1509
|
+
{
|
1510
|
+
#ifdef _OPENMP
|
1511
|
+
return true;
|
1512
|
+
#else
|
1513
|
+
return false;
|
1514
|
+
#endif
|
1515
|
+
}
|
1516
|
+
|
1517
|
+
#endif /* _FOR_R */
|
@@ -74,7 +74,7 @@
|
|
74
74
|
*/
|
75
75
|
void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
76
76
|
long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
|
77
|
-
double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
77
|
+
double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
78
78
|
double *next_most_comm)
|
79
79
|
{
|
80
80
|
//TODO: must also establish bounds for new, unseen categories
|
@@ -90,7 +90,7 @@ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, d
|
|
90
90
|
size_t size_tail = 0;
|
91
91
|
|
92
92
|
/* reset the temporary arrays and fill them */
|
93
|
-
memset(is_outlier, 0, ncateg * sizeof(char));
|
93
|
+
memset(is_outlier, 0, ncateg * sizeof(signed char));
|
94
94
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
95
95
|
buffer_ix[cat] = cat;
|
96
96
|
buffer_perc[cat] = (categ_counts[cat] > 0)? ((long double)categ_counts[cat] / tot_dbl) : 0;
|
@@ -225,13 +225,13 @@ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, d
|
|
225
225
|
* Category to which the majority of the observations belong.
|
226
226
|
*/
|
227
227
|
void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
228
|
-
long double prior_prob[], double z_outlier, char is_outlier[],
|
228
|
+
long double prior_prob[], double z_outlier, signed char is_outlier[],
|
229
229
|
bool *found_outliers, bool *new_is_outlier, int *categ_maj)
|
230
230
|
{
|
231
231
|
/* initialize parameters as needed */
|
232
232
|
*found_outliers = false;
|
233
233
|
*new_is_outlier = false;
|
234
|
-
memset(is_outlier, 0, ncateg * sizeof(char));
|
234
|
+
memset(is_outlier, 0, ncateg * sizeof(signed char));
|
235
235
|
size_t max_outliers = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
|
236
236
|
long double tot_dbl = (long double) (tot + 1);
|
237
237
|
size_t n_non_maj;
|
@@ -283,7 +283,7 @@ void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t
|
|
283
283
|
* Proportion of the least common non-outlier category.
|
284
284
|
*/
|
285
285
|
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
|
286
|
-
char is_outlier[], double *next_most_comm)
|
286
|
+
signed char is_outlier[], double *next_most_comm)
|
287
287
|
{
|
288
288
|
/* if sample is too small, don't flag any as outliers */
|
289
289
|
if (tot < 1000) return false;
|
@@ -296,7 +296,7 @@ bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_
|
|
296
296
|
|
297
297
|
/* look if there's any category meeting the first condition and none meeting the second one */
|
298
298
|
bool has_outlier_cat = false;
|
299
|
-
memset(is_outlier, 0, sizeof(char) * ncateg);
|
299
|
+
memset(is_outlier, 0, sizeof(signed char) * ncateg);
|
300
300
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
301
301
|
if (categ_counts[cat] > max_outliers && categ_counts[cat] < max_next_most_comm) {
|
302
302
|
has_outlier_cat = false;
|