outliertree 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -1,5 +1,9 @@
|
|
1
|
+
#ifdef _FOR_R
|
2
|
+
|
1
3
|
#include <Rcpp.h>
|
4
|
+
#include <Rcpp/unwindProtect.h>
|
2
5
|
// [[Rcpp::plugins(cpp11)]]
|
6
|
+
// [[Rcpp::plugins(unwindProtect)]]
|
3
7
|
|
4
8
|
/* This is to serialize the model objects */
|
5
9
|
// [[Rcpp::depends(Rcereal)]]
|
@@ -7,14 +11,22 @@
|
|
7
11
|
#include <cereal/types/vector.hpp>
|
8
12
|
#include <sstream>
|
9
13
|
#include <string>
|
14
|
+
#include <limits>
|
10
15
|
|
11
16
|
/* This is the package's header */
|
12
17
|
#include "outlier_tree.hpp"
|
13
18
|
|
19
|
+
/* Allocates an R raw vector whose length is the 'size_t' value that 'data'
   points to. The 'SEXP (void*)' signature lets it be handed to
   'Rcpp::unwindProtect' as a callback (see 'serialize_OutlierTree').
   Raises an R error through 'Rcpp::stop' when the requested length does not
   fit in 'R_xlen_t'. */
SEXP alloc_RawVec(void *data)
{
    const size_t n_bytes = *static_cast<size_t*>(data);
    const size_t largest_allowed = (size_t)std::numeric_limits<R_xlen_t>::max();
    if (n_bytes > largest_allowed) {
        Rcpp::stop("Resulting model is too large for R to handle.");
    }
    return Rcpp::RawVector((R_xlen_t)n_bytes);
}
|
26
|
+
|
14
27
|
/* for model serialization and re-usage in R */
|
15
28
|
/* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
|
16
29
|
/* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
|
17
|
-
#include <Rinternals.h>
|
18
30
|
Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
19
31
|
{
|
20
32
|
std::stringstream ss;
|
@@ -23,27 +35,60 @@ Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
|
23
35
|
oarchive(*model_outputs);
|
24
36
|
}
|
25
37
|
ss.seekg(0, ss.end);
|
26
|
-
|
38
|
+
std::stringstream::pos_type vec_size = ss.tellg();
|
39
|
+
if (vec_size <= 0) {
|
40
|
+
Rcpp::Rcerr << "Error: model is too big to serialize, resulting object will not be usable.\n" << std::endl;
|
41
|
+
return Rcpp::RawVector();
|
42
|
+
}
|
43
|
+
size_t vec_size_ = (size_t)vec_size;
|
44
|
+
Rcpp::RawVector retval = Rcpp::unwindProtect(alloc_RawVec, (void*)&vec_size_);
|
45
|
+
if (!retval.size())
|
46
|
+
return retval;
|
27
47
|
ss.seekg(0, ss.beg);
|
28
|
-
ss.read(reinterpret_cast<char*>(
|
48
|
+
ss.read(reinterpret_cast<char*>(RAW(retval)), retval.size());
|
29
49
|
return retval;
|
30
50
|
}
|
31
51
|
|
32
|
-
|
33
|
-
|
52
|
+
/* Wraps a raw 'ModelOutputs' pointer into an 'Rcpp::XPtr' and returns it as a
   'SEXP'. Takes and returns opaque types so that it can be used as the
   callback of 'Rcpp::unwindProtect'. The second constructor argument is
   passed as 'true' (NOTE(review): per the Rcpp docs this requests the default
   delete finalizer - confirm against the Rcpp version in use). */
SEXP safe_XPtr(void *model_ptr)
{
    ModelOutputs *typed_ptr = static_cast<ModelOutputs*>(model_ptr);
    return Rcpp::XPtr<ModelOutputs>(typed_ptr, true);
}
|
56
|
+
|
57
|
+
/* Finalizer for the external pointer holding a 'ModelOutputs' object:
   deletes the object that the pointer addresses and then clears the pointer.
   Registered through 'R_RegisterCFinalizerEx' in 'deserialize_OutlierTree'. */
void R_delete_model(SEXP R_ptr)
{
    void *raw_addr = R_ExternalPtrAddr(R_ptr);
    delete static_cast<ModelOutputs*>(raw_addr);
    R_ClearExternalPtr(R_ptr);
}
|
63
|
+
|
64
|
+
// [[Rcpp::export(rng = false)]]
|
65
|
+
/* Rebuilds a 'ModelOutputs' object from its serialized bytes and attaches it
   to an existing external pointer.
   - src: raw vector with the bytes to read through a cereal binary archive.
   - ptr_obj: external pointer object whose address is set to the new model;
     'R_delete_model' is registered on it as finalizer.
   Always returns 'R_NilValue'; the result is delivered through 'ptr_obj'. */
SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj)
{
    /* copy the bytes into a stream that cereal can read from */
    std::stringstream byte_stream;
    byte_stream.write(reinterpret_cast<char*>(RAW(src)), src.size());
    byte_stream.seekg(0, byte_stream.beg);

    std::unique_ptr<ModelOutputs> restored(new ModelOutputs());
    {
        cereal::BinaryInputArchive reader(byte_stream);
        reader(*restored);
    }

    /* ownership is given up only after the finalizer has been registered */
    R_SetExternalPtrAddr(ptr_obj, restored.get());
    R_RegisterCFinalizerEx(ptr_obj, R_delete_model, TRUE);
    restored.release();
    return R_NilValue;
}
|
45
80
|
|
46
|
-
|
81
|
+
/* Converts the 'int' that 'x' points to into an R object via 'Rcpp::wrap'.
   Has the 'SEXP (void*)' shape required to be used as the callback of
   'Rcpp::unwindProtect'. */
SEXP safe_int(void *x)
{
    const int value = *static_cast<int*>(x);
    return Rcpp::wrap(value);
}
|
85
|
+
|
86
|
+
/* Converts the 'bool' that 'x' points to into an R object via 'Rcpp::wrap'.
   Has the 'SEXP (void*)' shape required to be used as the callback of
   'Rcpp::unwindProtect'. */
SEXP safe_bool(void *x)
{
    const bool flag = *static_cast<bool*>(x);
    return Rcpp::wrap(flag);
}
|
90
|
+
|
91
|
+
// [[Rcpp::export(rng = false)]]
|
47
92
|
Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
|
48
93
|
{
|
49
94
|
return Rcpp::LogicalVector(R_ExternalPtrAddr(ptr_model) == NULL);
|
@@ -59,12 +104,22 @@ double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_
|
|
59
104
|
return x_C.data();
|
60
105
|
}
|
61
106
|
|
107
|
+
double* set_R_nan_as_C_nan(double *restrict x_R, Rcpp::NumericVector &x_C, size_t n, int nthreads)
|
108
|
+
{
|
109
|
+
x_C = Rcpp::NumericVector(x_R, x_R + n);
|
110
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(x_R, x_C, n)
|
111
|
+
for (size_t_for i = 0; i < n; i++)
|
112
|
+
if (isnan(x_R[i]) || Rcpp::NumericVector::is_na(x_R[i]) || Rcpp::traits::is_nan<REALSXP>(x_R[i]))
|
113
|
+
x_C[i] = NAN;
|
114
|
+
return REAL(x_C);
|
115
|
+
}
|
116
|
+
|
62
117
|
|
63
118
|
/* for predicting outliers */
|
64
119
|
Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
65
|
-
double *arr_num,
|
66
|
-
int *arr_cat,
|
67
|
-
int *arr_ord,
|
120
|
+
double *restrict arr_num,
|
121
|
+
int *restrict arr_cat,
|
122
|
+
int *restrict arr_ord,
|
68
123
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
69
124
|
Rcpp::ListOf<Rcpp::StringVector> ord_levels,
|
70
125
|
Rcpp::StringVector colnames_num,
|
@@ -345,6 +400,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
345
400
|
}
|
346
401
|
break;
|
347
402
|
}
|
403
|
+
|
404
|
+
default:
|
405
|
+
{
|
406
|
+
assert(0);
|
407
|
+
break;
|
408
|
+
}
|
348
409
|
}
|
349
410
|
|
350
411
|
/* add the comparison point */
|
@@ -377,6 +438,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
377
438
|
cond_clust["value_comp"] = Rcpp::as<Rcpp::CharacterVector>(NA_STRING);
|
378
439
|
break;
|
379
440
|
}
|
441
|
+
|
442
|
+
default:
|
443
|
+
{
|
444
|
+
unexpected_error();
|
445
|
+
}
|
380
446
|
}
|
381
447
|
break;
|
382
448
|
}
|
@@ -492,6 +558,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
492
558
|
}
|
493
559
|
break;
|
494
560
|
}
|
561
|
+
|
562
|
+
default:
|
563
|
+
{
|
564
|
+
assert(0);
|
565
|
+
break;
|
566
|
+
}
|
495
567
|
|
496
568
|
}
|
497
569
|
lst_cond[row] = Rcpp::List::create(Rcpp::clone(cond_clust));
|
@@ -528,6 +600,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
528
600
|
cond_clust["column"] = Rcpp::as<Rcpp::CharacterVector>(colnames_ord[model_outputs.all_trees[outl_col][curr_tree].col_num]);
|
529
601
|
break;
|
530
602
|
}
|
603
|
+
|
604
|
+
default:
|
605
|
+
{
|
606
|
+
assert(0);
|
607
|
+
break;
|
608
|
+
}
|
531
609
|
}
|
532
610
|
|
533
611
|
/* add conditions from tree */
|
@@ -599,6 +677,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
599
677
|
break;
|
600
678
|
}
|
601
679
|
|
680
|
+
default: {}
|
602
681
|
}
|
603
682
|
break;
|
604
683
|
}
|
@@ -696,6 +775,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
696
775
|
break;
|
697
776
|
}
|
698
777
|
|
778
|
+
default: {}
|
699
779
|
}
|
700
780
|
break;
|
701
781
|
}
|
@@ -758,10 +838,16 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
758
838
|
break;
|
759
839
|
}
|
760
840
|
|
841
|
+
default: {}
|
761
842
|
}
|
762
843
|
break;
|
763
844
|
}
|
764
845
|
|
846
|
+
default:
|
847
|
+
{
|
848
|
+
assert(0);
|
849
|
+
break;
|
850
|
+
}
|
765
851
|
}
|
766
852
|
}
|
767
853
|
|
@@ -796,6 +882,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
796
882
|
cond_clust["column"] = Rcpp::as<Rcpp::CharacterVector>(colnames_ord[model_outputs.all_trees[outl_col][parent_tree].col_num]);
|
797
883
|
break;
|
798
884
|
}
|
885
|
+
|
886
|
+
default:
|
887
|
+
{
|
888
|
+
assert(0);
|
889
|
+
break;
|
890
|
+
}
|
799
891
|
}
|
800
892
|
|
801
893
|
|
@@ -835,6 +927,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
835
927
|
cond_clust["value_comp"] = Rcpp::as<Rcpp::CharacterVector>(NA_STRING);
|
836
928
|
break;
|
837
929
|
}
|
930
|
+
|
931
|
+
default:
|
932
|
+
{
|
933
|
+
unexpected_error();
|
934
|
+
}
|
838
935
|
}
|
839
936
|
break;
|
840
937
|
}
|
@@ -1011,6 +1108,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
1011
1108
|
break;
|
1012
1109
|
}
|
1013
1110
|
|
1111
|
+
default:
|
1112
|
+
{
|
1113
|
+
assert(0);
|
1114
|
+
break;
|
1115
|
+
}
|
1014
1116
|
}
|
1015
1117
|
|
1016
1118
|
|
@@ -1038,6 +1140,37 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
1038
1140
|
return outp;
|
1039
1141
|
}
|
1040
1142
|
|
1143
|
+
struct args_describe_outliers {
|
1144
|
+
ModelOutputs *model_outputs;
|
1145
|
+
double *arr_num;
|
1146
|
+
int *arr_cat;
|
1147
|
+
int *arr_ord;
|
1148
|
+
Rcpp::ListOf<Rcpp::StringVector> *cat_levels;
|
1149
|
+
Rcpp::ListOf<Rcpp::StringVector> *ord_levels;
|
1150
|
+
Rcpp::StringVector *colnames_num;
|
1151
|
+
Rcpp::StringVector *colnames_cat;
|
1152
|
+
Rcpp::StringVector *colnames_ord;
|
1153
|
+
Rcpp::NumericVector *min_date;
|
1154
|
+
Rcpp::NumericVector *min_ts;
|
1155
|
+
};
|
1156
|
+
|
1157
|
+
/* Adapter with the 'SEXP (void*)' signature expected by 'Rcpp::unwindProtect':
   unpacks an 'args_describe_outliers' bundle and forwards its contents, in
   order, to 'describe_outliers', returning that function's result. */
SEXP describe_outliers_wrapper(void *args_)
{
    args_describe_outliers &bundle = *static_cast<args_describe_outliers*>(args_);
    return describe_outliers(
        *bundle.model_outputs,
        bundle.arr_num,
        bundle.arr_cat,
        bundle.arr_ord,
        *bundle.cat_levels,
        *bundle.ord_levels,
        *bundle.colnames_num,
        *bundle.colnames_cat,
        *bundle.colnames_ord,
        *bundle.min_date,
        *bundle.min_ts
    );
}
|
1172
|
+
|
1173
|
+
|
1041
1174
|
/* for extracting info about flaggable outliers */
|
1042
1175
|
Rcpp::List extract_outl_bounds(ModelOutputs &model_outputs,
|
1043
1176
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
@@ -1102,9 +1235,27 @@ Rcpp::List extract_outl_bounds(ModelOutputs &model_outputs,
|
|
1102
1235
|
return outp;
|
1103
1236
|
}
|
1104
1237
|
|
1238
|
+
struct args_extract_outl_bounds {
|
1239
|
+
ModelOutputs *model_outputs;
|
1240
|
+
Rcpp::ListOf<Rcpp::StringVector> *cat_levels;
|
1241
|
+
Rcpp::ListOf<Rcpp::StringVector> *ord_levels;
|
1242
|
+
Rcpp::NumericVector *min_date;
|
1243
|
+
Rcpp::NumericVector *min_ts;
|
1244
|
+
};
|
1245
|
+
|
1246
|
+
/* Adapter with the 'SEXP (void*)' signature expected by 'Rcpp::unwindProtect':
   unpacks an 'args_extract_outl_bounds' bundle and forwards its contents, in
   order, to 'extract_outl_bounds', returning that function's result. */
SEXP extract_outl_bounds_wrapper(void *args_)
{
    args_extract_outl_bounds &bundle = *static_cast<args_extract_outl_bounds*>(args_);
    return extract_outl_bounds(
        *bundle.model_outputs,
        *bundle.cat_levels,
        *bundle.ord_levels,
        *bundle.min_date,
        *bundle.min_ts
    );
}
|
1255
|
+
|
1105
1256
|
|
1106
1257
|
/* external functions for fitting the model and predicting outliers */
|
1107
|
-
// [[Rcpp::export]]
|
1258
|
+
// [[Rcpp::export(rng = false)]]
|
1108
1259
|
Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
1109
1260
|
Rcpp::IntegerVector arr_cat, size_t ncols_categ, Rcpp::IntegerVector ncat,
|
1110
1261
|
Rcpp::IntegerVector arr_ord, size_t ncols_ord, Rcpp::IntegerVector ncat_ord,
|
@@ -1121,8 +1272,17 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1121
1272
|
Rcpp::NumericVector min_date,
|
1122
1273
|
Rcpp::NumericVector min_ts)
|
1123
1274
|
{
|
1275
|
+
Rcpp::List outp = Rcpp::List::create(
|
1276
|
+
Rcpp::_["ptr_model"] = R_NilValue,
|
1277
|
+
Rcpp::_["serialized_obj"] = R_NilValue,
|
1278
|
+
Rcpp::_["bounds"] = R_NilValue,
|
1279
|
+
Rcpp::_["outliers_info"] = R_NilValue,
|
1280
|
+
Rcpp::_["ntrees"] = R_NilValue,
|
1281
|
+
Rcpp::_["nclust"] = R_NilValue,
|
1282
|
+
Rcpp::_["found_outliers"] = R_NilValue
|
1283
|
+
);
|
1284
|
+
|
1124
1285
|
bool found_outliers;
|
1125
|
-
Rcpp::List outp;
|
1126
1286
|
size_t tot_cols = ncols_numeric + ncols_categ + ncols_ord;
|
1127
1287
|
std::vector<char> cols_ignore;
|
1128
1288
|
char *cols_ignore_ptr = NULL;
|
@@ -1132,54 +1292,70 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1132
1292
|
cols_ignore_ptr = &cols_ignore[0];
|
1133
1293
|
}
|
1134
1294
|
std::vector<double> Xcpp;
|
1135
|
-
double *arr_num_C = set_R_nan_as_C_nan(
|
1295
|
+
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1136
1296
|
|
1137
1297
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
1298
|
+
try {
|
1138
1299
|
found_outliers = fit_outliers_models(*model_outputs,
|
1139
1300
|
arr_num_C, ncols_numeric,
|
1140
|
-
|
1141
|
-
|
1301
|
+
INTEGER(arr_cat), ncols_categ, INTEGER(ncat),
|
1302
|
+
INTEGER(arr_ord), ncols_ord, INTEGER(ncat_ord),
|
1142
1303
|
nrows, cols_ignore_ptr, nthreads,
|
1143
1304
|
categ_as_bin, ord_as_bin, cat_bruteforce_subset, categ_from_maj, take_mid,
|
1144
1305
|
max_depth, max_perc_outliers, min_size_numeric, min_size_categ,
|
1145
1306
|
min_gain, gain_as_pct, follow_all, z_norm, z_outlier);
|
1146
1307
|
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1308
|
+
args_extract_outl_bounds temp = {
|
1309
|
+
model_outputs.get(),
|
1310
|
+
&cat_levels,
|
1311
|
+
&ord_levels,
|
1312
|
+
&min_date,
|
1313
|
+
&min_ts
|
1314
|
+
};
|
1315
|
+
outp["bounds"] = Rcpp::unwindProtect(extract_outl_bounds_wrapper, (void*)&temp);
|
1153
1316
|
outp["serialized_obj"] = serialize_OutlierTree(model_outputs.get());
|
1317
|
+
} catch(std::bad_alloc &e) {
|
1318
|
+
Rcpp::stop("Insufficient memory.\n");
|
1319
|
+
}
|
1320
|
+
|
1321
|
+
if (!Rf_xlength(outp["serialized_obj"]))
|
1322
|
+
return outp;
|
1154
1323
|
if (return_outliers) {
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1324
|
+
args_describe_outliers temp = {
|
1325
|
+
model_outputs.get(),
|
1326
|
+
arr_num_C,
|
1327
|
+
INTEGER(arr_cat),
|
1328
|
+
INTEGER(arr_ord),
|
1329
|
+
&cat_levels,
|
1330
|
+
&ord_levels,
|
1331
|
+
&colnames_num,
|
1332
|
+
&colnames_cat,
|
1333
|
+
&colnames_ord,
|
1334
|
+
&min_date,
|
1335
|
+
&min_ts
|
1336
|
+
};
|
1337
|
+
outp["outliers_info"] = Rcpp::unwindProtect(describe_outliers_wrapper, (void*)&temp);
|
1166
1338
|
}
|
1339
|
+
forget_row_outputs(*model_outputs);
|
1340
|
+
|
1167
1341
|
/* add number of trees and clusters */
|
1168
1342
|
size_t ntrees = 0, nclust = 0;
|
1169
1343
|
for (size_t col = 0; col < model_outputs->all_trees.size(); col++) {
|
1170
1344
|
ntrees += model_outputs->all_trees[col].size();
|
1171
1345
|
nclust += model_outputs->all_clusters[col].size();
|
1172
1346
|
}
|
1173
|
-
|
1174
|
-
|
1175
|
-
outp["
|
1347
|
+
int ntrees_int = (int)ntrees;
|
1348
|
+
int nclust_int = (int)nclust;
|
1349
|
+
outp["ntrees"] = Rcpp::unwindProtect(safe_int, (void*)&ntrees_int);
|
1350
|
+
outp["nclust"] = Rcpp::unwindProtect(safe_int, (void*)&nclust_int);
|
1351
|
+
outp["found_outliers"] = Rcpp::unwindProtect(safe_bool, (void*)&found_outliers);
|
1176
1352
|
|
1177
|
-
|
1178
|
-
|
1353
|
+
outp["ptr_model"] = Rcpp::unwindProtect(safe_XPtr, model_outputs.get());
|
1354
|
+
model_outputs.release();
|
1179
1355
|
return outp;
|
1180
1356
|
}
|
1181
1357
|
|
1182
|
-
// [[Rcpp::export]]
|
1358
|
+
// [[Rcpp::export(rng = false)]]
|
1183
1359
|
Rcpp::List predict_OutlierTree(SEXP ptr_model, size_t nrows, int nthreads,
|
1184
1360
|
Rcpp::NumericVector arr_num, Rcpp::IntegerVector arr_cat, Rcpp::IntegerVector arr_ord,
|
1185
1361
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
@@ -1190,36 +1366,59 @@ Rcpp::List predict_OutlierTree(SEXP ptr_model, size_t nrows, int nthreads,
|
|
1190
1366
|
Rcpp::NumericVector min_date,
|
1191
1367
|
Rcpp::NumericVector min_ts)
|
1192
1368
|
{
|
1193
|
-
|
1194
|
-
double *arr_num_C = set_R_nan_as_C_nan(
|
1369
|
+
Rcpp::NumericVector Xcpp;
|
1370
|
+
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1195
1371
|
|
1196
1372
|
ModelOutputs *model_outputs = static_cast<ModelOutputs*>(R_ExternalPtrAddr(ptr_model));
|
1197
|
-
bool found_outliers = find_new_outliers(
|
1373
|
+
bool found_outliers = find_new_outliers(arr_num_C, INTEGER(arr_cat), INTEGER(arr_ord),
|
1198
1374
|
nrows, nthreads, *model_outputs);
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1375
|
+
args_describe_outliers temp = {
|
1376
|
+
model_outputs,
|
1377
|
+
arr_num_C,
|
1378
|
+
INTEGER(arr_cat),
|
1379
|
+
INTEGER(arr_ord),
|
1380
|
+
&cat_levels,
|
1381
|
+
&ord_levels,
|
1382
|
+
&colnames_num,
|
1383
|
+
&colnames_cat,
|
1384
|
+
&colnames_ord,
|
1385
|
+
&min_date,
|
1386
|
+
&min_ts
|
1387
|
+
};
|
1388
|
+
|
1389
|
+
Rcpp::List outp;
|
1390
|
+
try {
|
1391
|
+
outp = Rcpp::unwindProtect(describe_outliers_wrapper, (void*)&temp);
|
1392
|
+
} catch(...) {
|
1393
|
+
forget_row_outputs(*model_outputs);
|
1394
|
+
throw;
|
1395
|
+
}
|
1211
1396
|
forget_row_outputs(*model_outputs);
|
1397
|
+
outp["found_outliers"] = Rcpp::LogicalVector(found_outliers);
|
1212
1398
|
return outp;
|
1213
1399
|
}
|
1214
1400
|
|
1215
|
-
// [[Rcpp::export]]
|
1401
|
+
// [[Rcpp::export(rng = false)]]
|
1216
1402
|
/* Runs 'check_more_two_values' over the numeric array and returns its
   per-column flags as an R logical vector of length 'ncols'.
   The R vector is allocated before the C++ buffer is created. */
Rcpp::LogicalVector check_few_values(Rcpp::NumericVector arr_num, size_t nrows, size_t ncols, int nthreads)
{
    Rcpp::LogicalVector outp(ncols);

    /* byte buffer that the checker fills in, one entry per column */
    std::vector<char> col_flags(ncols, 0);
    check_more_two_values(REAL(arr_num), nrows, ncols, nthreads, col_flags.data());

    for (size_t ix = 0; ix < ncols; ++ix)
        outp[ix] = (col_flags[ix] != 0);
    return outp;
}
|
1412
|
+
|
1413
|
+
|
1414
|
+
// [[Rcpp::export(rng = false)]]
|
1415
|
+
/* Reports whether this translation unit was compiled with OpenMP enabled,
   i.e. whether the compiler defined the '_OPENMP' macro. */
bool R_has_openmp()
{
    #ifdef _OPENMP
    const bool built_with_openmp = true;
    #else
    const bool built_with_openmp = false;
    #endif
    return built_with_openmp;
}
|
1423
|
+
|
1424
|
+
#endif /* _FOR_R */
|
@@ -74,7 +74,7 @@
|
|
74
74
|
*/
|
75
75
|
void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
76
76
|
long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
|
77
|
-
double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
77
|
+
double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
78
78
|
double *next_most_comm)
|
79
79
|
{
|
80
80
|
//TODO: must also establish bounds for new, unseen categories
|
@@ -90,7 +90,7 @@ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, d
|
|
90
90
|
size_t size_tail = 0;
|
91
91
|
|
92
92
|
/* reset the temporary arrays and fill them */
|
93
|
-
memset(is_outlier, 0, ncateg * sizeof(char));
|
93
|
+
memset(is_outlier, 0, ncateg * sizeof(signed char));
|
94
94
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
95
95
|
buffer_ix[cat] = cat;
|
96
96
|
buffer_perc[cat] = (categ_counts[cat] > 0)? ((long double)categ_counts[cat] / tot_dbl) : 0;
|
@@ -225,13 +225,13 @@ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, d
|
|
225
225
|
* Category to which the majority of the observations belong.
|
226
226
|
*/
|
227
227
|
void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
228
|
-
long double prior_prob[], double z_outlier, char is_outlier[],
|
228
|
+
long double prior_prob[], double z_outlier, signed char is_outlier[],
|
229
229
|
bool *found_outliers, bool *new_is_outlier, int *categ_maj)
|
230
230
|
{
|
231
231
|
/* initialize parameters as needed */
|
232
232
|
*found_outliers = false;
|
233
233
|
*new_is_outlier = false;
|
234
|
-
memset(is_outlier, 0, ncateg * sizeof(char));
|
234
|
+
memset(is_outlier, 0, ncateg * sizeof(signed char));
|
235
235
|
size_t max_outliers = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
|
236
236
|
long double tot_dbl = (long double) (tot + 1);
|
237
237
|
size_t n_non_maj;
|
@@ -283,7 +283,7 @@ void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t
|
|
283
283
|
* Proportion of the least common non-outlier category.
|
284
284
|
*/
|
285
285
|
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
|
286
|
-
char is_outlier[], double *next_most_comm)
|
286
|
+
signed char is_outlier[], double *next_most_comm)
|
287
287
|
{
|
288
288
|
/* if sample is too small, don't flag any as outliers */
|
289
289
|
if (tot < 1000) return false;
|
@@ -296,7 +296,7 @@ bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_
|
|
296
296
|
|
297
297
|
/* look if there's any category meeting the first condition and none meeting the second one */
|
298
298
|
bool has_outlier_cat = false;
|
299
|
-
memset(is_outlier, 0, sizeof(char) * ncateg);
|
299
|
+
memset(is_outlier, 0, sizeof(signed char) * ncateg);
|
300
300
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
301
301
|
if (categ_counts[cat] > max_outliers && categ_counts[cat] < max_next_most_comm) {
|
302
302
|
has_outlier_cat = false;
|