outliertree 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -1,5 +1,9 @@
|
|
1
|
+
#ifdef _FOR_R
|
2
|
+
|
1
3
|
#include <Rcpp.h>
|
4
|
+
#include <Rcpp/unwindProtect.h>
|
2
5
|
// [[Rcpp::plugins(cpp11)]]
|
6
|
+
// [[Rcpp::plugins(unwindProtect)]]
|
3
7
|
|
4
8
|
/* This is to serialize the model objects */
|
5
9
|
// [[Rcpp::depends(Rcereal)]]
|
@@ -7,14 +11,22 @@
|
|
7
11
|
#include <cereal/types/vector.hpp>
|
8
12
|
#include <sstream>
|
9
13
|
#include <string>
|
14
|
+
#include <limits>
|
10
15
|
|
11
16
|
/* This is the package's header */
|
12
17
|
#include "outlier_tree.hpp"
|
13
18
|
|
19
|
+
SEXP alloc_RawVec(void *data)
|
20
|
+
{
|
21
|
+
size_t vec_size = *(size_t*)data;
|
22
|
+
if (vec_size > (size_t)std::numeric_limits<R_xlen_t>::max())
|
23
|
+
Rcpp::stop("Resulting model is too large for R to handle.");
|
24
|
+
return Rcpp::RawVector((R_xlen_t)vec_size);
|
25
|
+
}
|
26
|
+
|
14
27
|
/* for model serialization and re-usage in R */
|
15
28
|
/* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
|
16
29
|
/* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
|
17
|
-
#include <Rinternals.h>
|
18
30
|
Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
19
31
|
{
|
20
32
|
std::stringstream ss;
|
@@ -23,27 +35,60 @@ Rcpp::RawVector serialize_OutlierTree(ModelOutputs *model_outputs)
|
|
23
35
|
oarchive(*model_outputs);
|
24
36
|
}
|
25
37
|
ss.seekg(0, ss.end);
|
26
|
-
|
38
|
+
std::stringstream::pos_type vec_size = ss.tellg();
|
39
|
+
if (vec_size <= 0) {
|
40
|
+
Rcpp::Rcerr << "Error: model is too big to serialize, resulting object will not be usable.\n" << std::endl;
|
41
|
+
return Rcpp::RawVector();
|
42
|
+
}
|
43
|
+
size_t vec_size_ = (size_t)vec_size;
|
44
|
+
Rcpp::RawVector retval = Rcpp::unwindProtect(alloc_RawVec, (void*)&vec_size_);
|
45
|
+
if (!retval.size())
|
46
|
+
return retval;
|
27
47
|
ss.seekg(0, ss.beg);
|
28
|
-
ss.read(reinterpret_cast<char*>(
|
48
|
+
ss.read(reinterpret_cast<char*>(RAW(retval)), retval.size());
|
29
49
|
return retval;
|
30
50
|
}
|
31
51
|
|
32
|
-
|
33
|
-
|
52
|
+
SEXP safe_XPtr(void *model_ptr)
|
53
|
+
{
|
54
|
+
return Rcpp::XPtr<ModelOutputs>((ModelOutputs*)model_ptr, true);
|
55
|
+
}
|
56
|
+
|
57
|
+
void R_delete_model(SEXP R_ptr)
|
58
|
+
{
|
59
|
+
ModelOutputs *model = static_cast<ModelOutputs*>(R_ExternalPtrAddr(R_ptr));
|
60
|
+
delete model;
|
61
|
+
R_ClearExternalPtr(R_ptr);
|
62
|
+
}
|
63
|
+
|
64
|
+
// [[Rcpp::export(rng = false)]]
|
65
|
+
SEXP deserialize_OutlierTree(Rcpp::RawVector src, SEXP ptr_obj)
|
34
66
|
{
|
35
67
|
std::stringstream ss;
|
36
|
-
ss.write(reinterpret_cast<char*>(
|
68
|
+
ss.write(reinterpret_cast<char*>(RAW(src)), src.size());
|
37
69
|
ss.seekg(0, ss.beg);
|
38
70
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
39
71
|
{
|
40
72
|
cereal::BinaryInputArchive iarchive(ss);
|
41
73
|
iarchive(*model_outputs);
|
42
74
|
}
|
43
|
-
|
75
|
+
R_SetExternalPtrAddr(ptr_obj, model_outputs.get());
|
76
|
+
R_RegisterCFinalizerEx(ptr_obj, R_delete_model, TRUE);
|
77
|
+
model_outputs.release();
|
78
|
+
return R_NilValue;
|
44
79
|
}
|
45
80
|
|
46
|
-
|
81
|
+
SEXP safe_int(void *x)
|
82
|
+
{
|
83
|
+
return Rcpp::wrap(*(int*)x);
|
84
|
+
}
|
85
|
+
|
86
|
+
SEXP safe_bool(void *x)
|
87
|
+
{
|
88
|
+
return Rcpp::wrap(*(bool*)x);
|
89
|
+
}
|
90
|
+
|
91
|
+
// [[Rcpp::export(rng = false)]]
|
47
92
|
Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
|
48
93
|
{
|
49
94
|
return Rcpp::LogicalVector(R_ExternalPtrAddr(ptr_model) == NULL);
|
@@ -59,12 +104,22 @@ double* set_R_nan_as_C_nan(double *restrict x_R, std::vector<double> &x_C, size_
|
|
59
104
|
return x_C.data();
|
60
105
|
}
|
61
106
|
|
107
|
+
double* set_R_nan_as_C_nan(double *restrict x_R, Rcpp::NumericVector &x_C, size_t n, int nthreads)
|
108
|
+
{
|
109
|
+
x_C = Rcpp::NumericVector(x_R, x_R + n);
|
110
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) shared(x_R, x_C, n)
|
111
|
+
for (size_t_for i = 0; i < n; i++)
|
112
|
+
if (isnan(x_R[i]) || Rcpp::NumericVector::is_na(x_R[i]) || Rcpp::traits::is_nan<REALSXP>(x_R[i]))
|
113
|
+
x_C[i] = NAN;
|
114
|
+
return REAL(x_C);
|
115
|
+
}
|
116
|
+
|
62
117
|
|
63
118
|
/* for predicting outliers */
|
64
119
|
Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
65
|
-
double *arr_num,
|
66
|
-
int *arr_cat,
|
67
|
-
int *arr_ord,
|
120
|
+
double *restrict arr_num,
|
121
|
+
int *restrict arr_cat,
|
122
|
+
int *restrict arr_ord,
|
68
123
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
69
124
|
Rcpp::ListOf<Rcpp::StringVector> ord_levels,
|
70
125
|
Rcpp::StringVector colnames_num,
|
@@ -345,6 +400,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
345
400
|
}
|
346
401
|
break;
|
347
402
|
}
|
403
|
+
|
404
|
+
default:
|
405
|
+
{
|
406
|
+
assert(0);
|
407
|
+
break;
|
408
|
+
}
|
348
409
|
}
|
349
410
|
|
350
411
|
/* add the comparison point */
|
@@ -377,6 +438,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
377
438
|
cond_clust["value_comp"] = Rcpp::as<Rcpp::CharacterVector>(NA_STRING);
|
378
439
|
break;
|
379
440
|
}
|
441
|
+
|
442
|
+
default:
|
443
|
+
{
|
444
|
+
unexpected_error();
|
445
|
+
}
|
380
446
|
}
|
381
447
|
break;
|
382
448
|
}
|
@@ -492,6 +558,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
492
558
|
}
|
493
559
|
break;
|
494
560
|
}
|
561
|
+
|
562
|
+
default:
|
563
|
+
{
|
564
|
+
assert(0);
|
565
|
+
break;
|
566
|
+
}
|
495
567
|
|
496
568
|
}
|
497
569
|
lst_cond[row] = Rcpp::List::create(Rcpp::clone(cond_clust));
|
@@ -528,6 +600,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
528
600
|
cond_clust["column"] = Rcpp::as<Rcpp::CharacterVector>(colnames_ord[model_outputs.all_trees[outl_col][curr_tree].col_num]);
|
529
601
|
break;
|
530
602
|
}
|
603
|
+
|
604
|
+
default:
|
605
|
+
{
|
606
|
+
assert(0);
|
607
|
+
break;
|
608
|
+
}
|
531
609
|
}
|
532
610
|
|
533
611
|
/* add conditions from tree */
|
@@ -599,6 +677,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
599
677
|
break;
|
600
678
|
}
|
601
679
|
|
680
|
+
default: {}
|
602
681
|
}
|
603
682
|
break;
|
604
683
|
}
|
@@ -696,6 +775,7 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
696
775
|
break;
|
697
776
|
}
|
698
777
|
|
778
|
+
default: {}
|
699
779
|
}
|
700
780
|
break;
|
701
781
|
}
|
@@ -758,10 +838,16 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
758
838
|
break;
|
759
839
|
}
|
760
840
|
|
841
|
+
default: {}
|
761
842
|
}
|
762
843
|
break;
|
763
844
|
}
|
764
845
|
|
846
|
+
default:
|
847
|
+
{
|
848
|
+
assert(0);
|
849
|
+
break;
|
850
|
+
}
|
765
851
|
}
|
766
852
|
}
|
767
853
|
|
@@ -796,6 +882,12 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
796
882
|
cond_clust["column"] = Rcpp::as<Rcpp::CharacterVector>(colnames_ord[model_outputs.all_trees[outl_col][parent_tree].col_num]);
|
797
883
|
break;
|
798
884
|
}
|
885
|
+
|
886
|
+
default:
|
887
|
+
{
|
888
|
+
assert(0);
|
889
|
+
break;
|
890
|
+
}
|
799
891
|
}
|
800
892
|
|
801
893
|
|
@@ -835,6 +927,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
835
927
|
cond_clust["value_comp"] = Rcpp::as<Rcpp::CharacterVector>(NA_STRING);
|
836
928
|
break;
|
837
929
|
}
|
930
|
+
|
931
|
+
default:
|
932
|
+
{
|
933
|
+
unexpected_error();
|
934
|
+
}
|
838
935
|
}
|
839
936
|
break;
|
840
937
|
}
|
@@ -1011,6 +1108,11 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
1011
1108
|
break;
|
1012
1109
|
}
|
1013
1110
|
|
1111
|
+
default:
|
1112
|
+
{
|
1113
|
+
assert(0);
|
1114
|
+
break;
|
1115
|
+
}
|
1014
1116
|
}
|
1015
1117
|
|
1016
1118
|
|
@@ -1038,6 +1140,37 @@ Rcpp::List describe_outliers(ModelOutputs &model_outputs,
|
|
1038
1140
|
return outp;
|
1039
1141
|
}
|
1040
1142
|
|
1143
|
+
struct args_describe_outliers {
|
1144
|
+
ModelOutputs *model_outputs;
|
1145
|
+
double *arr_num;
|
1146
|
+
int *arr_cat;
|
1147
|
+
int *arr_ord;
|
1148
|
+
Rcpp::ListOf<Rcpp::StringVector> *cat_levels;
|
1149
|
+
Rcpp::ListOf<Rcpp::StringVector> *ord_levels;
|
1150
|
+
Rcpp::StringVector *colnames_num;
|
1151
|
+
Rcpp::StringVector *colnames_cat;
|
1152
|
+
Rcpp::StringVector *colnames_ord;
|
1153
|
+
Rcpp::NumericVector *min_date;
|
1154
|
+
Rcpp::NumericVector *min_ts;
|
1155
|
+
};
|
1156
|
+
|
1157
|
+
SEXP describe_outliers_wrapper(void *args_)
|
1158
|
+
{
|
1159
|
+
args_describe_outliers *args = (args_describe_outliers*)args_;
|
1160
|
+
return describe_outliers(*(args->model_outputs),
|
1161
|
+
args->arr_num,
|
1162
|
+
args->arr_cat,
|
1163
|
+
args->arr_ord,
|
1164
|
+
*(args->cat_levels),
|
1165
|
+
*(args->ord_levels),
|
1166
|
+
*(args->colnames_num),
|
1167
|
+
*(args->colnames_cat),
|
1168
|
+
*(args->colnames_ord),
|
1169
|
+
*(args->min_date),
|
1170
|
+
*(args->min_ts));
|
1171
|
+
}
|
1172
|
+
|
1173
|
+
|
1041
1174
|
/* for extracting info about flaggable outliers */
|
1042
1175
|
Rcpp::List extract_outl_bounds(ModelOutputs &model_outputs,
|
1043
1176
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
@@ -1102,9 +1235,27 @@ Rcpp::List extract_outl_bounds(ModelOutputs &model_outputs,
|
|
1102
1235
|
return outp;
|
1103
1236
|
}
|
1104
1237
|
|
1238
|
+
struct args_extract_outl_bounds {
|
1239
|
+
ModelOutputs *model_outputs;
|
1240
|
+
Rcpp::ListOf<Rcpp::StringVector> *cat_levels;
|
1241
|
+
Rcpp::ListOf<Rcpp::StringVector> *ord_levels;
|
1242
|
+
Rcpp::NumericVector *min_date;
|
1243
|
+
Rcpp::NumericVector *min_ts;
|
1244
|
+
};
|
1245
|
+
|
1246
|
+
SEXP extract_outl_bounds_wrapper(void *args_)
|
1247
|
+
{
|
1248
|
+
args_extract_outl_bounds *args = (args_extract_outl_bounds*)args_;
|
1249
|
+
return extract_outl_bounds(*(args->model_outputs),
|
1250
|
+
*(args->cat_levels),
|
1251
|
+
*(args->ord_levels),
|
1252
|
+
*(args->min_date),
|
1253
|
+
*(args->min_ts));
|
1254
|
+
}
|
1255
|
+
|
1105
1256
|
|
1106
1257
|
/* external functions for fitting the model and predicting outliers */
|
1107
|
-
// [[Rcpp::export]]
|
1258
|
+
// [[Rcpp::export(rng = false)]]
|
1108
1259
|
Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
1109
1260
|
Rcpp::IntegerVector arr_cat, size_t ncols_categ, Rcpp::IntegerVector ncat,
|
1110
1261
|
Rcpp::IntegerVector arr_ord, size_t ncols_ord, Rcpp::IntegerVector ncat_ord,
|
@@ -1121,8 +1272,17 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1121
1272
|
Rcpp::NumericVector min_date,
|
1122
1273
|
Rcpp::NumericVector min_ts)
|
1123
1274
|
{
|
1275
|
+
Rcpp::List outp = Rcpp::List::create(
|
1276
|
+
Rcpp::_["ptr_model"] = R_NilValue,
|
1277
|
+
Rcpp::_["serialized_obj"] = R_NilValue,
|
1278
|
+
Rcpp::_["bounds"] = R_NilValue,
|
1279
|
+
Rcpp::_["outliers_info"] = R_NilValue,
|
1280
|
+
Rcpp::_["ntrees"] = R_NilValue,
|
1281
|
+
Rcpp::_["nclust"] = R_NilValue,
|
1282
|
+
Rcpp::_["found_outliers"] = R_NilValue
|
1283
|
+
);
|
1284
|
+
|
1124
1285
|
bool found_outliers;
|
1125
|
-
Rcpp::List outp;
|
1126
1286
|
size_t tot_cols = ncols_numeric + ncols_categ + ncols_ord;
|
1127
1287
|
std::vector<char> cols_ignore;
|
1128
1288
|
char *cols_ignore_ptr = NULL;
|
@@ -1132,54 +1292,70 @@ Rcpp::List fit_OutlierTree(Rcpp::NumericVector arr_num, size_t ncols_numeric,
|
|
1132
1292
|
cols_ignore_ptr = &cols_ignore[0];
|
1133
1293
|
}
|
1134
1294
|
std::vector<double> Xcpp;
|
1135
|
-
double *arr_num_C = set_R_nan_as_C_nan(
|
1295
|
+
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1136
1296
|
|
1137
1297
|
std::unique_ptr<ModelOutputs> model_outputs = std::unique_ptr<ModelOutputs>(new ModelOutputs());
|
1298
|
+
try {
|
1138
1299
|
found_outliers = fit_outliers_models(*model_outputs,
|
1139
1300
|
arr_num_C, ncols_numeric,
|
1140
|
-
|
1141
|
-
|
1301
|
+
INTEGER(arr_cat), ncols_categ, INTEGER(ncat),
|
1302
|
+
INTEGER(arr_ord), ncols_ord, INTEGER(ncat_ord),
|
1142
1303
|
nrows, cols_ignore_ptr, nthreads,
|
1143
1304
|
categ_as_bin, ord_as_bin, cat_bruteforce_subset, categ_from_maj, take_mid,
|
1144
1305
|
max_depth, max_perc_outliers, min_size_numeric, min_size_categ,
|
1145
1306
|
min_gain, gain_as_pct, follow_all, z_norm, z_outlier);
|
1146
1307
|
|
1147
|
-
|
1148
|
-
|
1149
|
-
|
1150
|
-
|
1151
|
-
|
1152
|
-
|
1308
|
+
args_extract_outl_bounds temp = {
|
1309
|
+
model_outputs.get(),
|
1310
|
+
&cat_levels,
|
1311
|
+
&ord_levels,
|
1312
|
+
&min_date,
|
1313
|
+
&min_ts
|
1314
|
+
};
|
1315
|
+
outp["bounds"] = Rcpp::unwindProtect(extract_outl_bounds_wrapper, (void*)&temp);
|
1153
1316
|
outp["serialized_obj"] = serialize_OutlierTree(model_outputs.get());
|
1317
|
+
} catch(std::bad_alloc &e) {
|
1318
|
+
Rcpp::stop("Insufficient memory.\n");
|
1319
|
+
}
|
1320
|
+
|
1321
|
+
if (!Rf_xlength(outp["serialized_obj"]))
|
1322
|
+
return outp;
|
1154
1323
|
if (return_outliers) {
|
1155
|
-
|
1156
|
-
|
1157
|
-
|
1158
|
-
|
1159
|
-
|
1160
|
-
|
1161
|
-
|
1162
|
-
|
1163
|
-
|
1164
|
-
|
1165
|
-
|
1324
|
+
args_describe_outliers temp = {
|
1325
|
+
model_outputs.get(),
|
1326
|
+
arr_num_C,
|
1327
|
+
INTEGER(arr_cat),
|
1328
|
+
INTEGER(arr_ord),
|
1329
|
+
&cat_levels,
|
1330
|
+
&ord_levels,
|
1331
|
+
&colnames_num,
|
1332
|
+
&colnames_cat,
|
1333
|
+
&colnames_ord,
|
1334
|
+
&min_date,
|
1335
|
+
&min_ts
|
1336
|
+
};
|
1337
|
+
outp["outliers_info"] = Rcpp::unwindProtect(describe_outliers_wrapper, (void*)&temp);
|
1166
1338
|
}
|
1339
|
+
forget_row_outputs(*model_outputs);
|
1340
|
+
|
1167
1341
|
/* add number of trees and clusters */
|
1168
1342
|
size_t ntrees = 0, nclust = 0;
|
1169
1343
|
for (size_t col = 0; col < model_outputs->all_trees.size(); col++) {
|
1170
1344
|
ntrees += model_outputs->all_trees[col].size();
|
1171
1345
|
nclust += model_outputs->all_clusters[col].size();
|
1172
1346
|
}
|
1173
|
-
|
1174
|
-
|
1175
|
-
outp["
|
1347
|
+
int ntrees_int = (int)ntrees;
|
1348
|
+
int nclust_int = (int)nclust;
|
1349
|
+
outp["ntrees"] = Rcpp::unwindProtect(safe_int, (void*)&ntrees_int);
|
1350
|
+
outp["nclust"] = Rcpp::unwindProtect(safe_int, (void*)&nclust_int);
|
1351
|
+
outp["found_outliers"] = Rcpp::unwindProtect(safe_bool, (void*)&found_outliers);
|
1176
1352
|
|
1177
|
-
|
1178
|
-
|
1353
|
+
outp["ptr_model"] = Rcpp::unwindProtect(safe_XPtr, model_outputs.get());
|
1354
|
+
model_outputs.release();
|
1179
1355
|
return outp;
|
1180
1356
|
}
|
1181
1357
|
|
1182
|
-
// [[Rcpp::export]]
|
1358
|
+
// [[Rcpp::export(rng = false)]]
|
1183
1359
|
Rcpp::List predict_OutlierTree(SEXP ptr_model, size_t nrows, int nthreads,
|
1184
1360
|
Rcpp::NumericVector arr_num, Rcpp::IntegerVector arr_cat, Rcpp::IntegerVector arr_ord,
|
1185
1361
|
Rcpp::ListOf<Rcpp::StringVector> cat_levels,
|
@@ -1190,36 +1366,59 @@ Rcpp::List predict_OutlierTree(SEXP ptr_model, size_t nrows, int nthreads,
|
|
1190
1366
|
Rcpp::NumericVector min_date,
|
1191
1367
|
Rcpp::NumericVector min_ts)
|
1192
1368
|
{
|
1193
|
-
|
1194
|
-
double *arr_num_C = set_R_nan_as_C_nan(
|
1369
|
+
Rcpp::NumericVector Xcpp;
|
1370
|
+
double *arr_num_C = set_R_nan_as_C_nan(REAL(arr_num), Xcpp, arr_num.size(), nthreads);
|
1195
1371
|
|
1196
1372
|
ModelOutputs *model_outputs = static_cast<ModelOutputs*>(R_ExternalPtrAddr(ptr_model));
|
1197
|
-
bool found_outliers = find_new_outliers(
|
1373
|
+
bool found_outliers = find_new_outliers(arr_num_C, INTEGER(arr_cat), INTEGER(arr_ord),
|
1198
1374
|
nrows, nthreads, *model_outputs);
|
1199
|
-
|
1200
|
-
|
1201
|
-
|
1202
|
-
|
1203
|
-
|
1204
|
-
|
1205
|
-
|
1206
|
-
|
1207
|
-
|
1208
|
-
|
1209
|
-
|
1210
|
-
|
1375
|
+
args_describe_outliers temp = {
|
1376
|
+
model_outputs,
|
1377
|
+
arr_num_C,
|
1378
|
+
INTEGER(arr_cat),
|
1379
|
+
INTEGER(arr_ord),
|
1380
|
+
&cat_levels,
|
1381
|
+
&ord_levels,
|
1382
|
+
&colnames_num,
|
1383
|
+
&colnames_cat,
|
1384
|
+
&colnames_ord,
|
1385
|
+
&min_date,
|
1386
|
+
&min_ts
|
1387
|
+
};
|
1388
|
+
|
1389
|
+
Rcpp::List outp;
|
1390
|
+
try {
|
1391
|
+
outp = Rcpp::unwindProtect(describe_outliers_wrapper, (void*)&temp);
|
1392
|
+
} catch(...) {
|
1393
|
+
forget_row_outputs(*model_outputs);
|
1394
|
+
throw;
|
1395
|
+
}
|
1211
1396
|
forget_row_outputs(*model_outputs);
|
1397
|
+
outp["found_outliers"] = Rcpp::LogicalVector(found_outliers);
|
1212
1398
|
return outp;
|
1213
1399
|
}
|
1214
1400
|
|
1215
|
-
// [[Rcpp::export]]
|
1401
|
+
// [[Rcpp::export(rng = false)]]
|
1216
1402
|
Rcpp::LogicalVector check_few_values(Rcpp::NumericVector arr_num, size_t nrows, size_t ncols, int nthreads)
|
1217
1403
|
{
|
1218
|
-
std::vector<char> too_few_vals(ncols, 0);
|
1219
|
-
check_more_two_values(&arr_num[0], nrows, ncols, nthreads, too_few_vals.data());
|
1220
1404
|
Rcpp::LogicalVector outp(ncols);
|
1405
|
+
std::vector<char> too_few_vals(ncols, 0);
|
1406
|
+
check_more_two_values(REAL(arr_num), nrows, ncols, nthreads, too_few_vals.data());
|
1221
1407
|
for (size_t col = 0; col < ncols; col++) {
|
1222
1408
|
outp[col] = (bool) too_few_vals[col];
|
1223
1409
|
}
|
1224
1410
|
return outp;
|
1225
1411
|
}
|
1412
|
+
|
1413
|
+
|
1414
|
+
// [[Rcpp::export(rng = false)]]
|
1415
|
+
bool R_has_openmp()
|
1416
|
+
{
|
1417
|
+
#ifdef _OPENMP
|
1418
|
+
return true;
|
1419
|
+
#else
|
1420
|
+
return false;
|
1421
|
+
#endif
|
1422
|
+
}
|
1423
|
+
|
1424
|
+
#endif /* _FOR_R */
|
@@ -74,7 +74,7 @@
|
|
74
74
|
*/
|
75
75
|
void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
76
76
|
long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
|
77
|
-
double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
77
|
+
double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
78
78
|
double *next_most_comm)
|
79
79
|
{
|
80
80
|
//TODO: must also establish bounds for new, unseen categories
|
@@ -90,7 +90,7 @@ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, d
|
|
90
90
|
size_t size_tail = 0;
|
91
91
|
|
92
92
|
/* reset the temporary arrays and fill them */
|
93
|
-
memset(is_outlier, 0, ncateg * sizeof(char));
|
93
|
+
memset(is_outlier, 0, ncateg * sizeof(signed char));
|
94
94
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
95
95
|
buffer_ix[cat] = cat;
|
96
96
|
buffer_perc[cat] = (categ_counts[cat] > 0)? ((long double)categ_counts[cat] / tot_dbl) : 0;
|
@@ -225,13 +225,13 @@ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, d
|
|
225
225
|
* Category to which the majority of the observations belong.
|
226
226
|
*/
|
227
227
|
void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
228
|
-
long double prior_prob[], double z_outlier, char is_outlier[],
|
228
|
+
long double prior_prob[], double z_outlier, signed char is_outlier[],
|
229
229
|
bool *found_outliers, bool *new_is_outlier, int *categ_maj)
|
230
230
|
{
|
231
231
|
/* initialize parameters as needed */
|
232
232
|
*found_outliers = false;
|
233
233
|
*new_is_outlier = false;
|
234
|
-
memset(is_outlier, 0, ncateg * sizeof(char));
|
234
|
+
memset(is_outlier, 0, ncateg * sizeof(signed char));
|
235
235
|
size_t max_outliers = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
|
236
236
|
long double tot_dbl = (long double) (tot + 1);
|
237
237
|
size_t n_non_maj;
|
@@ -283,7 +283,7 @@ void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t
|
|
283
283
|
* Proportion of the least common non-outlier category.
|
284
284
|
*/
|
285
285
|
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
|
286
|
-
char is_outlier[], double *next_most_comm)
|
286
|
+
signed char is_outlier[], double *next_most_comm)
|
287
287
|
{
|
288
288
|
/* if sample is too small, don't flag any as outliers */
|
289
289
|
if (tot < 1000) return false;
|
@@ -296,7 +296,7 @@ bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_
|
|
296
296
|
|
297
297
|
/* look if there's any category meeting the first condition and none meeting the second one */
|
298
298
|
bool has_outlier_cat = false;
|
299
|
-
memset(is_outlier, 0, sizeof(char) * ncateg);
|
299
|
+
memset(is_outlier, 0, sizeof(signed char) * ncateg);
|
300
300
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
301
301
|
if (categ_counts[cat] > max_outliers && categ_counts[cat] < max_next_most_comm) {
|
302
302
|
has_outlier_cat = false;
|