isotree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,762 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+
46
+ #include <Rcpp.h>
47
+ // [[Rcpp::plugins(cpp11)]]
48
+
49
+ /* This is to serialize the model objects */
50
+ // [[Rcpp::depends(Rcereal)]]
51
+ #include <cereal/archives/binary.hpp>
52
+ #include <cereal/types/vector.hpp>
53
+ #include <sstream>
54
+ #include <string>
55
+
56
+ /* This is the package's header */
57
+ #include "isotree.hpp"
58
+
59
+ /* for model serialization and re-usage in R */
60
+ /* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
61
+ /* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
62
+ #include <Rinternals.h>
63
+ template <class T>
64
+ Rcpp::RawVector serialize_cpp_obj(T *model_outputs)
65
+ {
66
+ std::stringstream ss;
67
+ {
68
+ cereal::BinaryOutputArchive oarchive(ss); // Create an output archive
69
+ oarchive(*model_outputs);
70
+ }
71
+ ss.seekg(0, ss.end);
72
+ Rcpp::RawVector retval(ss.tellg());
73
+ ss.seekg(0, ss.beg);
74
+ ss.read(reinterpret_cast<char*>(&retval[0]), retval.size());
75
+ return retval;
76
+ }
77
+
78
+ // [[Rcpp::export]]
79
+ SEXP deserialize_IsoForest(Rcpp::RawVector src)
80
+ {
81
+ std::stringstream ss;
82
+ ss.write(reinterpret_cast<char*>(&src[0]), src.size());
83
+ ss.seekg(0, ss.beg);
84
+ std::unique_ptr<IsoForest> model_outputs = std::unique_ptr<IsoForest>(new IsoForest());
85
+ {
86
+ cereal::BinaryInputArchive iarchive(ss);
87
+ iarchive(*model_outputs);
88
+ }
89
+ return Rcpp::XPtr<IsoForest>(model_outputs.release(), true);
90
+ }
91
+
92
+ // [[Rcpp::export]]
93
+ SEXP deserialize_ExtIsoForest(Rcpp::RawVector src)
94
+ {
95
+ std::stringstream ss;
96
+ ss.write(reinterpret_cast<char*>(&src[0]), src.size());
97
+ ss.seekg(0, ss.beg);
98
+ std::unique_ptr<ExtIsoForest> model_outputs = std::unique_ptr<ExtIsoForest>(new ExtIsoForest());
99
+ {
100
+ cereal::BinaryInputArchive iarchive(ss);
101
+ iarchive(*model_outputs);
102
+ }
103
+ return Rcpp::XPtr<ExtIsoForest>(model_outputs.release(), true);
104
+ }
105
+
106
+ // [[Rcpp::export]]
107
+ SEXP deserialize_Imputer(Rcpp::RawVector src)
108
+ {
109
+ std::stringstream ss;
110
+ ss.write(reinterpret_cast<char*>(&src[0]), src.size());
111
+ ss.seekg(0, ss.beg);
112
+ std::unique_ptr<Imputer> imputer = std::unique_ptr<Imputer>(new Imputer());
113
+ {
114
+ cereal::BinaryInputArchive iarchive(ss);
115
+ iarchive(*imputer);
116
+ }
117
+ return Rcpp::XPtr<Imputer>(imputer.release(), true);
118
+ }
119
+
120
+ // [[Rcpp::export]]
121
+ Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
122
+ {
123
+ return Rcpp::LogicalVector(R_ExternalPtrAddr(ptr_model) == NULL);
124
+ }
125
+
126
+ double* set_R_nan_as_C_nan(double *x, size_t n, std::vector<double> &v, int nthreads)
127
+ {
128
+ v.assign(x, x + n);
129
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(x, v, n)
130
+ for (size_t_for i = 0; i < n; i++)
131
+ if (isnan(v[i]))
132
+ v[i] = NAN;
133
+ return v.data();
134
+ }
135
+
136
+ double* set_R_nan_as_C_nan(double *x, size_t n, int nthreads)
137
+ {
138
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(x, n)
139
+ for (size_t_for i = 0; i < n; i++)
140
+ if (isnan(x[i]))
141
+ x[i] = NAN;
142
+ return &x[0];
143
+ }
144
+
145
+ // [[Rcpp::export]]
146
+ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
147
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
148
+ Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
149
+ size_t nrows, size_t ncols_numeric, size_t ncols_categ, size_t ndim, size_t ntry,
150
+ Rcpp::CharacterVector coef_type, bool coef_by_prop, bool with_replacement, bool weight_as_sample,
151
+ size_t sample_size, size_t ntrees, size_t max_depth, bool limit_depth,
152
+ bool penalize_range, bool calc_dist, bool standardize_dist, bool sq_dist,
153
+ bool calc_depth, bool standardize_depth, bool weigh_by_kurt,
154
+ double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
155
+ double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain,
156
+ Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
157
+ Rcpp::CharacterVector missing_action, bool all_perm,
158
+ bool build_imputer, bool output_imputations, size_t min_imp_obs,
159
+ Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
160
+ int random_seed, int nthreads)
161
+ {
162
+ double* numeric_data_ptr = NULL;
163
+ int* categ_data_ptr = NULL;
164
+ int* ncat_ptr = NULL;
165
+ double* Xc_ptr = NULL;
166
+ sparse_ix* Xc_ind_ptr = NULL;
167
+ sparse_ix* Xc_indptr_ptr = NULL;
168
+ double* sample_weights_ptr = NULL;
169
+ double* col_weights_ptr = NULL;
170
+ std::vector<double> Xcpp;
171
+
172
+ if (X_num.size())
173
+ {
174
+ numeric_data_ptr = &X_num[0];
175
+ if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
176
+ numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, nrows * ncols_numeric, Xcpp, nthreads);
177
+ }
178
+
179
+ if (X_cat.size())
180
+ {
181
+ categ_data_ptr = &X_cat[0];
182
+ ncat_ptr = &ncat[0];
183
+ }
184
+
185
+ if (Xc.size())
186
+ {
187
+ Xc_ptr = &Xc[0];
188
+ Xc_ind_ptr = &Xc_ind[0];
189
+ Xc_indptr_ptr = &Xc_indptr[0];
190
+ if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
191
+ Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
192
+ }
193
+
194
+ if (sample_weights.size())
195
+ {
196
+ sample_weights_ptr = &sample_weights[0];
197
+ }
198
+
199
+ if (col_weights.size())
200
+ {
201
+ col_weights_ptr = &col_weights[0];
202
+ }
203
+
204
+ CoefType coef_type_C = Normal;
205
+ CategSplit cat_split_type_C = SubSet;
206
+ NewCategAction new_cat_action_C = Weighted;
207
+ MissingAction missing_action_C = Divide;
208
+ UseDepthImp depth_imp_C = Higher;
209
+ WeighImpRows weigh_imp_rows_C = Inverse;
210
+
211
+ if (Rcpp::as<std::string>(coef_type) == std::string("uniform"))
212
+ {
213
+ coef_type_C = Uniform;
214
+ }
215
+ if (Rcpp::as<std::string>(cat_split_type) == std::string("single_categ"))
216
+ {
217
+ cat_split_type_C = SingleCateg;
218
+ }
219
+ if (Rcpp::as<std::string>(new_cat_action) == std::string("smallest"))
220
+ {
221
+ new_cat_action_C = Smallest;
222
+ }
223
+ else if (Rcpp::as<std::string>(new_cat_action) == std::string("random"))
224
+ {
225
+ new_cat_action_C = Random;
226
+ }
227
+ if (Rcpp::as<std::string>(missing_action) == std::string("impute"))
228
+ {
229
+ missing_action_C = Impute;
230
+ }
231
+ else if (Rcpp::as<std::string>(missing_action) == std::string("fail"))
232
+ {
233
+ missing_action_C = Fail;
234
+ }
235
+ if (Rcpp::as<std::string>(depth_imp) == std::string("lower"))
236
+ {
237
+ depth_imp_C = Lower;
238
+ }
239
+ else if (Rcpp::as<std::string>(depth_imp) == std::string("same"))
240
+ {
241
+ depth_imp_C = Same;
242
+ }
243
+ if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("prop"))
244
+ {
245
+ weigh_imp_rows_C = Prop;
246
+ }
247
+ else if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("flat"))
248
+ {
249
+ weigh_imp_rows_C = Flat;
250
+ }
251
+
252
+ Rcpp::NumericVector tmat = Rcpp::NumericVector();
253
+ Rcpp::NumericMatrix dmat = Rcpp::NumericMatrix();
254
+ Rcpp::NumericVector depths = Rcpp::NumericVector();
255
+ double* tmat_ptr = NULL;
256
+ double* dmat_ptr = NULL;
257
+ double* depths_ptr = NULL;
258
+
259
+ if (calc_dist)
260
+ {
261
+ tmat = Rcpp::NumericVector((nrows * (nrows - 1)) / 2);
262
+ tmat_ptr = &tmat[0];
263
+ if (sq_dist)
264
+ {
265
+ dmat = Rcpp::NumericMatrix(nrows);
266
+ dmat_ptr = &dmat(0, 0);
267
+ }
268
+ }
269
+
270
+ if (calc_depth)
271
+ {
272
+ depths = Rcpp::NumericVector(nrows);
273
+ depths_ptr = &depths[0];
274
+ }
275
+
276
+ std::unique_ptr<IsoForest> model_ptr = std::unique_ptr<IsoForest>();
277
+ std::unique_ptr<ExtIsoForest> ext_model_ptr = std::unique_ptr<ExtIsoForest>();
278
+ std::unique_ptr<Imputer> imputer_ptr = std::unique_ptr<Imputer>();
279
+
280
+ if (ndim == 1)
281
+ model_ptr = std::unique_ptr<IsoForest>(new IsoForest());
282
+ else
283
+ ext_model_ptr = std::unique_ptr<ExtIsoForest>(new ExtIsoForest());
284
+
285
+ if (build_imputer)
286
+ imputer_ptr = std::unique_ptr<Imputer>(new Imputer());
287
+
288
+ int ret_val =
289
+ fit_iforest(model_ptr.get(), ext_model_ptr.get(),
290
+ numeric_data_ptr, ncols_numeric,
291
+ categ_data_ptr, ncols_categ, ncat_ptr,
292
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
293
+ ndim, ntry, coef_type_C, coef_by_prop,
294
+ sample_weights_ptr, with_replacement, weight_as_sample,
295
+ nrows, sample_size, ntrees, max_depth,
296
+ limit_depth, penalize_range,
297
+ standardize_dist, tmat_ptr,
298
+ depths_ptr, standardize_depth,
299
+ col_weights_ptr, weigh_by_kurt,
300
+ prob_pick_by_gain_avg, prob_split_by_gain_avg,
301
+ prob_pick_by_gain_pl, prob_split_by_gain_pl,
302
+ min_gain, missing_action_C,
303
+ cat_split_type_C, new_cat_action_C,
304
+ all_perm, imputer_ptr.get(), min_imp_obs,
305
+ depth_imp_C, weigh_imp_rows_C, output_imputations,
306
+ (uint64_t) random_seed, nthreads);
307
+
308
+ if (ret_val == EXIT_FAILURE)
309
+ {
310
+ return Rcpp::List::create(Rcpp::_["err"] = Rcpp::LogicalVector::create(1));
311
+ }
312
+
313
+ if (calc_dist && sq_dist)
314
+ tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
315
+
316
+ Rcpp::RawVector serialized_obj;
317
+ if (ndim == 1)
318
+ serialized_obj = serialize_cpp_obj(model_ptr.get());
319
+ else
320
+ serialized_obj = serialize_cpp_obj(ext_model_ptr.get());
321
+
322
+ Rcpp::List outp = Rcpp::List::create(
323
+ Rcpp::_["serialized_obj"] = serialized_obj,
324
+ Rcpp::_["depths"] = depths,
325
+ Rcpp::_["tmat"] = tmat,
326
+ Rcpp::_["dmat"] = dmat
327
+ );
328
+
329
+ if (ndim == 1)
330
+ outp["model_ptr"] = Rcpp::XPtr<IsoForest>(model_ptr.release(), true);
331
+ else
332
+ outp["model_ptr"] = Rcpp::XPtr<ExtIsoForest>(ext_model_ptr.release(), true);
333
+
334
+ if (build_imputer)
335
+ {
336
+ outp["imputer_ser"] = serialize_cpp_obj(imputer_ptr.get());
337
+ outp["imputer_ptr"] = Rcpp::XPtr<Imputer>(imputer_ptr.release(), true);
338
+ }
339
+
340
+ if (output_imputations)
341
+ {
342
+ outp["imputed_num"] = Rcpp::NumericVector(Xcpp.begin(), Xcpp.end());
343
+ outp["imputed_cat"] = X_cat;
344
+ }
345
+
346
+ outp["err"] = Rcpp::LogicalVector::create(0);
347
+
348
+ return outp;
349
+ }
350
+
351
+ // [[Rcpp::export]]
352
+ Rcpp::RawVector fit_tree(SEXP model_R_ptr,
353
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
354
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
355
+ Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
356
+ size_t nrows, size_t ncols_numeric, size_t ncols_categ,
357
+ size_t ndim, size_t ntry, Rcpp::CharacterVector coef_type, bool coef_by_prop,
358
+ size_t max_depth, bool limit_depth, bool penalize_range,
359
+ bool weigh_by_kurt,
360
+ double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
361
+ double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain,
362
+ Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
363
+ Rcpp::CharacterVector missing_action, bool build_imputer, size_t min_imp_obs, SEXP imp_R_ptr,
364
+ Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
365
+ bool all_perm, uint64_t random_seed)
366
+ {
367
+ double* numeric_data_ptr = NULL;
368
+ int* categ_data_ptr = NULL;
369
+ int* ncat_ptr = NULL;
370
+ double* Xc_ptr = NULL;
371
+ sparse_ix* Xc_ind_ptr = NULL;
372
+ sparse_ix* Xc_indptr_ptr = NULL;
373
+ double* sample_weights_ptr = NULL;
374
+ double* col_weights_ptr = NULL;
375
+ std::vector<double> Xcpp;
376
+
377
+ if (X_num.size())
378
+ {
379
+ numeric_data_ptr = &X_num[0];
380
+ if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
381
+ numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, nrows * ncols_numeric, Xcpp, 1);
382
+ }
383
+
384
+ if (X_cat.size())
385
+ {
386
+ categ_data_ptr = &X_cat[0];
387
+ ncat_ptr = &ncat[0];
388
+ }
389
+
390
+ if (Xc.size())
391
+ {
392
+ Xc_ptr = &Xc[0];
393
+ Xc_ind_ptr = &Xc_ind[0];
394
+ Xc_indptr_ptr = &Xc_indptr[0];
395
+ if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
396
+ Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, 1);
397
+ }
398
+
399
+ if (sample_weights.size())
400
+ {
401
+ sample_weights_ptr = &sample_weights[0];
402
+ }
403
+
404
+ if (col_weights.size())
405
+ {
406
+ col_weights_ptr = &col_weights[0];
407
+ }
408
+
409
+ CoefType coef_type_C = Normal;
410
+ CategSplit cat_split_type_C = SubSet;
411
+ NewCategAction new_cat_action_C = Weighted;
412
+ MissingAction missing_action_C = Divide;
413
+ UseDepthImp depth_imp_C = Higher;
414
+ WeighImpRows weigh_imp_rows_C = Inverse;
415
+
416
+ if (Rcpp::as<std::string>(coef_type) == std::string("uniform"))
417
+ {
418
+ coef_type_C = Uniform;
419
+ }
420
+ if (Rcpp::as<std::string>(cat_split_type) == std::string("single_categ"))
421
+ {
422
+ cat_split_type_C = SingleCateg;
423
+ }
424
+ if (Rcpp::as<std::string>(new_cat_action) == std::string("smallest"))
425
+ {
426
+ new_cat_action_C = Smallest;
427
+ }
428
+ else if (Rcpp::as<std::string>(new_cat_action) == std::string("random"))
429
+ {
430
+ new_cat_action_C = Random;
431
+ }
432
+ if (Rcpp::as<std::string>(missing_action) == std::string("impute"))
433
+ {
434
+ missing_action_C = Impute;
435
+ }
436
+ else if (Rcpp::as<std::string>(missing_action) == std::string("fail"))
437
+ {
438
+ missing_action_C = Fail;
439
+ }
440
+ if (Rcpp::as<std::string>(depth_imp) == std::string("lower"))
441
+ {
442
+ depth_imp_C = Lower;
443
+ }
444
+ else if (Rcpp::as<std::string>(depth_imp) == std::string("same"))
445
+ {
446
+ depth_imp_C = Same;
447
+ }
448
+ if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("prop"))
449
+ {
450
+ weigh_imp_rows_C = Prop;
451
+ }
452
+ else if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("flat"))
453
+ {
454
+ weigh_imp_rows_C = Flat;
455
+ }
456
+
457
+ IsoForest* model_ptr = NULL;
458
+ ExtIsoForest* ext_model_ptr = NULL;
459
+ Imputer* imputer_ptr = NULL;
460
+ if (ndim == 1)
461
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
462
+ else
463
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
464
+
465
+ std::vector<ImputeNode> *imp_ptr = NULL;
466
+ if (build_imputer)
467
+ {
468
+ imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imp_R_ptr));
469
+ imputer_ptr->imputer_tree.emplace_back();
470
+ imp_ptr = &imputer_ptr->imputer_tree.back();
471
+ }
472
+
473
+ add_tree(model_ptr, ext_model_ptr,
474
+ numeric_data_ptr, ncols_numeric,
475
+ categ_data_ptr, ncols_categ, ncat_ptr,
476
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
477
+ ndim, ntry, coef_type_C, coef_by_prop,
478
+ sample_weights_ptr,
479
+ nrows, max_depth,
480
+ limit_depth, penalize_range,
481
+ col_weights_ptr, weigh_by_kurt,
482
+ prob_pick_by_gain_avg, prob_split_by_gain_avg,
483
+ prob_pick_by_gain_pl, prob_split_by_gain_pl,
484
+ min_gain, missing_action_C,
485
+ cat_split_type_C, new_cat_action_C,
486
+ depth_imp_C, weigh_imp_rows_C, all_perm,
487
+ imp_ptr, min_imp_obs, (uint64_t)random_seed);
488
+
489
+ if (ndim == 1)
490
+ return serialize_cpp_obj(model_ptr);
491
+ else
492
+ return serialize_cpp_obj(ext_model_ptr);
493
+ }
494
+
495
+ // [[Rcpp::export]]
496
+ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector tree_num, bool is_extended,
497
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
498
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
499
+ Rcpp::NumericVector Xr, Rcpp::IntegerVector Xr_ind, Rcpp::IntegerVector Xr_indptr,
500
+ size_t nrows, int nthreads, bool standardize)
501
+ {
502
+ double* numeric_data_ptr = NULL;
503
+ int* categ_data_ptr = NULL;
504
+ double* Xc_ptr = NULL;
505
+ sparse_ix* Xc_ind_ptr = NULL;
506
+ sparse_ix* Xc_indptr_ptr = NULL;
507
+ double* Xr_ptr = NULL;
508
+ sparse_ix* Xr_ind_ptr = NULL;
509
+ sparse_ix* Xr_indptr_ptr = NULL;
510
+ sparse_ix* tree_num_ptr = NULL;
511
+ std::vector<double> Xcpp;
512
+
513
+ if (X_num.size())
514
+ {
515
+ numeric_data_ptr = &X_num[0];
516
+ }
517
+
518
+ if (X_cat.size())
519
+ {
520
+ categ_data_ptr = &X_cat[0];
521
+ }
522
+
523
+ if (Xc.size())
524
+ {
525
+ Xc_ptr = &Xc[0];
526
+ Xc_ind_ptr = &Xc_ind[0];
527
+ Xc_indptr_ptr = &Xc_indptr[0];
528
+ }
529
+
530
+ if (Xr.size())
531
+ {
532
+ Xr_ptr = &Xr[0];
533
+ Xr_ind_ptr = &Xr_ind[0];
534
+ Xr_indptr_ptr = &Xr_indptr[0];
535
+ }
536
+
537
+ if (tree_num.size())
538
+ {
539
+ tree_num_ptr = &tree_num[0];
540
+ }
541
+
542
+ double* depths_ptr = &outp[0];
543
+
544
+ IsoForest* model_ptr = NULL;
545
+ ExtIsoForest* ext_model_ptr = NULL;
546
+ if (is_extended)
547
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
548
+ else
549
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
550
+
551
+ MissingAction missing_action = is_extended?
552
+ ext_model_ptr->missing_action
553
+ :
554
+ model_ptr->missing_action;
555
+ if (missing_action != Fail)
556
+ {
557
+ if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), Xcpp, nthreads);
558
+ if (Xc.size()) Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
559
+ if (Xr.size()) Xr_ptr = set_R_nan_as_C_nan(Xr_ptr, Xr.size(), Xcpp, nthreads);
560
+ }
561
+
562
+ predict_iforest(numeric_data_ptr, categ_data_ptr,
563
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
564
+ Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
565
+ nrows, nthreads, standardize,
566
+ model_ptr, ext_model_ptr,
567
+ depths_ptr, tree_num_ptr);
568
+ }
569
+
570
+ // [[Rcpp::export]]
571
+ void dist_iso(SEXP model_R_ptr, Rcpp::NumericVector tmat, Rcpp::NumericVector dmat,
572
+ Rcpp::NumericVector rmat, bool is_extended,
573
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
574
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
575
+ size_t nrows, int nthreads, bool assume_full_distr,
576
+ bool standardize_dist, bool sq_dist, size_t n_from)
577
+ {
578
+ double* numeric_data_ptr = NULL;
579
+ int* categ_data_ptr = NULL;
580
+ double* Xc_ptr = NULL;
581
+ sparse_ix* Xc_ind_ptr = NULL;
582
+ sparse_ix* Xc_indptr_ptr = NULL;
583
+ std::vector<double> Xcpp;
584
+
585
+ if (X_num.size())
586
+ {
587
+ numeric_data_ptr = &X_num[0];
588
+ }
589
+
590
+ if (X_cat.size())
591
+ {
592
+ categ_data_ptr = &X_cat[0];
593
+ }
594
+
595
+ if (Xc.size())
596
+ {
597
+ Xc_ptr = &Xc[0];
598
+ Xc_ind_ptr = &Xc_ind[0];
599
+ Xc_indptr_ptr = &Xc_indptr[0];
600
+ }
601
+
602
+ double* tmat_ptr = n_from? (double*)NULL : &tmat[0];
603
+ double* dmat_ptr = (sq_dist & !n_from)? &dmat[0] : NULL;
604
+ double* rmat_ptr = n_from? &rmat[0] : NULL;
605
+
606
+ IsoForest* model_ptr = NULL;
607
+ ExtIsoForest* ext_model_ptr = NULL;
608
+ if (is_extended)
609
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
610
+ else
611
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
612
+
613
+
614
+ MissingAction missing_action = is_extended?
615
+ ext_model_ptr->missing_action
616
+ :
617
+ model_ptr->missing_action;
618
+ if (missing_action != Fail)
619
+ {
620
+ if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), Xcpp, nthreads);
621
+ if (Xc.size()) Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
622
+ }
623
+
624
+
625
+ calc_similarity(numeric_data_ptr, categ_data_ptr,
626
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
627
+ nrows, nthreads, assume_full_distr, standardize_dist,
628
+ model_ptr, ext_model_ptr,
629
+ tmat_ptr, rmat_ptr, n_from);
630
+
631
+ if (sq_dist & !n_from)
632
+ tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
633
+ }
634
+
635
+ // [[Rcpp::export]]
636
+ Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
637
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
638
+ Rcpp::NumericVector Xr, Rcpp::IntegerVector Xr_ind, Rcpp::IntegerVector Xr_indptr,
639
+ size_t nrows, int nthreads)
640
+ {
641
+ double* numeric_data_ptr = NULL;
642
+ int* categ_data_ptr = NULL;
643
+ double* Xr_ptr = NULL;
644
+ sparse_ix* Xr_ind_ptr = NULL;
645
+ sparse_ix* Xr_indptr_ptr = NULL;
646
+
647
+ if (X_num.size())
648
+ {
649
+ numeric_data_ptr = &X_num[0];
650
+ }
651
+
652
+ if (X_cat.size())
653
+ {
654
+ categ_data_ptr = &X_cat[0];
655
+ }
656
+
657
+ if (Xr.size())
658
+ {
659
+ Xr_ptr = &Xr[0];
660
+ Xr_ind_ptr = &Xr_ind[0];
661
+ Xr_indptr_ptr = &Xr_indptr[0];
662
+ }
663
+
664
+ if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), nthreads);
665
+ if (Xr.size()) Xr_ptr = set_R_nan_as_C_nan(Xr_ptr, Xr.size(), nthreads);
666
+
667
+ IsoForest* model_ptr = NULL;
668
+ ExtIsoForest* ext_model_ptr = NULL;
669
+ if (is_extended)
670
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
671
+ else
672
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
673
+
674
+ Imputer* imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imputer_R_ptr));
675
+
676
+
677
+ impute_missing_values(numeric_data_ptr, categ_data_ptr,
678
+ Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
679
+ nrows, nthreads,
680
+ model_ptr, ext_model_ptr,
681
+ *imputer_ptr);
682
+
683
+ return Rcpp::List::create(
684
+ Rcpp::_["X_num"] = Xr.size()? Xr : X_num,
685
+ Rcpp::_["X_cat"] = X_cat
686
+ );
687
+ }
688
+
689
+ // [[Rcpp::export]]
690
+ Rcpp::List get_n_nodes(SEXP model_R_ptr, bool is_extended, int nthreads)
691
+ {
692
+ size_t ntrees;
693
+ IsoForest* model_ptr = NULL;
694
+ ExtIsoForest* ext_model_ptr = NULL;
695
+ if (is_extended)
696
+ {
697
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
698
+ ntrees = ext_model_ptr->hplanes.size();
699
+ }
700
+ else
701
+ {
702
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
703
+ ntrees = model_ptr->trees.size();
704
+ }
705
+
706
+ Rcpp::IntegerVector n_nodes(ntrees);
707
+ Rcpp::IntegerVector n_terminal(ntrees);
708
+ if (is_extended)
709
+ get_num_nodes(*ext_model_ptr, &n_nodes[0], &n_terminal[0], nthreads);
710
+ else
711
+ get_num_nodes(*model_ptr, &n_nodes[0], &n_terminal[0], nthreads);
712
+
713
+ return Rcpp::List::create(
714
+ Rcpp::_["total"] = n_nodes,
715
+ Rcpp::_["terminal"] = n_terminal
716
+ );
717
+ }
718
+
719
+ // [[Rcpp::export]]
720
+ Rcpp::List append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
721
+ SEXP imp_R_ptr, SEXP oimp_R_ptr,
722
+ bool is_extended)
723
+ {
724
+ Rcpp::List out;
725
+ IsoForest* model_ptr = NULL;
726
+ IsoForest* other_ptr = NULL;
727
+ ExtIsoForest* ext_model_ptr = NULL;
728
+ ExtIsoForest* ext_other_ptr = NULL;
729
+ Imputer* imputer_ptr = NULL;
730
+ Imputer* oimputer_ptr = NULL;
731
+
732
+ if (is_extended) {
733
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
734
+ ext_other_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(other_R_ptr));
735
+ } else {
736
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
737
+ other_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(other_R_ptr));
738
+ }
739
+
740
+ if (!Rf_isNull(imp_R_ptr) && !Rf_isNull(oimp_R_ptr) &&
741
+ R_ExternalPtrAddr(imp_R_ptr) != NULL &&
742
+ R_ExternalPtrAddr(oimp_R_ptr) != NULL)
743
+ {
744
+ imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imp_R_ptr));
745
+ oimputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(oimp_R_ptr));
746
+ }
747
+
748
+ merge_models(model_ptr, other_ptr,
749
+ ext_model_ptr, ext_other_ptr,
750
+ imputer_ptr, oimputer_ptr);
751
+
752
+
753
+ if (is_extended)
754
+ out["serialized"] = serialize_cpp_obj(ext_model_ptr);
755
+ else
756
+ out["serialized"] = serialize_cpp_obj(model_ptr);
757
+
758
+ if (imputer_ptr != NULL && oimputer_ptr != NULL)
759
+ out["imp_ser"] = serialize_cpp_obj(imputer_ptr);
760
+
761
+ return out;
762
+ }