isotree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,762 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4.5: Programs for Machine Learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+
46
+ #include <Rcpp.h>
47
+ // [[Rcpp::plugins(cpp11)]]
48
+
49
+ /* This is to serialize the model objects */
50
+ // [[Rcpp::depends(Rcereal)]]
51
+ #include <cereal/archives/binary.hpp>
52
+ #include <cereal/types/vector.hpp>
53
+ #include <sstream>
54
+ #include <string>
55
+
56
+ /* This is the package's header */
57
+ #include "isotree.hpp"
58
+
59
+ /* for model serialization and re-usage in R */
60
+ /* https://stackoverflow.com/questions/18474292/how-to-handle-c-internal-data-structure-in-r-in-order-to-allow-save-load */
61
+ /* this extra comment below the link is a workaround for Rcpp issue 675 in GitHub, do not remove it */
62
+ #include <Rinternals.h>
63
+ template <class T>
64
+ Rcpp::RawVector serialize_cpp_obj(T *model_outputs)
65
+ {
66
+ std::stringstream ss;
67
+ {
68
+ cereal::BinaryOutputArchive oarchive(ss); // Create an output archive
69
+ oarchive(*model_outputs);
70
+ }
71
+ ss.seekg(0, ss.end);
72
+ Rcpp::RawVector retval(ss.tellg());
73
+ ss.seekg(0, ss.beg);
74
+ ss.read(reinterpret_cast<char*>(&retval[0]), retval.size());
75
+ return retval;
76
+ }
77
+
78
+ // [[Rcpp::export]]
79
+ SEXP deserialize_IsoForest(Rcpp::RawVector src)
80
+ {
81
+ std::stringstream ss;
82
+ ss.write(reinterpret_cast<char*>(&src[0]), src.size());
83
+ ss.seekg(0, ss.beg);
84
+ std::unique_ptr<IsoForest> model_outputs = std::unique_ptr<IsoForest>(new IsoForest());
85
+ {
86
+ cereal::BinaryInputArchive iarchive(ss);
87
+ iarchive(*model_outputs);
88
+ }
89
+ return Rcpp::XPtr<IsoForest>(model_outputs.release(), true);
90
+ }
91
+
92
+ // [[Rcpp::export]]
93
+ SEXP deserialize_ExtIsoForest(Rcpp::RawVector src)
94
+ {
95
+ std::stringstream ss;
96
+ ss.write(reinterpret_cast<char*>(&src[0]), src.size());
97
+ ss.seekg(0, ss.beg);
98
+ std::unique_ptr<ExtIsoForest> model_outputs = std::unique_ptr<ExtIsoForest>(new ExtIsoForest());
99
+ {
100
+ cereal::BinaryInputArchive iarchive(ss);
101
+ iarchive(*model_outputs);
102
+ }
103
+ return Rcpp::XPtr<ExtIsoForest>(model_outputs.release(), true);
104
+ }
105
+
106
+ // [[Rcpp::export]]
107
+ SEXP deserialize_Imputer(Rcpp::RawVector src)
108
+ {
109
+ std::stringstream ss;
110
+ ss.write(reinterpret_cast<char*>(&src[0]), src.size());
111
+ ss.seekg(0, ss.beg);
112
+ std::unique_ptr<Imputer> imputer = std::unique_ptr<Imputer>(new Imputer());
113
+ {
114
+ cereal::BinaryInputArchive iarchive(ss);
115
+ iarchive(*imputer);
116
+ }
117
+ return Rcpp::XPtr<Imputer>(imputer.release(), true);
118
+ }
119
+
120
+ // [[Rcpp::export]]
121
+ Rcpp::LogicalVector check_null_ptr_model(SEXP ptr_model)
122
+ {
123
+ return Rcpp::LogicalVector(R_ExternalPtrAddr(ptr_model) == NULL);
124
+ }
125
+
126
+ double* set_R_nan_as_C_nan(double *x, size_t n, std::vector<double> &v, int nthreads)
127
+ {
128
+ v.assign(x, x + n);
129
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(x, v, n)
130
+ for (size_t_for i = 0; i < n; i++)
131
+ if (isnan(v[i]))
132
+ v[i] = NAN;
133
+ return v.data();
134
+ }
135
+
136
+ double* set_R_nan_as_C_nan(double *x, size_t n, int nthreads)
137
+ {
138
+ #pragma omp parallel for schedule(static) num_threads(nthreads) shared(x, n)
139
+ for (size_t_for i = 0; i < n; i++)
140
+ if (isnan(x[i]))
141
+ x[i] = NAN;
142
+ return &x[0];
143
+ }
144
+
145
+ // [[Rcpp::export]]
146
+ Rcpp::List fit_model(Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
147
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
148
+ Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
149
+ size_t nrows, size_t ncols_numeric, size_t ncols_categ, size_t ndim, size_t ntry,
150
+ Rcpp::CharacterVector coef_type, bool coef_by_prop, bool with_replacement, bool weight_as_sample,
151
+ size_t sample_size, size_t ntrees, size_t max_depth, bool limit_depth,
152
+ bool penalize_range, bool calc_dist, bool standardize_dist, bool sq_dist,
153
+ bool calc_depth, bool standardize_depth, bool weigh_by_kurt,
154
+ double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
155
+ double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain,
156
+ Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
157
+ Rcpp::CharacterVector missing_action, bool all_perm,
158
+ bool build_imputer, bool output_imputations, size_t min_imp_obs,
159
+ Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
160
+ int random_seed, int nthreads)
161
+ {
162
+ double* numeric_data_ptr = NULL;
163
+ int* categ_data_ptr = NULL;
164
+ int* ncat_ptr = NULL;
165
+ double* Xc_ptr = NULL;
166
+ sparse_ix* Xc_ind_ptr = NULL;
167
+ sparse_ix* Xc_indptr_ptr = NULL;
168
+ double* sample_weights_ptr = NULL;
169
+ double* col_weights_ptr = NULL;
170
+ std::vector<double> Xcpp;
171
+
172
+ if (X_num.size())
173
+ {
174
+ numeric_data_ptr = &X_num[0];
175
+ if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
176
+ numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, nrows * ncols_numeric, Xcpp, nthreads);
177
+ }
178
+
179
+ if (X_cat.size())
180
+ {
181
+ categ_data_ptr = &X_cat[0];
182
+ ncat_ptr = &ncat[0];
183
+ }
184
+
185
+ if (Xc.size())
186
+ {
187
+ Xc_ptr = &Xc[0];
188
+ Xc_ind_ptr = &Xc_ind[0];
189
+ Xc_indptr_ptr = &Xc_indptr[0];
190
+ if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
191
+ Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
192
+ }
193
+
194
+ if (sample_weights.size())
195
+ {
196
+ sample_weights_ptr = &sample_weights[0];
197
+ }
198
+
199
+ if (col_weights.size())
200
+ {
201
+ col_weights_ptr = &col_weights[0];
202
+ }
203
+
204
+ CoefType coef_type_C = Normal;
205
+ CategSplit cat_split_type_C = SubSet;
206
+ NewCategAction new_cat_action_C = Weighted;
207
+ MissingAction missing_action_C = Divide;
208
+ UseDepthImp depth_imp_C = Higher;
209
+ WeighImpRows weigh_imp_rows_C = Inverse;
210
+
211
+ if (Rcpp::as<std::string>(coef_type) == std::string("uniform"))
212
+ {
213
+ coef_type_C = Uniform;
214
+ }
215
+ if (Rcpp::as<std::string>(cat_split_type) == std::string("single_categ"))
216
+ {
217
+ cat_split_type_C = SingleCateg;
218
+ }
219
+ if (Rcpp::as<std::string>(new_cat_action) == std::string("smallest"))
220
+ {
221
+ new_cat_action_C = Smallest;
222
+ }
223
+ else if (Rcpp::as<std::string>(new_cat_action) == std::string("random"))
224
+ {
225
+ new_cat_action_C = Random;
226
+ }
227
+ if (Rcpp::as<std::string>(missing_action) == std::string("impute"))
228
+ {
229
+ missing_action_C = Impute;
230
+ }
231
+ else if (Rcpp::as<std::string>(missing_action) == std::string("fail"))
232
+ {
233
+ missing_action_C = Fail;
234
+ }
235
+ if (Rcpp::as<std::string>(depth_imp) == std::string("lower"))
236
+ {
237
+ depth_imp_C = Lower;
238
+ }
239
+ else if (Rcpp::as<std::string>(depth_imp) == std::string("same"))
240
+ {
241
+ depth_imp_C = Same;
242
+ }
243
+ if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("prop"))
244
+ {
245
+ weigh_imp_rows_C = Prop;
246
+ }
247
+ else if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("flat"))
248
+ {
249
+ weigh_imp_rows_C = Flat;
250
+ }
251
+
252
+ Rcpp::NumericVector tmat = Rcpp::NumericVector();
253
+ Rcpp::NumericMatrix dmat = Rcpp::NumericMatrix();
254
+ Rcpp::NumericVector depths = Rcpp::NumericVector();
255
+ double* tmat_ptr = NULL;
256
+ double* dmat_ptr = NULL;
257
+ double* depths_ptr = NULL;
258
+
259
+ if (calc_dist)
260
+ {
261
+ tmat = Rcpp::NumericVector((nrows * (nrows - 1)) / 2);
262
+ tmat_ptr = &tmat[0];
263
+ if (sq_dist)
264
+ {
265
+ dmat = Rcpp::NumericMatrix(nrows);
266
+ dmat_ptr = &dmat(0, 0);
267
+ }
268
+ }
269
+
270
+ if (calc_depth)
271
+ {
272
+ depths = Rcpp::NumericVector(nrows);
273
+ depths_ptr = &depths[0];
274
+ }
275
+
276
+ std::unique_ptr<IsoForest> model_ptr = std::unique_ptr<IsoForest>();
277
+ std::unique_ptr<ExtIsoForest> ext_model_ptr = std::unique_ptr<ExtIsoForest>();
278
+ std::unique_ptr<Imputer> imputer_ptr = std::unique_ptr<Imputer>();
279
+
280
+ if (ndim == 1)
281
+ model_ptr = std::unique_ptr<IsoForest>(new IsoForest());
282
+ else
283
+ ext_model_ptr = std::unique_ptr<ExtIsoForest>(new ExtIsoForest());
284
+
285
+ if (build_imputer)
286
+ imputer_ptr = std::unique_ptr<Imputer>(new Imputer());
287
+
288
+ int ret_val =
289
+ fit_iforest(model_ptr.get(), ext_model_ptr.get(),
290
+ numeric_data_ptr, ncols_numeric,
291
+ categ_data_ptr, ncols_categ, ncat_ptr,
292
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
293
+ ndim, ntry, coef_type_C, coef_by_prop,
294
+ sample_weights_ptr, with_replacement, weight_as_sample,
295
+ nrows, sample_size, ntrees, max_depth,
296
+ limit_depth, penalize_range,
297
+ standardize_dist, tmat_ptr,
298
+ depths_ptr, standardize_depth,
299
+ col_weights_ptr, weigh_by_kurt,
300
+ prob_pick_by_gain_avg, prob_split_by_gain_avg,
301
+ prob_pick_by_gain_pl, prob_split_by_gain_pl,
302
+ min_gain, missing_action_C,
303
+ cat_split_type_C, new_cat_action_C,
304
+ all_perm, imputer_ptr.get(), min_imp_obs,
305
+ depth_imp_C, weigh_imp_rows_C, output_imputations,
306
+ (uint64_t) random_seed, nthreads);
307
+
308
+ if (ret_val == EXIT_FAILURE)
309
+ {
310
+ return Rcpp::List::create(Rcpp::_["err"] = Rcpp::LogicalVector::create(1));
311
+ }
312
+
313
+ if (calc_dist && sq_dist)
314
+ tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
315
+
316
+ Rcpp::RawVector serialized_obj;
317
+ if (ndim == 1)
318
+ serialized_obj = serialize_cpp_obj(model_ptr.get());
319
+ else
320
+ serialized_obj = serialize_cpp_obj(ext_model_ptr.get());
321
+
322
+ Rcpp::List outp = Rcpp::List::create(
323
+ Rcpp::_["serialized_obj"] = serialized_obj,
324
+ Rcpp::_["depths"] = depths,
325
+ Rcpp::_["tmat"] = tmat,
326
+ Rcpp::_["dmat"] = dmat
327
+ );
328
+
329
+ if (ndim == 1)
330
+ outp["model_ptr"] = Rcpp::XPtr<IsoForest>(model_ptr.release(), true);
331
+ else
332
+ outp["model_ptr"] = Rcpp::XPtr<ExtIsoForest>(ext_model_ptr.release(), true);
333
+
334
+ if (build_imputer)
335
+ {
336
+ outp["imputer_ser"] = serialize_cpp_obj(imputer_ptr.get());
337
+ outp["imputer_ptr"] = Rcpp::XPtr<Imputer>(imputer_ptr.release(), true);
338
+ }
339
+
340
+ if (output_imputations)
341
+ {
342
+ outp["imputed_num"] = Rcpp::NumericVector(Xcpp.begin(), Xcpp.end());
343
+ outp["imputed_cat"] = X_cat;
344
+ }
345
+
346
+ outp["err"] = Rcpp::LogicalVector::create(0);
347
+
348
+ return outp;
349
+ }
350
+
351
+ // [[Rcpp::export]]
352
+ Rcpp::RawVector fit_tree(SEXP model_R_ptr,
353
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat, Rcpp::IntegerVector ncat,
354
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
355
+ Rcpp::NumericVector sample_weights, Rcpp::NumericVector col_weights,
356
+ size_t nrows, size_t ncols_numeric, size_t ncols_categ,
357
+ size_t ndim, size_t ntry, Rcpp::CharacterVector coef_type, bool coef_by_prop,
358
+ size_t max_depth, bool limit_depth, bool penalize_range,
359
+ bool weigh_by_kurt,
360
+ double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
361
+ double prob_pick_by_gain_pl, double prob_split_by_gain_pl, double min_gain,
362
+ Rcpp::CharacterVector cat_split_type, Rcpp::CharacterVector new_cat_action,
363
+ Rcpp::CharacterVector missing_action, bool build_imputer, size_t min_imp_obs, SEXP imp_R_ptr,
364
+ Rcpp::CharacterVector depth_imp, Rcpp::CharacterVector weigh_imp_rows,
365
+ bool all_perm, uint64_t random_seed)
366
+ {
367
+ double* numeric_data_ptr = NULL;
368
+ int* categ_data_ptr = NULL;
369
+ int* ncat_ptr = NULL;
370
+ double* Xc_ptr = NULL;
371
+ sparse_ix* Xc_ind_ptr = NULL;
372
+ sparse_ix* Xc_indptr_ptr = NULL;
373
+ double* sample_weights_ptr = NULL;
374
+ double* col_weights_ptr = NULL;
375
+ std::vector<double> Xcpp;
376
+
377
+ if (X_num.size())
378
+ {
379
+ numeric_data_ptr = &X_num[0];
380
+ if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
381
+ numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, nrows * ncols_numeric, Xcpp, 1);
382
+ }
383
+
384
+ if (X_cat.size())
385
+ {
386
+ categ_data_ptr = &X_cat[0];
387
+ ncat_ptr = &ncat[0];
388
+ }
389
+
390
+ if (Xc.size())
391
+ {
392
+ Xc_ptr = &Xc[0];
393
+ Xc_ind_ptr = &Xc_ind[0];
394
+ Xc_indptr_ptr = &Xc_indptr[0];
395
+ if (Rcpp::as<std::string>(missing_action) != std::string("fail"))
396
+ Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, 1);
397
+ }
398
+
399
+ if (sample_weights.size())
400
+ {
401
+ sample_weights_ptr = &sample_weights[0];
402
+ }
403
+
404
+ if (col_weights.size())
405
+ {
406
+ col_weights_ptr = &col_weights[0];
407
+ }
408
+
409
+ CoefType coef_type_C = Normal;
410
+ CategSplit cat_split_type_C = SubSet;
411
+ NewCategAction new_cat_action_C = Weighted;
412
+ MissingAction missing_action_C = Divide;
413
+ UseDepthImp depth_imp_C = Higher;
414
+ WeighImpRows weigh_imp_rows_C = Inverse;
415
+
416
+ if (Rcpp::as<std::string>(coef_type) == std::string("uniform"))
417
+ {
418
+ coef_type_C = Uniform;
419
+ }
420
+ if (Rcpp::as<std::string>(cat_split_type) == std::string("single_categ"))
421
+ {
422
+ cat_split_type_C = SingleCateg;
423
+ }
424
+ if (Rcpp::as<std::string>(new_cat_action) == std::string("smallest"))
425
+ {
426
+ new_cat_action_C = Smallest;
427
+ }
428
+ else if (Rcpp::as<std::string>(new_cat_action) == std::string("random"))
429
+ {
430
+ new_cat_action_C = Random;
431
+ }
432
+ if (Rcpp::as<std::string>(missing_action) == std::string("impute"))
433
+ {
434
+ missing_action_C = Impute;
435
+ }
436
+ else if (Rcpp::as<std::string>(missing_action) == std::string("fail"))
437
+ {
438
+ missing_action_C = Fail;
439
+ }
440
+ if (Rcpp::as<std::string>(depth_imp) == std::string("lower"))
441
+ {
442
+ depth_imp_C = Lower;
443
+ }
444
+ else if (Rcpp::as<std::string>(depth_imp) == std::string("same"))
445
+ {
446
+ depth_imp_C = Same;
447
+ }
448
+ if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("prop"))
449
+ {
450
+ weigh_imp_rows_C = Prop;
451
+ }
452
+ else if (Rcpp::as<std::string>(weigh_imp_rows) == std::string("flat"))
453
+ {
454
+ weigh_imp_rows_C = Flat;
455
+ }
456
+
457
+ IsoForest* model_ptr = NULL;
458
+ ExtIsoForest* ext_model_ptr = NULL;
459
+ Imputer* imputer_ptr = NULL;
460
+ if (ndim == 1)
461
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
462
+ else
463
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
464
+
465
+ std::vector<ImputeNode> *imp_ptr = NULL;
466
+ if (build_imputer)
467
+ {
468
+ imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imp_R_ptr));
469
+ imputer_ptr->imputer_tree.emplace_back();
470
+ imp_ptr = &imputer_ptr->imputer_tree.back();
471
+ }
472
+
473
+ add_tree(model_ptr, ext_model_ptr,
474
+ numeric_data_ptr, ncols_numeric,
475
+ categ_data_ptr, ncols_categ, ncat_ptr,
476
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
477
+ ndim, ntry, coef_type_C, coef_by_prop,
478
+ sample_weights_ptr,
479
+ nrows, max_depth,
480
+ limit_depth, penalize_range,
481
+ col_weights_ptr, weigh_by_kurt,
482
+ prob_pick_by_gain_avg, prob_split_by_gain_avg,
483
+ prob_pick_by_gain_pl, prob_split_by_gain_pl,
484
+ min_gain, missing_action_C,
485
+ cat_split_type_C, new_cat_action_C,
486
+ depth_imp_C, weigh_imp_rows_C, all_perm,
487
+ imp_ptr, min_imp_obs, (uint64_t)random_seed);
488
+
489
+ if (ndim == 1)
490
+ return serialize_cpp_obj(model_ptr);
491
+ else
492
+ return serialize_cpp_obj(ext_model_ptr);
493
+ }
494
+
495
+ // [[Rcpp::export]]
496
+ void predict_iso(SEXP model_R_ptr, Rcpp::NumericVector outp, Rcpp::IntegerVector tree_num, bool is_extended,
497
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
498
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
499
+ Rcpp::NumericVector Xr, Rcpp::IntegerVector Xr_ind, Rcpp::IntegerVector Xr_indptr,
500
+ size_t nrows, int nthreads, bool standardize)
501
+ {
502
+ double* numeric_data_ptr = NULL;
503
+ int* categ_data_ptr = NULL;
504
+ double* Xc_ptr = NULL;
505
+ sparse_ix* Xc_ind_ptr = NULL;
506
+ sparse_ix* Xc_indptr_ptr = NULL;
507
+ double* Xr_ptr = NULL;
508
+ sparse_ix* Xr_ind_ptr = NULL;
509
+ sparse_ix* Xr_indptr_ptr = NULL;
510
+ sparse_ix* tree_num_ptr = NULL;
511
+ std::vector<double> Xcpp;
512
+
513
+ if (X_num.size())
514
+ {
515
+ numeric_data_ptr = &X_num[0];
516
+ }
517
+
518
+ if (X_cat.size())
519
+ {
520
+ categ_data_ptr = &X_cat[0];
521
+ }
522
+
523
+ if (Xc.size())
524
+ {
525
+ Xc_ptr = &Xc[0];
526
+ Xc_ind_ptr = &Xc_ind[0];
527
+ Xc_indptr_ptr = &Xc_indptr[0];
528
+ }
529
+
530
+ if (Xr.size())
531
+ {
532
+ Xr_ptr = &Xr[0];
533
+ Xr_ind_ptr = &Xr_ind[0];
534
+ Xr_indptr_ptr = &Xr_indptr[0];
535
+ }
536
+
537
+ if (tree_num.size())
538
+ {
539
+ tree_num_ptr = &tree_num[0];
540
+ }
541
+
542
+ double* depths_ptr = &outp[0];
543
+
544
+ IsoForest* model_ptr = NULL;
545
+ ExtIsoForest* ext_model_ptr = NULL;
546
+ if (is_extended)
547
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
548
+ else
549
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
550
+
551
+ MissingAction missing_action = is_extended?
552
+ ext_model_ptr->missing_action
553
+ :
554
+ model_ptr->missing_action;
555
+ if (missing_action != Fail)
556
+ {
557
+ if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), Xcpp, nthreads);
558
+ if (Xc.size()) Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
559
+ if (Xr.size()) Xr_ptr = set_R_nan_as_C_nan(Xr_ptr, Xr.size(), Xcpp, nthreads);
560
+ }
561
+
562
+ predict_iforest(numeric_data_ptr, categ_data_ptr,
563
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
564
+ Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
565
+ nrows, nthreads, standardize,
566
+ model_ptr, ext_model_ptr,
567
+ depths_ptr, tree_num_ptr);
568
+ }
569
+
570
+ // [[Rcpp::export]]
571
+ void dist_iso(SEXP model_R_ptr, Rcpp::NumericVector tmat, Rcpp::NumericVector dmat,
572
+ Rcpp::NumericVector rmat, bool is_extended,
573
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
574
+ Rcpp::NumericVector Xc, Rcpp::IntegerVector Xc_ind, Rcpp::IntegerVector Xc_indptr,
575
+ size_t nrows, int nthreads, bool assume_full_distr,
576
+ bool standardize_dist, bool sq_dist, size_t n_from)
577
+ {
578
+ double* numeric_data_ptr = NULL;
579
+ int* categ_data_ptr = NULL;
580
+ double* Xc_ptr = NULL;
581
+ sparse_ix* Xc_ind_ptr = NULL;
582
+ sparse_ix* Xc_indptr_ptr = NULL;
583
+ std::vector<double> Xcpp;
584
+
585
+ if (X_num.size())
586
+ {
587
+ numeric_data_ptr = &X_num[0];
588
+ }
589
+
590
+ if (X_cat.size())
591
+ {
592
+ categ_data_ptr = &X_cat[0];
593
+ }
594
+
595
+ if (Xc.size())
596
+ {
597
+ Xc_ptr = &Xc[0];
598
+ Xc_ind_ptr = &Xc_ind[0];
599
+ Xc_indptr_ptr = &Xc_indptr[0];
600
+ }
601
+
602
+ double* tmat_ptr = n_from? (double*)NULL : &tmat[0];
603
+ double* dmat_ptr = (sq_dist & !n_from)? &dmat[0] : NULL;
604
+ double* rmat_ptr = n_from? &rmat[0] : NULL;
605
+
606
+ IsoForest* model_ptr = NULL;
607
+ ExtIsoForest* ext_model_ptr = NULL;
608
+ if (is_extended)
609
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
610
+ else
611
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
612
+
613
+
614
+ MissingAction missing_action = is_extended?
615
+ ext_model_ptr->missing_action
616
+ :
617
+ model_ptr->missing_action;
618
+ if (missing_action != Fail)
619
+ {
620
+ if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), Xcpp, nthreads);
621
+ if (Xc.size()) Xc_ptr = set_R_nan_as_C_nan(Xc_ptr, Xc.size(), Xcpp, nthreads);
622
+ }
623
+
624
+
625
+ calc_similarity(numeric_data_ptr, categ_data_ptr,
626
+ Xc_ptr, Xc_ind_ptr, Xc_indptr_ptr,
627
+ nrows, nthreads, assume_full_distr, standardize_dist,
628
+ model_ptr, ext_model_ptr,
629
+ tmat_ptr, rmat_ptr, n_from);
630
+
631
+ if (sq_dist & !n_from)
632
+ tmat_to_dense(tmat_ptr, dmat_ptr, nrows, !standardize_dist);
633
+ }
634
+
635
+ // [[Rcpp::export]]
636
+ Rcpp::List impute_iso(SEXP model_R_ptr, SEXP imputer_R_ptr, bool is_extended,
637
+ Rcpp::NumericVector X_num, Rcpp::IntegerVector X_cat,
638
+ Rcpp::NumericVector Xr, Rcpp::IntegerVector Xr_ind, Rcpp::IntegerVector Xr_indptr,
639
+ size_t nrows, int nthreads)
640
+ {
641
+ double* numeric_data_ptr = NULL;
642
+ int* categ_data_ptr = NULL;
643
+ double* Xr_ptr = NULL;
644
+ sparse_ix* Xr_ind_ptr = NULL;
645
+ sparse_ix* Xr_indptr_ptr = NULL;
646
+
647
+ if (X_num.size())
648
+ {
649
+ numeric_data_ptr = &X_num[0];
650
+ }
651
+
652
+ if (X_cat.size())
653
+ {
654
+ categ_data_ptr = &X_cat[0];
655
+ }
656
+
657
+ if (Xr.size())
658
+ {
659
+ Xr_ptr = &Xr[0];
660
+ Xr_ind_ptr = &Xr_ind[0];
661
+ Xr_indptr_ptr = &Xr_indptr[0];
662
+ }
663
+
664
+ if (X_num.size()) numeric_data_ptr = set_R_nan_as_C_nan(numeric_data_ptr, X_num.size(), nthreads);
665
+ if (Xr.size()) Xr_ptr = set_R_nan_as_C_nan(Xr_ptr, Xr.size(), nthreads);
666
+
667
+ IsoForest* model_ptr = NULL;
668
+ ExtIsoForest* ext_model_ptr = NULL;
669
+ if (is_extended)
670
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
671
+ else
672
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
673
+
674
+ Imputer* imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imputer_R_ptr));
675
+
676
+
677
+ impute_missing_values(numeric_data_ptr, categ_data_ptr,
678
+ Xr_ptr, Xr_ind_ptr, Xr_indptr_ptr,
679
+ nrows, nthreads,
680
+ model_ptr, ext_model_ptr,
681
+ *imputer_ptr);
682
+
683
+ return Rcpp::List::create(
684
+ Rcpp::_["X_num"] = Xr.size()? Xr : X_num,
685
+ Rcpp::_["X_cat"] = X_cat
686
+ );
687
+ }
688
+
689
+ // [[Rcpp::export]]
690
+ Rcpp::List get_n_nodes(SEXP model_R_ptr, bool is_extended, int nthreads)
691
+ {
692
+ size_t ntrees;
693
+ IsoForest* model_ptr = NULL;
694
+ ExtIsoForest* ext_model_ptr = NULL;
695
+ if (is_extended)
696
+ {
697
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
698
+ ntrees = ext_model_ptr->hplanes.size();
699
+ }
700
+ else
701
+ {
702
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
703
+ ntrees = model_ptr->trees.size();
704
+ }
705
+
706
+ Rcpp::IntegerVector n_nodes(ntrees);
707
+ Rcpp::IntegerVector n_terminal(ntrees);
708
+ if (is_extended)
709
+ get_num_nodes(*ext_model_ptr, &n_nodes[0], &n_terminal[0], nthreads);
710
+ else
711
+ get_num_nodes(*model_ptr, &n_nodes[0], &n_terminal[0], nthreads);
712
+
713
+ return Rcpp::List::create(
714
+ Rcpp::_["total"] = n_nodes,
715
+ Rcpp::_["terminal"] = n_terminal
716
+ );
717
+ }
718
+
719
+ // [[Rcpp::export]]
720
+ Rcpp::List append_trees_from_other(SEXP model_R_ptr, SEXP other_R_ptr,
721
+ SEXP imp_R_ptr, SEXP oimp_R_ptr,
722
+ bool is_extended)
723
+ {
724
+ Rcpp::List out;
725
+ IsoForest* model_ptr = NULL;
726
+ IsoForest* other_ptr = NULL;
727
+ ExtIsoForest* ext_model_ptr = NULL;
728
+ ExtIsoForest* ext_other_ptr = NULL;
729
+ Imputer* imputer_ptr = NULL;
730
+ Imputer* oimputer_ptr = NULL;
731
+
732
+ if (is_extended) {
733
+ ext_model_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(model_R_ptr));
734
+ ext_other_ptr = static_cast<ExtIsoForest*>(R_ExternalPtrAddr(other_R_ptr));
735
+ } else {
736
+ model_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(model_R_ptr));
737
+ other_ptr = static_cast<IsoForest*>(R_ExternalPtrAddr(other_R_ptr));
738
+ }
739
+
740
+ if (!Rf_isNull(imp_R_ptr) && !Rf_isNull(oimp_R_ptr) &&
741
+ R_ExternalPtrAddr(imp_R_ptr) != NULL &&
742
+ R_ExternalPtrAddr(oimp_R_ptr) != NULL)
743
+ {
744
+ imputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(imp_R_ptr));
745
+ oimputer_ptr = static_cast<Imputer*>(R_ExternalPtrAddr(oimp_R_ptr));
746
+ }
747
+
748
+ merge_models(model_ptr, other_ptr,
749
+ ext_model_ptr, ext_other_ptr,
750
+ imputer_ptr, oimputer_ptr);
751
+
752
+
753
+ if (is_extended)
754
+ out["serialized"] = serialize_cpp_obj(ext_model_ptr);
755
+ else
756
+ out["serialized"] = serialize_cpp_obj(model_ptr);
757
+
758
+ if (imputer_ptr != NULL && oimputer_ptr != NULL)
759
+ out["imp_ser"] = serialize_cpp_obj(imputer_ptr);
760
+
761
+ return out;
762
+ }