isotree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,929 @@
1
+ /* Isolation forests and variations thereof, with adjustments for incorporation
2
+ * of categorical variables and missing values.
3
+ * Written for the C++11 standard and aimed at being used in R and Python.
4
+ *
5
+ * This library is based on the following works:
6
+ * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
+ * "Isolation forest."
8
+ * 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
9
+ * [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
10
+ * "Isolation-based anomaly detection."
11
+ * ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
12
+ * [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
13
+ * "Extended Isolation Forest."
14
+ * arXiv preprint arXiv:1811.02141 (2018).
15
+ * [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
16
+ * "On detecting clustered anomalies using SCiForest."
17
+ * Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
18
+ * [5] https://sourceforge.net/projects/iforest/
19
+ * [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
20
+ * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
+ * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
+ * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
+ *
24
+ * BSD 2-Clause License
25
+ * Copyright (c) 2019, David Cortes
26
+ * All rights reserved.
27
+ * Redistribution and use in source and binary forms, with or without
28
+ * modification, are permitted provided that the following conditions are met:
29
+ * * Redistributions of source code must retain the above copyright notice, this
30
+ * list of conditions and the following disclaimer.
31
+ * * Redistributions in binary form must reproduce the above copyright notice,
32
+ * this list of conditions and the following disclaimer in the documentation
33
+ * and/or other materials provided with the distribution.
34
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
35
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
36
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
37
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
38
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
39
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
40
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
41
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
42
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
43
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
44
+ */
45
+
46
+ /* Standard headers */
47
+ #include <stddef.h>
48
+ #include <math.h>
49
+ #include <limits.h>
50
+ #include <string.h>
51
+ #include <signal.h>
52
+ #include <vector>
53
+ #include <iterator>
54
+ #include <numeric>
55
+ #include <algorithm>
56
+ #include <random>
57
+ #include <unordered_set>
58
+ #include <unordered_map>
59
+ #include <memory>
60
+ #include <utility>
61
+ #include <cstdint>
62
+ #include <iostream>
63
+ #ifndef _FOR_R
64
+ #include <stdio.h>
65
+ #else
66
+ extern "C" {
67
+ #include <R_ext/Print.h>
68
+ }
69
+ #define printf Rprintf
70
+ #define fprintf(f, message) REprintf(message)
71
+ #endif
72
+ #ifdef _OPENMP
73
+ #include <omp.h>
74
+ #endif
75
+ #ifdef _ENABLE_CEREAL
76
+ #include <cereal/archives/binary.hpp>
77
+ #include <cereal/types/vector.hpp>
78
+ #include <sstream>
79
+ #include <string>
80
+ #include <fstream>
81
+ #endif
82
+
83
+ /* Uses Mersenne-Twister for RNG when _USE_MERSENNE_TWISTER is defined; otherwise falls back to std::default_random_engine, which can be something faster */
84
+ #ifdef _USE_MERSENNE_TWISTER
85
+ #if SIZE_MAX >= UINT64_MAX /* 64-bit systems or higher */
86
+ #define RNG_engine std::mt19937_64
87
+ #else /* 32-bit systems and non-standard architectures */
88
+ #define RNG_engine std::mt19937
89
+ #endif
90
+ #else
91
+ #define RNG_engine std::default_random_engine
92
+ #endif
93
+
94
/* Short functions (implemented as macros, so every argument is parenthesized) */

/* Index arithmetic for a binary tree stored in an array.
 * 'ix_child' gives the first of the two children of a node (the second one is at
 * ix_child(ix) + 1); integer division in 'ix_parent' maps both of them back to
 * the same parent, so there is no need to decide between left and right. */
#define ix_parent(ix) ( ((ix) - 1) / 2 )
#define ix_child(ix)  ( 2 * (ix) + 1 )

/* Integer power of two, calculated as a bit shift on a 'size_t'.
 * https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
#define pow2(n) ( static_cast<size_t>(1) << (n) )

/* Note: the argument is evaluated twice */
#define square(x) ( (x) * (x) )

/* Value (0 or 1) of the bit at position 'bit' of 'number'.
 * https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
#define extract_bit(number, bit) ( ((number) >> (bit)) & 1 )

/* Fall back to the 'std::' functions when the C macros are not available */
#ifndef isinf
    #define isinf std::isinf
#endif
#ifndef isnan
    #define isnan std::isnan
#endif

/* True for NaN and for +/- infinity */
#define is_na_or_inf(x) ( isnan(x) || isinf(x) )
109
+
110
+
111
+ /* Aliasing for compiler optimizations */
112
+ #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
113
+ #define restrict __restrict
114
+ #else
115
+ #define restrict
116
+ #endif
117
+
118
+ /* MSVC is stuck with an OpenMP version that's 19 years old at the time of writing and does not support unsigned iterators */
119
+ #ifdef _OPENMP
120
+ #if (_OPENMP < 200801) || defined(_WIN32) || defined(_WIN64) /* OpenMP < 3.0 */
121
+ #define size_t_for long
122
+ #else
123
+ #define size_t_for size_t
124
+ #endif
125
+ #else
126
+ #define size_t_for size_t
127
+ #endif
128
+
129
+
130
+ /* Apple at some point decided to drop the OMP library and headers from its compiler distribution
131
+ * and to alias 'gcc' to 'clang', which work differently when given flags they cannot interpret,
132
+ * causing installation issues with pretty much all scientific software due to OMP headers that
133
+ * would normally do nothing. This piece of code is to allow compilation without OMP header. */
134
+ #ifndef _OPENMP
135
+ #define omp_get_thread_num() 0
136
+ #endif
137
+
138
+
139
+ /* For sparse matrices */
140
+ #ifdef _FOR_R
141
+ #define sparse_ix int
142
+ #else
143
+ #define sparse_ix size_t
144
+ #endif
145
+
146
+
147
/* Enumerations used throughout the package.
 * All of them are plain (unscoped) enums: the enumerator names are visible in the
 * enclosing scope and their values count up from zero in declaration order. */

/* 'Weighted' means 'Impute' in the extended model */
enum NewCategAction {Weighted, Smallest, Random};

/* 'Divide' is only for the non-extended model */
enum MissingAction {Divide, Impute, Fail};

enum ColType {Numeric, Categorical, NotUsed};

enum CategSplit {SubSet, SingleCateg};

/* For guided splits. The type can be referred to under both names. */
enum GainCriterion {Averaged, Pooled, NoCrit};
typedef GainCriterion Criterion;

/* For the extended model */
enum CoefType {Uniform, Normal};

/* For NA imputation */
enum UseDepthImp {Lower, Higher, Same};

/* For NA imputation */
enum WeighImpRows {Inverse, Prop, Flat};
156
+
157
+ /* Notes about new categorical action:
158
+ * - For single-variable case, if using 'Smallest', can then pass data at prediction time
159
+ * having categories that were never in the training data (as an integer higher than 'ncat'
160
+ * for that column), but if using 'Random' or 'Weighted', these must be passed as NA (int < 0)
161
+ * - For extended case, 'Weighted' becomes a weighted imputation instead, and if using either
162
+ * 'Weighted' or 'Smallest', can pass newer, unseen categories at prediction time too.
163
+ * - If using 'Random', cannot pass new categories at prediction time.
164
+ * - If using 'Weighted' for single-variable case, cannot predict similarity with a value
165
+ * for MissingAction other than 'Divide'. */
166
+
167
+
168
+ /* Structs that are output (modified) from the main function */
169
+ typedef struct IsoTree {
170
+ ColType col_type = NotUsed; /* issues with uninitialized values passed to Cereal */
171
+ size_t col_num;
172
+ double num_split;
173
+ std::vector<char> cat_split;
174
+ int chosen_cat;
175
+ size_t tree_left;
176
+ size_t tree_right;
177
+ double pct_tree_left;
178
+ double score; /* will not be integer when there are weights or early stop */
179
+ double range_low = -HUGE_VAL;
180
+ double range_high = HUGE_VAL;
181
+ double remainder; /* only used for distance/similarity */
182
+
183
+ #ifdef _ENABLE_CEREAL
184
+ template<class Archive>
185
+ void serialize(Archive &archive)
186
+ {
187
+ archive(
188
+ this->col_type,
189
+ this->col_num,
190
+ this->num_split,
191
+ this->cat_split,
192
+ this->chosen_cat,
193
+ this->tree_left,
194
+ this->tree_right,
195
+ this->pct_tree_left,
196
+ this->score,
197
+ this->range_low,
198
+ this->range_high,
199
+ this->remainder
200
+ );
201
+ }
202
+ #endif
203
+
204
+ IsoTree() = default;
205
+
206
+ } IsoTree;
207
+
208
+ typedef struct IsoHPlane {
209
+ std::vector<size_t> col_num;
210
+ std::vector<ColType> col_type;
211
+ std::vector<double> coef;
212
+ std::vector<double> mean;
213
+ std::vector<std::vector<double>> cat_coef;
214
+ std::vector<int> chosen_cat;
215
+ std::vector<double> fill_val;
216
+ std::vector<double> fill_new;
217
+
218
+ double split_point;
219
+ size_t hplane_left;
220
+ size_t hplane_right;
221
+ double score; /* will not be integer when there are weights or early stop */
222
+ double range_low = -HUGE_VAL;
223
+ double range_high = HUGE_VAL;
224
+ double remainder; /* only used for distance/similarity */
225
+
226
+ #ifdef _ENABLE_CEREAL
227
+ template<class Archive>
228
+ void serialize(Archive &archive)
229
+ {
230
+ archive(
231
+ this->col_num,
232
+ this->col_type,
233
+ this->coef,
234
+ this->mean,
235
+ this->cat_coef,
236
+ this->chosen_cat,
237
+ this->fill_val,
238
+ this->fill_new,
239
+ this->split_point,
240
+ this->hplane_left,
241
+ this->hplane_right,
242
+ this->score,
243
+ this->range_low,
244
+ this->range_high,
245
+ this->remainder
246
+ );
247
+ }
248
+ #endif
249
+
250
+ IsoHPlane() = default;
251
+ } IsoHPlane;
252
+
253
+ /* Note: don't use long doubles in the outside outputs or there will be issues with MINGW in windows */
254
+
255
+
256
+ typedef struct IsoForest {
257
+ std::vector< std::vector<IsoTree> > trees;
258
+ NewCategAction new_cat_action;
259
+ CategSplit cat_split_type;
260
+ MissingAction missing_action;
261
+ double exp_avg_depth;
262
+ double exp_avg_sep;
263
+ size_t orig_sample_size;
264
+
265
+ #ifdef _ENABLE_CEREAL
266
+ template<class Archive>
267
+ void serialize(Archive &archive)
268
+ {
269
+ archive(
270
+ this->trees,
271
+ this->new_cat_action,
272
+ this->cat_split_type,
273
+ this->missing_action,
274
+ this->exp_avg_depth,
275
+ this->exp_avg_sep,
276
+ this->orig_sample_size
277
+ );
278
+ }
279
+ #endif
280
+
281
+ IsoForest() = default;
282
+ } IsoForest;
283
+
284
+ typedef struct ExtIsoForest {
285
+ std::vector< std::vector<IsoHPlane> > hplanes;
286
+ NewCategAction new_cat_action;
287
+ CategSplit cat_split_type;
288
+ MissingAction missing_action;
289
+ double exp_avg_depth;
290
+ double exp_avg_sep;
291
+ size_t orig_sample_size;
292
+
293
+ #ifdef _ENABLE_CEREAL
294
+ template<class Archive>
295
+ void serialize(Archive &archive)
296
+ {
297
+ archive(
298
+ this->hplanes,
299
+ this->new_cat_action,
300
+ this->cat_split_type,
301
+ this->missing_action,
302
+ this->exp_avg_depth,
303
+ this->exp_avg_sep,
304
+ this->orig_sample_size
305
+ );
306
+ }
307
+ #endif
308
+
309
+ ExtIsoForest() = default;
310
+ } ExtIsoForest;
311
+
312
/* Imputation statistics kept for each tree node (this is for each tree node).
 * 'parent' is given a default so that a default-constructed node never exposes
 * an indeterminate value when it is copied or serialized. */
typedef struct ImputeNode {
    std::vector<double>  num_sum;
    std::vector<double>  num_weight;
    std::vector<std::vector<double>>  cat_sum;
    std::vector<double>  cat_weight;
    size_t               parent = 0;

    #ifdef _ENABLE_CEREAL
    /* The order of the fields below defines the serialized layout - do not reorder */
    template<class Archive>
    void serialize(Archive &archive)
    {
        archive(
            this->num_sum,
            this->num_weight,
            this->cat_sum,
            this->cat_weight,
            this->parent
            );
    }
    #endif

    ImputeNode() = default;

    /* Creates an empty node that records 'parent'.
     * Left implicit on purpose: existing callers may rely on the conversion
     * from 'size_t'. */
    ImputeNode(size_t parent) : parent(parent) {}

} ImputeNode;
340
+
341
+ typedef struct Imputer {
342
+ size_t ncols_numeric;
343
+ size_t ncols_categ;
344
+ std::vector<int> ncat;
345
+ std::vector<std::vector<ImputeNode>> imputer_tree;
346
+ std::vector<double> col_means;
347
+ std::vector<int> col_modes;
348
+
349
+ #ifdef _ENABLE_CEREAL
350
+ template<class Archive>
351
+ void serialize(Archive &archive)
352
+ {
353
+ archive(
354
+ this->ncols_numeric,
355
+ this->ncols_categ,
356
+ this->ncat,
357
+ this->imputer_tree,
358
+ this->col_means,
359
+ this->col_modes
360
+ );
361
+ }
362
+ #endif
363
+
364
+ Imputer() = default;
365
+
366
+ } Imputer;
367
+
368
+
369
+ /* Structs that are only used internally */
370
+ typedef struct {
371
+ double* numeric_data;
372
+ size_t ncols_numeric;
373
+ int* categ_data;
374
+ int* ncat;
375
+ int max_categ;
376
+ size_t ncols_categ;
377
+ size_t nrows;
378
+ size_t ncols_tot;
379
+ double* sample_weights;
380
+ bool weight_as_sample;
381
+ double* col_weights;
382
+ double* Xc; /* only for sparse matrices */
383
+ sparse_ix* Xc_ind; /* only for sparse matrices */
384
+ sparse_ix* Xc_indptr; /* only for sparse matrices */
385
+ size_t log2_n; /* only when using weights for sampling */
386
+ size_t btree_offset; /* only when using weights for sampling */
387
+ std::vector<double> btree_weights_init; /* only when using weights for sampling */
388
+ std::vector<char> has_missing; /* only used when producing missing imputations on-the-fly */
389
+ size_t n_missing; /* only used when producing missing imputations on-the-fly */
390
+ } InputData;
391
+
392
+
393
+ typedef struct {
394
+ double* numeric_data;
395
+ int* categ_data;
396
+ size_t nrows;
397
+ double* Xc; /* only for sparse matrices */
398
+ sparse_ix* Xc_ind; /* only for sparse matrices */
399
+ sparse_ix* Xc_indptr; /* only for sparse matrices */
400
+ double* Xr; /* only for sparse matrices */
401
+ sparse_ix* Xr_ind; /* only for sparse matrices */
402
+ sparse_ix* Xr_indptr; /* only for sparse matrices */
403
+ } PredictionData;
404
+
405
+ typedef struct {
406
+ bool with_replacement;
407
+ size_t sample_size;
408
+ size_t ntrees;
409
+ size_t max_depth;
410
+ bool penalize_range;
411
+ uint64_t random_seed;
412
+ bool weigh_by_kurt;
413
+ double prob_pick_by_gain_avg;
414
+ double prob_split_by_gain_avg;
415
+ double prob_pick_by_gain_pl;
416
+ double prob_split_by_gain_pl;
417
+ double min_gain;
418
+ CategSplit cat_split_type;
419
+ NewCategAction new_cat_action;
420
+ MissingAction missing_action;
421
+ bool all_perm;
422
+
423
+ size_t ndim; /* only for extended model */
424
+ size_t ntry; /* only for extended model */
425
+ CoefType coef_type; /* only for extended model */
426
+ bool coef_by_prop; /* only for extended model */
427
+
428
+ bool calc_dist; /* checkbox for calculating distances on-the-fly */
429
+ bool calc_depth; /* checkbox for calculating depths on-the-fly */
430
+ bool impute_at_fit; /* checkbox for producing imputed missing values on-the-fly */
431
+
432
+ UseDepthImp depth_imp; /* only when building NA imputer */
433
+ WeighImpRows weigh_imp_rows; /* only when building NA imputer */
434
+ size_t min_imp_obs; /* only when building NA imputer */
435
+ } ModelParams;
436
+
437
+ typedef struct ImputedData {
438
+ std::vector<long double> num_sum;
439
+ std::vector<long double> num_weight;
440
+ std::vector<std::vector<long double>> cat_sum;
441
+ std::vector<long double> cat_weight;
442
+ std::vector<long double> sp_num_sum;
443
+ std::vector<long double> sp_num_weight;
444
+
445
+ std::vector<size_t> missing_num;
446
+ std::vector<size_t> missing_cat;
447
+ std::vector<sparse_ix> missing_sp;
448
+ size_t n_missing_num;
449
+ size_t n_missing_cat;
450
+ size_t n_missing_sp;
451
+
452
+ ImputedData() {};
453
+
454
+ ImputedData(InputData &input_data, size_t row);
455
+
456
+ } ImputedData;
457
+
458
+ typedef struct {
459
+ std::vector<size_t> ix_arr;
460
+ std::vector<size_t> ix_all;
461
+ RNG_engine rnd_generator;
462
+ std::uniform_int_distribution<size_t> runif;
463
+ std::uniform_real_distribution<double> rbin;
464
+ size_t st;
465
+ size_t end;
466
+ size_t st_NA;
467
+ size_t end_NA;
468
+ size_t split_ix;
469
+ std::unordered_map<size_t, double> weights_map;
470
+ std::vector<double> weights_arr; /* when not ignoring NAs and when using weights as density */
471
+ double xmin;
472
+ double xmax;
473
+ size_t npresent; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
474
+ bool unsplittable;
475
+ std::vector<bool> is_repeated;
476
+ std::vector<char> categs;
477
+ size_t ncols_tried; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
478
+ int ncat_tried;
479
+ std::vector<bool> cols_possible;
480
+ std::vector<double> btree_weights; /* only when using weights for sampling */
481
+ std::discrete_distribution<size_t> col_sampler; /* columns can get eliminated, keep a copy for each thread */
482
+
483
+ /* for split criterion */
484
+ std::vector<double> buffer_dbl;
485
+ std::vector<size_t> buffer_szt;
486
+ std::vector<char> buffer_chr;
487
+ double prob_split_type;
488
+ GainCriterion criterion;
489
+ double this_gain;
490
+ double this_split_point;
491
+ int this_categ;
492
+ std::vector<char> this_split_categ;
493
+ bool determine_split;
494
+
495
+ /* for the extended model */
496
+ size_t ntry;
497
+ size_t ntaken;
498
+ size_t ntaken_best;
499
+ bool tried_all;
500
+ size_t col_chosen;
501
+ ColType col_type;
502
+ double ext_sd;
503
+ std::vector<size_t> cols_shuffled;
504
+ std::vector<double> comb_val;
505
+ std::vector<size_t> col_take;
506
+ std::vector<ColType> col_take_type;
507
+ std::vector<double> ext_offset;
508
+ std::vector<double> ext_coef;
509
+ std::vector<double> ext_mean;
510
+ std::vector<double> ext_fill_val;
511
+ std::vector<double> ext_fill_new;
512
+ std::vector<int> chosen_cat;
513
+ std::vector<std::vector<double>> ext_cat_coef;
514
+ std::uniform_real_distribution<double> coef_unif;
515
+ std::normal_distribution<double> coef_norm;
516
+
517
+ /* for similarity/distance calculations */
518
+ std::vector<double> tmat_sep;
519
+
520
+ /* when calculating average depth on-the-fly */
521
+ std::vector<double> row_depths;
522
+
523
+ /* when imputing NAs on-the-fly */
524
+ std::vector<ImputedData> impute_vec;
525
+ std::unordered_map<size_t, ImputedData> impute_map;
526
+
527
+ } WorkerMemory;
528
+
529
/* Scratch space used by the similarity/distance routines */
struct WorkerForSimilarity {
    std::vector<size_t>  ix_arr;
    size_t               st;
    size_t               end;
    std::vector<double>  weights_arr;
    std::vector<double>  comb_val;
    std::vector<double>  tmat_sep;
    std::vector<double>  rmat;
    size_t               n_from;
    bool                 assume_full_distr;  /* doesn't need to have one copy per worker */
};
540
+
541
/* Holds copies of the members that WorkerMemory has under the same names
 * (see 'backup_recursion_state' / 'restore_recursion_state') */
struct RecursionState {
    /* positions */
    size_t  st;
    size_t  st_NA;
    size_t  end_NA;
    size_t  split_ix;
    size_t  end;

    /* rows and weights */
    std::vector<size_t>                 ix_arr;
    std::unordered_map<size_t, double>  weights_map;
    std::vector<double>                 weights_arr;

    /* columns */
    std::vector<bool>                   cols_possible;
    std::discrete_distribution<size_t>  col_sampler;
};
553
+
554
+ /* Function prototypes */
555
+
556
+ /* fit_model.cpp */
557
+ extern bool interrupt_switch;
558
+ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
559
+ double numeric_data[], size_t ncols_numeric,
560
+ int categ_data[], size_t ncols_categ, int ncat[],
561
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
562
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
563
+ double sample_weights[], bool with_replacement, bool weight_as_sample,
564
+ size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
565
+ bool limit_depth, bool penalize_range,
566
+ bool standardize_dist, double tmat[],
567
+ double output_depths[], bool standardize_depth,
568
+ double col_weights[], bool weigh_by_kurt,
569
+ double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
570
+ double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
571
+ double min_gain, MissingAction missing_action,
572
+ CategSplit cat_split_type, NewCategAction new_cat_action,
573
+ bool all_perm, Imputer *imputer, size_t min_imp_obs,
574
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
575
+ uint64_t random_seed, int nthreads);
576
+ int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
577
+ double numeric_data[], size_t ncols_numeric,
578
+ int categ_data[], size_t ncols_categ, int ncat[],
579
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
580
+ size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
581
+ double sample_weights[], size_t nrows, size_t max_depth,
582
+ bool limit_depth, bool penalize_range,
583
+ double col_weights[], bool weigh_by_kurt,
584
+ double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
585
+ double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
586
+ double min_gain, MissingAction missing_action,
587
+ CategSplit cat_split_type, NewCategAction new_cat_action,
588
+ UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
589
+ bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
590
+ uint64_t random_seed);
591
+ void fit_itree(std::vector<IsoTree> *tree_root,
592
+ std::vector<IsoHPlane> *hplane_root,
593
+ WorkerMemory &workspace,
594
+ InputData &input_data,
595
+ ModelParams &model_params,
596
+ std::vector<ImputeNode> *impute_nodes,
597
+ size_t tree_num);
598
+
599
+ /* isoforest.cpp */
600
+ void split_itree_recursive(std::vector<IsoTree> &trees,
601
+ WorkerMemory &workspace,
602
+ InputData &input_data,
603
+ ModelParams &model_params,
604
+ std::vector<ImputeNode> *impute_nodes,
605
+ size_t curr_depth);
606
+
607
+ /* extended.cpp */
608
+ void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
609
+ WorkerMemory &workspace,
610
+ InputData &input_data,
611
+ ModelParams &model_params,
612
+ std::vector<ImputeNode> *impute_nodes,
613
+ size_t curr_depth);
614
+ void add_chosen_column(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params,
615
+ std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s);
616
+ void shrink_to_fit_hplane(IsoHPlane &hplane, bool clear_vectors);
617
+ void simplify_hplane(IsoHPlane &hplane, WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
618
+
619
+
620
+ /* predict.cpp */
621
+ void predict_iforest(double numeric_data[], int categ_data[],
622
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
623
+ double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
624
+ size_t nrows, int nthreads, bool standardize,
625
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
626
+ double output_depths[], sparse_ix tree_num[]);
627
+ void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
628
+ IsoForest &model_outputs,
629
+ PredictionData &prediction_data,
630
+ double &output_depth,
631
+ sparse_ix *restrict tree_num,
632
+ size_t row);
633
+ double traverse_itree(std::vector<IsoTree> &tree,
634
+ IsoForest &model_outputs,
635
+ PredictionData &prediction_data,
636
+ std::vector<ImputeNode> *impute_nodes,
637
+ ImputedData *imputed_data,
638
+ double curr_weight,
639
+ size_t row,
640
+ sparse_ix *restrict tree_num,
641
+ size_t curr_lev);
642
+ void traverse_hplane_fast(std::vector<IsoHPlane> &hplane,
643
+ ExtIsoForest &model_outputs,
644
+ PredictionData &prediction_data,
645
+ double &output_depth,
646
+ sparse_ix *restrict tree_num,
647
+ size_t row);
648
+ void traverse_hplane(std::vector<IsoHPlane> &hplane,
649
+ ExtIsoForest &model_outputs,
650
+ PredictionData &prediction_data,
651
+ double &output_depth,
652
+ std::vector<ImputeNode> *impute_nodes,
653
+ ImputedData *imputed_data,
654
+ sparse_ix *restrict tree_num,
655
+ size_t row);
656
+ double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num);
657
+ double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num);
658
+ void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads);
659
+ void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads);
660
+
661
+ /* dist.cpp */
662
+ void calc_similarity(double numeric_data[], int categ_data[],
663
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
664
+ size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
665
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
666
+ double tmat[], double rmat[], size_t n_from);
667
+ void traverse_tree_sim(WorkerForSimilarity &workspace,
668
+ PredictionData &prediction_data,
669
+ IsoForest &model_outputs,
670
+ std::vector<IsoTree> &trees,
671
+ size_t curr_tree);
672
+ void traverse_hplane_sim(WorkerForSimilarity &workspace,
673
+ PredictionData &prediction_data,
674
+ ExtIsoForest &model_outputs,
675
+ std::vector<IsoHPlane> &hplanes,
676
+ size_t curr_tree);
677
+ void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
678
+ std::vector<WorkerMemory> *worker_memory_m,
679
+ PredictionData *prediction_data, InputData *input_data,
680
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
681
+ double *restrict tmat, double *restrict rmat, size_t n_from,
682
+ size_t ntrees, bool assume_full_distr,
683
+ bool standardize_dist, int nthreads);
684
+ void initialize_worker_for_sim(WorkerForSimilarity &workspace,
685
+ PredictionData &prediction_data,
686
+ IsoForest *model_outputs,
687
+ ExtIsoForest *model_outputs_ext,
688
+ size_t n_from,
689
+ bool assume_full_distr);
690
+
691
+ /* impute.cpp */
692
+ void impute_missing_values(double numeric_data[], int categ_data[],
693
+ double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
694
+ size_t nrows, int nthreads,
695
+ IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
696
+ Imputer &imputer);
697
+ void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads);
698
+ void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
699
+ InputData &input_data, ModelParams &model_params,
700
+ std::vector<ImputeNode> &imputer_tree,
701
+ size_t curr_depth, size_t min_imp_obs);
702
+ void shrink_impute_node(ImputeNode &imputer);
703
+ void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
704
+ std::vector<IsoTree> *trees,
705
+ std::vector<IsoHPlane> *hplanes);
706
+ void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto);
707
+ void combine_tree_imputations(WorkerMemory &workspace,
708
+ std::vector<ImputedData> &impute_vec,
709
+ std::unordered_map<size_t, ImputedData> &impute_map,
710
+ std::vector<char> &has_missing,
711
+ int nthreads);
712
+ void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w);
713
+ void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data);
714
+ template <class imp_arr>
715
+ void apply_imputation_results(imp_arr &impute_vec,
716
+ Imputer &imputer,
717
+ InputData &input_data,
718
+ int nthreads);
719
+ void apply_imputation_results(std::vector<ImputedData> &impute_vec,
720
+ std::unordered_map<size_t, ImputedData> &impute_map,
721
+ Imputer &imputer,
722
+ InputData &input_data,
723
+ int nthreads);
724
+ void apply_imputation_results(PredictionData &prediction_data,
725
+ ImputedData &imp,
726
+ Imputer &imputer,
727
+ size_t row);
728
+ void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row);
729
+ void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row);
730
+ void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads);
731
+ void allocate_imp_map(std::unordered_map<size_t, ImputedData> &impute_map, InputData &input_data);
732
+ void allocate_imp(InputData &input_data,
733
+ std::vector<ImputedData> &impute_vec,
734
+ std::unordered_map<size_t, ImputedData> &impute_map,
735
+ int nthreads);
736
+ void check_for_missing(InputData &input_data,
737
+ std::vector<ImputedData> &impute_vec,
738
+ std::unordered_map<size_t, ImputedData> &impute_map,
739
+ int nthreads);
740
+ size_t check_for_missing(PredictionData &prediction_data,
741
+ Imputer &imputer,
742
+ size_t ix_arr[],
743
+ int nthreads);
744
+
745
+ /* helpers_iforest.cpp */
746
+ void decide_column(size_t ncols_numeric, size_t ncols_categ, size_t &col_chosen, ColType &col_type,
747
+ RNG_engine &rnd_generator, std::uniform_int_distribution<size_t> &runif,
748
+ std::discrete_distribution<size_t> &col_sampler);
749
+ void add_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data);
750
+ void add_unsplittable_col(WorkerMemory &workspace, InputData &input_data);
751
+ bool check_is_not_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data);
752
+ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params, IsoTree &tree);
753
+ void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
754
+ int choose_cat_from_present(WorkerMemory &workspace, InputData &input_data, size_t col_num);
755
+ void update_col_sampler(WorkerMemory &workspace, InputData &input_data);
756
+ bool is_col_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
757
+ InputData &input_data, size_t col_num, ColType col_type);
758
+ void set_col_as_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
759
+ InputData &input_data, size_t col_num, ColType col_type);
760
+ void add_separation_step(WorkerMemory &workspace, InputData &input_data, double remainder);
761
+ void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, long double sum_weight);
762
+ void remap_terminal_trees(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
763
+ PredictionData &prediction_data, sparse_ix *restrict tree_num, int nthreads);
764
+ void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state);
765
+ void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state);
766
+
767
+
768
+ /* utils.cpp */
769
+ size_t log2ceil(size_t x);
770
+ double harmonic(size_t n);
771
+ double harmonic_recursive(double a, double b);
772
+ double expected_avg_depth(size_t sample_size);
773
+ double expected_avg_depth(long double approx_sample_size);
774
+ double expected_separation_depth(size_t n);
775
+ double expected_separation_depth_hotstart(double curr, size_t n_curr, size_t n_final);
776
+ double expected_separation_depth(long double n);
777
+ void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n, double counter[], double exp_remainder);
778
+ void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
779
+ double *restrict counter, double *restrict weights, double exp_remainder);
780
+ void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
781
+ double counter[], std::unordered_map<size_t, double> &weights, double exp_remainder);
782
+ void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
783
+ double counter[], double exp_remainder);
784
+ void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
785
+ double *restrict counter, double *restrict weights, double exp_remainder);
786
+ void tmat_to_dense(double *restrict tmat, double *restrict dmat, size_t n, bool diag_to_one);
787
+ double calc_sd_raw(size_t cnt, long double sum, long double sum_sq); /* Takes a count, a sum and a sum of squares; returns double. */
788
+ long double calc_sd_raw_l(size_t cnt, long double sum, long double sum_sq); /* Same parameter list as calc_sd_raw; differs in returning long double. */
789
+ void build_btree_sampler(std::vector<double> &btree_weights, double *restrict sample_weights,
790
+ size_t nrows, size_t &log2_n, size_t &btree_offset);
791
+ void sample_random_rows(std::vector<size_t> &ix_arr, size_t nrows, bool with_replacement,
792
+ RNG_engine &rnd_generator, std::vector<size_t> &ix_all,
793
+ double sample_weights[], std::vector<double> &btree_weights,
794
+ size_t log2_n, size_t btree_offset, std::vector<bool> &is_repeated);
795
+ void weighted_shuffle(size_t *restrict outp, size_t n, double *restrict weights, double *restrict buffer_arr, RNG_engine &rnd_generator);
796
+ size_t divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point);
797
+ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point,
798
+ MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
799
+ void divide_subset_split(size_t ix_arr[], size_t st, size_t end, size_t col_num,
800
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[], double split_point,
801
+ MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
802
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char split_categ[],
803
+ MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
804
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char split_categ[],
805
+ int ncat, MissingAction missing_action, NewCategAction new_cat_action,
806
+ bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
807
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_categ,
808
+ MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
809
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end,
810
+ MissingAction missing_action, NewCategAction new_cat_action,
811
+ bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
812
+ void get_range(size_t ix_arr[], double x[], size_t st, size_t end,
813
+ MissingAction missing_action, double &xmin, double &xmax, bool &unsplittable);
814
+ void get_range(size_t ix_arr[], size_t st, size_t end, size_t col_num,
815
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
816
+ MissingAction missing_action, double &xmin, double &xmax, bool &unsplittable);
817
+ void get_categs(size_t ix_arr[], int x[], size_t st, size_t end, int ncat,
818
+ MissingAction missing_action, char categs[], size_t &npresent, bool &unsplittable);
819
+ long double calculate_sum_weights(std::vector<size_t> &ix_arr, size_t st, size_t end, size_t curr_depth,
820
+ std::vector<double> &weights_arr, std::unordered_map<size_t, double> &weights_map);
821
+ void set_interrup_global_variable(int s); /* NOTE(review): "interrup" looks like a misspelling of "interrupt"; it is part of the declared name, so correcting it means renaming the definition and every caller together. */
822
+ int return_EXIT_SUCCESS();
823
+ int return_EXIT_FAILURE();
824
+
825
+
826
+
827
+ size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, double x[]);
828
+ size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, size_t col_num, double Xc[], size_t Xc_ind[], size_t Xc_indptr[]); /* NOTE(review): Xc_ind / Xc_indptr are size_t[] here, while the other sparse-input declarations in this header (divide_subset_split, get_range, todense, calc_mean_and_sd, ...) take sparse_ix[] - confirm this matches the definition whenever sparse_ix is not size_t. */
829
+ size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, int x[]);
830
+ size_t center_NAs(size_t *restrict ix_arr, size_t st_left, size_t st, size_t curr_pos);
831
+ void todense(size_t ix_arr[], size_t st, size_t end,
832
+ size_t col_num, double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
833
+ double *restrict buffer_arr);
834
+
835
+ /* mult.cpp */
836
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double *restrict x,
837
+ MissingAction missing_action, double &x_sd, double &x_mean);
838
+ void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, size_t col_num,
839
+ double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
840
+ double &x_sd, double &x_mean);
841
+ void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
842
+ double *restrict x, double &coef, double x_sd, double x_mean, double &fill_val,
843
+ MissingAction missing_action, double *restrict buffer_arr,
844
+ size_t *restrict buffer_NAs, bool first_run);
845
+ void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
846
+ double *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
847
+ double &coef, double x_sd, double x_mean, double &fill_val, MissingAction missing_action,
848
+ double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run);
849
+ void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
850
+ int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
851
+ double &fill_val, double &fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
852
+ NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run);
853
+
854
+ /* crit.cpp */
855
+ double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, double x[], MissingAction missing_action);
856
+ double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, size_t col_num,
857
+ double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
858
+ MissingAction missing_action);
859
+ double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
860
+ MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
861
+ double expected_sd_cat(double p[], size_t n, size_t pos[]);
862
+ double expected_sd_cat(size_t counts[], double p[], size_t n, size_t pos[]);
863
+ double expected_sd_cat_single(size_t counts[], double p[], size_t n, size_t pos[], size_t cat_exclude, size_t cnt);
864
+ double numeric_gain(size_t cnt_left, size_t cnt_right,
865
+ long double sum_left, long double sum_right,
866
+ long double sum_sq_left, long double sum_sq_right,
867
+ double sd_full, long double cnt);
868
+ double numeric_gain_no_div(size_t cnt_left, size_t cnt_right,
869
+ long double sum_left, long double sum_right,
870
+ long double sum_sq_left, long double sum_sq_right,
871
+ double sd_full, long double cnt);
872
+ double categ_gain(size_t cnt_left, size_t cnt_right,
873
+ long double s_left, long double s_right,
874
+ long double base_info, long double cnt);
875
+ double eval_guided_crit(double *restrict x, size_t n, GainCriterion criterion, double min_gain,
876
+ double &split_point, double &xmin, double &xmax);
877
+ double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x,
878
+ size_t &split_ix, double &split_point, double &xmin, double &xmax,
879
+ GainCriterion criterion, double min_gain, MissingAction missing_action);
880
+ double eval_guided_crit(size_t ix_arr[], size_t st, size_t end,
881
+ size_t col_num, double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
882
+ double buffer_arr[], size_t buffer_pos[],
883
+ double &split_point, double &xmin, double &xmax,
884
+ GainCriterion criterion, double min_gain, MissingAction missing_action);
885
+ double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int ncat,
886
+ size_t *restrict buffer_cnt, size_t *restrict buffer_pos, double *restrict buffer_prob,
887
+ int &chosen_cat, char *restrict split_categ, char *restrict buffer_split,
888
+ GainCriterion criterion, double min_gain, bool all_perm, MissingAction missing_action, CategSplit cat_split_type);
889
+
890
+ /* merge_models.cpp */
891
+ void merge_models(IsoForest* model, IsoForest* other,
892
+ ExtIsoForest* ext_model, ExtIsoForest* ext_other,
893
+ Imputer* imputer, Imputer* iother);
894
+
895
+ #ifdef _ENABLE_CEREAL
896
+ /* serialize.cpp */
897
+ void serialize_isoforest(IsoForest &model, std::ostream &output);
898
+ void serialize_isoforest(IsoForest &model, const char *output_file_path);
899
+ std::string serialize_isoforest(IsoForest &model);
900
+ void deserialize_isoforest(IsoForest &output_obj, std::istream &serialized);
901
+ void deserialize_isoforest(IsoForest &output_obj, const char *input_file_path);
902
+ void deserialize_isoforest(IsoForest &output_obj, std::string &serialized, bool move_str);
903
+ void serialize_ext_isoforest(ExtIsoForest &model, std::ostream &output);
904
+ void serialize_ext_isoforest(ExtIsoForest &model, const char *output_file_path);
905
+ std::string serialize_ext_isoforest(ExtIsoForest &model);
906
+ void deserialize_ext_isoforest(ExtIsoForest &output_obj, std::istream &serialized);
907
+ void deserialize_ext_isoforest(ExtIsoForest &output_obj, const char *input_file_path);
908
+ void deserialize_ext_isoforest(ExtIsoForest &output_obj, std::string &serialized, bool move_str);
909
+ void serialize_imputer(Imputer &imputer, std::ostream &output);
910
+ void serialize_imputer(Imputer &imputer, const char *output_file_path);
911
+ std::string serialize_imputer(Imputer &imputer);
912
+ void deserialize_imputer(Imputer &output_obj, std::istream &serialized);
913
+ void deserialize_imputer(Imputer &output_obj, const char *input_file_path);
914
+ void deserialize_imputer(Imputer &output_obj, std::string &serialized, bool move_str);
915
+ #ifdef _MSC_VER /* The overloads below take the file path as const wchar_t* and are declared only when _MSC_VER is defined; they mirror the const char* path overloads declared earlier in this _ENABLE_CEREAL section. */
916
+ void serialize_isoforest(IsoForest &model, const wchar_t *output_file_path);
917
+ void deserialize_isoforest(IsoForest &output_obj, const wchar_t *input_file_path);
918
+ void serialize_ext_isoforest(ExtIsoForest &model, const wchar_t *output_file_path);
919
+ void deserialize_ext_isoforest(ExtIsoForest &output_obj, const wchar_t *input_file_path);
920
+ void serialize_imputer(Imputer &imputer, const wchar_t *output_file_path);
921
+ void deserialize_imputer(Imputer &output_obj, const wchar_t *input_file_path);
922
+ #endif /* _MSC_VER */
923
+ bool has_msvc();
924
+ #endif /* _ENABLE_CEREAL */
925
+
926
+ /* dealloc.cpp */
927
+ void dealloc_IsoForest(IsoForest &model_outputs);
928
+ void dealloc_IsoExtForest(ExtIsoForest &model_outputs_ext); /* NOTE(review): the function name says "IsoExtForest" while the parameter type is ExtIsoForest (word order differs); left as declared because renaming would affect callers. */
929
+ void dealloc_Imputer(Imputer &imputer);