isotree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +25 -0
- data/README.md +111 -0
- data/ext/isotree/ext.cpp +178 -0
- data/ext/isotree/extconf.rb +21 -0
- data/lib/isotree.rb +9 -0
- data/lib/isotree/isolation_forest.rb +94 -0
- data/lib/isotree/version.rb +3 -0
- data/vendor/isotree/LICENSE +25 -0
- data/vendor/isotree/README.md +167 -0
- data/vendor/isotree/src/Makevars +4 -0
- data/vendor/isotree/src/RcppExports.cpp +267 -0
- data/vendor/isotree/src/Rwrapper.cpp +762 -0
- data/vendor/isotree/src/crit.cpp +912 -0
- data/vendor/isotree/src/dealloc.cpp +66 -0
- data/vendor/isotree/src/dist.cpp +749 -0
- data/vendor/isotree/src/extended.cpp +790 -0
- data/vendor/isotree/src/fit_model.cpp +1068 -0
- data/vendor/isotree/src/helpers_iforest.cpp +309 -0
- data/vendor/isotree/src/impute.cpp +1205 -0
- data/vendor/isotree/src/isoforest.cpp +771 -0
- data/vendor/isotree/src/isotree.hpp +929 -0
- data/vendor/isotree/src/merge_models.cpp +116 -0
- data/vendor/isotree/src/mult.cpp +607 -0
- data/vendor/isotree/src/predict.cpp +849 -0
- data/vendor/isotree/src/serialize.cpp +262 -0
- data/vendor/isotree/src/utils.cpp +1574 -0
- metadata +154 -0
@@ -0,0 +1,929 @@
|
|
1
|
+
/* Isolation forests and variations thereof, with adjustments for incorporation
|
2
|
+
* of categorical variables and missing values.
|
3
|
+
* Written for the C++11 standard and aimed at being used in R and Python.
|
4
|
+
*
|
5
|
+
* This library is based on the following works:
|
6
|
+
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
7
|
+
* "Isolation forest."
|
8
|
+
* 2008 Eighth IEEE International Conference on Data Mining. IEEE, 2008.
|
9
|
+
* [2] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
10
|
+
* "Isolation-based anomaly detection."
|
11
|
+
* ACM Transactions on Knowledge Discovery from Data (TKDD) 6.1 (2012): 3.
|
12
|
+
* [3] Hariri, Sahand, Matias Carrasco Kind, and Robert J. Brunner.
|
13
|
+
* "Extended Isolation Forest."
|
14
|
+
* arXiv preprint arXiv:1811.02141 (2018).
|
15
|
+
* [4] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
16
|
+
* "On detecting clustered anomalies using SCiForest."
|
17
|
+
* Joint European Conference on Machine Learning and Knowledge Discovery in Databases. Springer, Berlin, Heidelberg, 2010.
|
18
|
+
* [5] https://sourceforge.net/projects/iforest/
|
19
|
+
* [6] https://math.stackexchange.com/questions/3388518/expected-number-of-paths-required-to-separate-elements-in-a-binary-tree
|
20
|
+
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
21
|
+
* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
|
22
|
+
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
|
+
*
|
24
|
+
* BSD 2-Clause License
|
25
|
+
* Copyright (c) 2019, David Cortes
|
26
|
+
* All rights reserved.
|
27
|
+
* Redistribution and use in source and binary forms, with or without
|
28
|
+
* modification, are permitted provided that the following conditions are met:
|
29
|
+
* * Redistributions of source code must retain the above copyright notice, this
|
30
|
+
* list of conditions and the following disclaimer.
|
31
|
+
* * Redistributions in binary form must reproduce the above copyright notice,
|
32
|
+
* this list of conditions and the following disclaimer in the documentation
|
33
|
+
* and/or other materials provided with the distribution.
|
34
|
+
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
35
|
+
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
36
|
+
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
37
|
+
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
38
|
+
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
39
|
+
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
40
|
+
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
41
|
+
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
42
|
+
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
43
|
+
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
44
|
+
*/
|
45
|
+
|
46
|
+
/* Standard headers */
|
47
|
+
#include <stddef.h>
|
48
|
+
#include <math.h>
|
49
|
+
#include <limits.h>
|
50
|
+
#include <string.h>
|
51
|
+
#include <signal.h>
|
52
|
+
#include <vector>
|
53
|
+
#include <iterator>
|
54
|
+
#include <numeric>
|
55
|
+
#include <algorithm>
|
56
|
+
#include <random>
|
57
|
+
#include <unordered_set>
|
58
|
+
#include <unordered_map>
|
59
|
+
#include <memory>
|
60
|
+
#include <utility>
|
61
|
+
#include <cstdint>
|
62
|
+
#include <iostream>
|
63
|
+
#ifndef _FOR_R
|
64
|
+
#include <stdio.h>
|
65
|
+
#else
|
66
|
+
extern "C" {
|
67
|
+
#include <R_ext/Print.h>
|
68
|
+
}
|
69
|
+
#define printf Rprintf
|
70
|
+
#define fprintf(f, message) REprintf(message)
|
71
|
+
#endif
|
72
|
+
#ifdef _OPENMP
|
73
|
+
#include <omp.h>
|
74
|
+
#endif
|
75
|
+
#ifdef _ENABLE_CEREAL
|
76
|
+
#include <cereal/archives/binary.hpp>
|
77
|
+
#include <cereal/types/vector.hpp>
|
78
|
+
#include <sstream>
|
79
|
+
#include <string>
|
80
|
+
#include <fstream>
|
81
|
+
#endif
|
82
|
+
|
83
|
+
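/* Uses Mersenne-Twister for RNG when '_USE_MERSENNE_TWISTER' is defined (64-bit variant where 'size_t' is at least 64 bits wide); otherwise falls back to the implementation-defined 'std::default_random_engine' */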
|
84
|
+
#ifdef _USE_MERSENNE_TWISTER
|
85
|
+
#if SIZE_MAX >= UINT64_MAX /* 64-bit systems or higher */
|
86
|
+
#define RNG_engine std::mt19937_64
|
87
|
+
#else /* 32-bit systems and non-standard architectures */
|
88
|
+
#define RNG_engine std::mt19937
|
89
|
+
#endif
|
90
|
+
#else
|
91
|
+
#define RNG_engine std::default_random_engine
|
92
|
+
#endif
|
93
|
+
|
94
|
+
/* Short functions */
|
95
|
+
#define ix_parent(ix) (((ix) - 1) / 2) /* integer division takes care of deciding left-right */
|
96
|
+
#define ix_child(ix) (2 * (ix) + 1)
|
97
|
+
/* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
|
98
|
+
#define pow2(n) ( ((size_t) 1) << (n) )
|
99
|
+
#define square(x) ((x) * (x))
|
100
|
+
/* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
|
101
|
+
#define extract_bit(number, bit) (((number) >> (bit)) & 1)
|
102
|
+
#ifndef isinf
|
103
|
+
#define isinf std::isinf
|
104
|
+
#endif
|
105
|
+
#ifndef isnan
|
106
|
+
#define isnan std::isnan
|
107
|
+
#endif
|
108
|
+
#define is_na_or_inf(x) (isnan(x) || isinf(x))
|
109
|
+
|
110
|
+
|
111
|
+
/* Aliasing for compiler optimizations */
|
112
|
+
#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
|
113
|
+
#define restrict __restrict
|
114
|
+
#else
|
115
|
+
#define restrict
|
116
|
+
#endif
|
117
|
+
|
118
|
+
/* MSVC is stuck with an OpenMP version that's 19 years old at the time of writing and does not support unsigned iterators */
|
119
|
+
#ifdef _OPENMP
|
120
|
+
#if (_OPENMP < 200801) || defined(_WIN32) || defined(_WIN64) /* OpenMP < 3.0 */
|
121
|
+
#define size_t_for long
|
122
|
+
#else
|
123
|
+
#define size_t_for size_t
|
124
|
+
#endif
|
125
|
+
#else
|
126
|
+
#define size_t_for size_t
|
127
|
+
#endif
|
128
|
+
|
129
|
+
|
130
|
+
/* Apple at some point decided to drop the OMP library and headers from its compiler distribution
|
131
|
+
* and to alias 'gcc' to 'clang', which work differently when given flags they cannot interpret,
|
132
|
+
* causing installation issues with pretty much all scientific software due to OMP headers that
|
133
|
+
* would normally do nothing. This piece of code is to allow compilation without OMP header. */
|
134
|
+
#ifndef _OPENMP
|
135
|
+
#define omp_get_thread_num() 0
|
136
|
+
#endif
|
137
|
+
|
138
|
+
|
139
|
+
/* For sparse matrices */
|
140
|
+
#ifdef _FOR_R
|
141
|
+
#define sparse_ix int
|
142
|
+
#else
|
143
|
+
#define sparse_ix size_t
|
144
|
+
#endif
|
145
|
+
|
146
|
+
|
147
|
+
/* Types used through the package */
|
148
|
+
/* Unscoped enumerations shared across the package. Enumerator values are
 * implicit: they start at 0 and follow declaration order. */
enum NewCategAction {Weighted, Smallest, Random}; /* 'Weighted' means 'Impute' in the extended model */
enum MissingAction  {Divide, Impute, Fail};       /* 'Divide' is only for the non-extended model */
enum ColType        {Numeric, Categorical, NotUsed};
enum CategSplit     {SubSet, SingleCateg};
enum GainCriterion  {Averaged, Pooled, NoCrit};   /* for guided splits */
typedef GainCriterion Criterion;                  /* second name for the same enumeration */
enum CoefType       {Uniform, Normal};            /* for the extended model */
enum UseDepthImp    {Lower, Higher, Same};        /* for NA imputation */
enum WeighImpRows   {Inverse, Prop, Flat};        /* for NA imputation */
|
156
|
+
|
157
|
+
/* Notes about new categorical action:
|
158
|
+
* - For single-variable case, if using 'Smallest', can then pass data at prediction time
|
159
|
+
* having categories that were never in the training data (as an integer higher than 'ncat'
|
160
|
+
* for that column), but if using 'Random' or 'Weighted', these must be passed as NA (int < 0)
|
161
|
+
* - For extended case, 'Weighted' becomes a weighted imputation instead, and if using either
|
162
|
+
* 'Weighted' or 'Smallest', can pass newer, unseen categories at prediction time too.
|
163
|
+
* - If using 'Random', cannot pass new categories at prediction time.
|
164
|
+
* - If using 'Weighted' for single-variable case, cannot predict similarity with a value
|
165
|
+
* for MissingAction other than 'Divide'. */
|
166
|
+
|
167
|
+
|
168
|
+
/* Structs that are output (modified) from the main function */
|
169
|
+
typedef struct IsoTree {
|
170
|
+
ColType col_type = NotUsed; /* issues with uninitialized values passed to Cereal */
|
171
|
+
size_t col_num;
|
172
|
+
double num_split;
|
173
|
+
std::vector<char> cat_split;
|
174
|
+
int chosen_cat;
|
175
|
+
size_t tree_left;
|
176
|
+
size_t tree_right;
|
177
|
+
double pct_tree_left;
|
178
|
+
double score; /* will not be integer when there are weights or early stop */
|
179
|
+
double range_low = -HUGE_VAL;
|
180
|
+
double range_high = HUGE_VAL;
|
181
|
+
double remainder; /* only used for distance/similarity */
|
182
|
+
|
183
|
+
#ifdef _ENABLE_CEREAL
|
184
|
+
template<class Archive>
|
185
|
+
void serialize(Archive &archive)
|
186
|
+
{
|
187
|
+
archive(
|
188
|
+
this->col_type,
|
189
|
+
this->col_num,
|
190
|
+
this->num_split,
|
191
|
+
this->cat_split,
|
192
|
+
this->chosen_cat,
|
193
|
+
this->tree_left,
|
194
|
+
this->tree_right,
|
195
|
+
this->pct_tree_left,
|
196
|
+
this->score,
|
197
|
+
this->range_low,
|
198
|
+
this->range_high,
|
199
|
+
this->remainder
|
200
|
+
);
|
201
|
+
}
|
202
|
+
#endif
|
203
|
+
|
204
|
+
IsoTree() = default;
|
205
|
+
|
206
|
+
} IsoTree;
|
207
|
+
|
208
|
+
typedef struct IsoHPlane {
|
209
|
+
std::vector<size_t> col_num;
|
210
|
+
std::vector<ColType> col_type;
|
211
|
+
std::vector<double> coef;
|
212
|
+
std::vector<double> mean;
|
213
|
+
std::vector<std::vector<double>> cat_coef;
|
214
|
+
std::vector<int> chosen_cat;
|
215
|
+
std::vector<double> fill_val;
|
216
|
+
std::vector<double> fill_new;
|
217
|
+
|
218
|
+
double split_point;
|
219
|
+
size_t hplane_left;
|
220
|
+
size_t hplane_right;
|
221
|
+
double score; /* will not be integer when there are weights or early stop */
|
222
|
+
double range_low = -HUGE_VAL;
|
223
|
+
double range_high = HUGE_VAL;
|
224
|
+
double remainder; /* only used for distance/similarity */
|
225
|
+
|
226
|
+
#ifdef _ENABLE_CEREAL
|
227
|
+
template<class Archive>
|
228
|
+
void serialize(Archive &archive)
|
229
|
+
{
|
230
|
+
archive(
|
231
|
+
this->col_num,
|
232
|
+
this->col_type,
|
233
|
+
this->coef,
|
234
|
+
this->mean,
|
235
|
+
this->cat_coef,
|
236
|
+
this->chosen_cat,
|
237
|
+
this->fill_val,
|
238
|
+
this->fill_new,
|
239
|
+
this->split_point,
|
240
|
+
this->hplane_left,
|
241
|
+
this->hplane_right,
|
242
|
+
this->score,
|
243
|
+
this->range_low,
|
244
|
+
this->range_high,
|
245
|
+
this->remainder
|
246
|
+
);
|
247
|
+
}
|
248
|
+
#endif
|
249
|
+
|
250
|
+
IsoHPlane() = default;
|
251
|
+
} IsoHPlane;
|
252
|
+
|
253
|
+
/* Note: don't use long doubles in the outside outputs or there will be issues with MINGW in windows */
|
254
|
+
|
255
|
+
|
256
|
+
typedef struct IsoForest {
|
257
|
+
std::vector< std::vector<IsoTree> > trees;
|
258
|
+
NewCategAction new_cat_action;
|
259
|
+
CategSplit cat_split_type;
|
260
|
+
MissingAction missing_action;
|
261
|
+
double exp_avg_depth;
|
262
|
+
double exp_avg_sep;
|
263
|
+
size_t orig_sample_size;
|
264
|
+
|
265
|
+
#ifdef _ENABLE_CEREAL
|
266
|
+
template<class Archive>
|
267
|
+
void serialize(Archive &archive)
|
268
|
+
{
|
269
|
+
archive(
|
270
|
+
this->trees,
|
271
|
+
this->new_cat_action,
|
272
|
+
this->cat_split_type,
|
273
|
+
this->missing_action,
|
274
|
+
this->exp_avg_depth,
|
275
|
+
this->exp_avg_sep,
|
276
|
+
this->orig_sample_size
|
277
|
+
);
|
278
|
+
}
|
279
|
+
#endif
|
280
|
+
|
281
|
+
IsoForest() = default;
|
282
|
+
} IsoForest;
|
283
|
+
|
284
|
+
typedef struct ExtIsoForest {
|
285
|
+
std::vector< std::vector<IsoHPlane> > hplanes;
|
286
|
+
NewCategAction new_cat_action;
|
287
|
+
CategSplit cat_split_type;
|
288
|
+
MissingAction missing_action;
|
289
|
+
double exp_avg_depth;
|
290
|
+
double exp_avg_sep;
|
291
|
+
size_t orig_sample_size;
|
292
|
+
|
293
|
+
#ifdef _ENABLE_CEREAL
|
294
|
+
template<class Archive>
|
295
|
+
void serialize(Archive &archive)
|
296
|
+
{
|
297
|
+
archive(
|
298
|
+
this->hplanes,
|
299
|
+
this->new_cat_action,
|
300
|
+
this->cat_split_type,
|
301
|
+
this->missing_action,
|
302
|
+
this->exp_avg_depth,
|
303
|
+
this->exp_avg_sep,
|
304
|
+
this->orig_sample_size
|
305
|
+
);
|
306
|
+
}
|
307
|
+
#endif
|
308
|
+
|
309
|
+
ExtIsoForest() = default;
|
310
|
+
} ExtIsoForest;
|
311
|
+
|
312
|
+
/* Imputation statistics kept for each tree node.
 *
 * 'parent' now defaults to 0, so a default-constructed node no longer carries
 * an indeterminate value into 'serialize' (which reads every member). Zero is
 * a placeholder only; whether it is a meaningful parent index is up to the
 * code that builds the imputer trees. */
typedef struct ImputeNode {
    std::vector<double>  num_sum;
    std::vector<double>  num_weight;
    std::vector<std::vector<double>>  cat_sum;
    std::vector<double>  cat_weight;
    size_t               parent = 0;

    #ifdef _ENABLE_CEREAL
    /* Passes every member to the Cereal archive, in declaration order. */
    template<class Archive>
    void serialize(Archive &archive)
    {
        archive(
            this->num_sum,
            this->num_weight,
            this->cat_sum,
            this->cat_weight,
            this->parent
            );
    }
    #endif

    ImputeNode() = default;

    /* Creates a node with empty statistics that records 'parent'.
     * Deliberately left non-explicit: callers outside this view may rely on
     * the implicit conversion from 'size_t'. */
    ImputeNode(size_t parent) : parent(parent) {}

} ImputeNode; /* this is for each tree node */
|
340
|
+
|
341
|
+
typedef struct Imputer {
|
342
|
+
size_t ncols_numeric;
|
343
|
+
size_t ncols_categ;
|
344
|
+
std::vector<int> ncat;
|
345
|
+
std::vector<std::vector<ImputeNode>> imputer_tree;
|
346
|
+
std::vector<double> col_means;
|
347
|
+
std::vector<int> col_modes;
|
348
|
+
|
349
|
+
#ifdef _ENABLE_CEREAL
|
350
|
+
template<class Archive>
|
351
|
+
void serialize(Archive &archive)
|
352
|
+
{
|
353
|
+
archive(
|
354
|
+
this->ncols_numeric,
|
355
|
+
this->ncols_categ,
|
356
|
+
this->ncat,
|
357
|
+
this->imputer_tree,
|
358
|
+
this->col_means,
|
359
|
+
this->col_modes
|
360
|
+
);
|
361
|
+
}
|
362
|
+
#endif
|
363
|
+
|
364
|
+
Imputer() = default;
|
365
|
+
|
366
|
+
} Imputer;
|
367
|
+
|
368
|
+
|
369
|
+
/* Structs that are only used internally */
|
370
|
+
typedef struct {
|
371
|
+
double* numeric_data;
|
372
|
+
size_t ncols_numeric;
|
373
|
+
int* categ_data;
|
374
|
+
int* ncat;
|
375
|
+
int max_categ;
|
376
|
+
size_t ncols_categ;
|
377
|
+
size_t nrows;
|
378
|
+
size_t ncols_tot;
|
379
|
+
double* sample_weights;
|
380
|
+
bool weight_as_sample;
|
381
|
+
double* col_weights;
|
382
|
+
double* Xc; /* only for sparse matrices */
|
383
|
+
sparse_ix* Xc_ind; /* only for sparse matrices */
|
384
|
+
sparse_ix* Xc_indptr; /* only for sparse matrices */
|
385
|
+
size_t log2_n; /* only when using weights for sampling */
|
386
|
+
size_t btree_offset; /* only when using weights for sampling */
|
387
|
+
std::vector<double> btree_weights_init; /* only when using weights for sampling */
|
388
|
+
std::vector<char> has_missing; /* only used when producing missing imputations on-the-fly */
|
389
|
+
size_t n_missing; /* only used when producing missing imputations on-the-fly */
|
390
|
+
} InputData;
|
391
|
+
|
392
|
+
|
393
|
+
typedef struct {
|
394
|
+
double* numeric_data;
|
395
|
+
int* categ_data;
|
396
|
+
size_t nrows;
|
397
|
+
double* Xc; /* only for sparse matrices */
|
398
|
+
sparse_ix* Xc_ind; /* only for sparse matrices */
|
399
|
+
sparse_ix* Xc_indptr; /* only for sparse matrices */
|
400
|
+
double* Xr; /* only for sparse matrices */
|
401
|
+
sparse_ix* Xr_ind; /* only for sparse matrices */
|
402
|
+
sparse_ix* Xr_indptr; /* only for sparse matrices */
|
403
|
+
} PredictionData;
|
404
|
+
|
405
|
+
typedef struct {
|
406
|
+
bool with_replacement;
|
407
|
+
size_t sample_size;
|
408
|
+
size_t ntrees;
|
409
|
+
size_t max_depth;
|
410
|
+
bool penalize_range;
|
411
|
+
uint64_t random_seed;
|
412
|
+
bool weigh_by_kurt;
|
413
|
+
double prob_pick_by_gain_avg;
|
414
|
+
double prob_split_by_gain_avg;
|
415
|
+
double prob_pick_by_gain_pl;
|
416
|
+
double prob_split_by_gain_pl;
|
417
|
+
double min_gain;
|
418
|
+
CategSplit cat_split_type;
|
419
|
+
NewCategAction new_cat_action;
|
420
|
+
MissingAction missing_action;
|
421
|
+
bool all_perm;
|
422
|
+
|
423
|
+
size_t ndim; /* only for extended model */
|
424
|
+
size_t ntry; /* only for extended model */
|
425
|
+
CoefType coef_type; /* only for extended model */
|
426
|
+
bool coef_by_prop; /* only for extended model */
|
427
|
+
|
428
|
+
bool calc_dist; /* checkbox for calculating distances on-the-fly */
|
429
|
+
bool calc_depth; /* checkbox for calculating depths on-the-fly */
|
430
|
+
bool impute_at_fit; /* checkbox for producing imputed missing values on-the-fly */
|
431
|
+
|
432
|
+
UseDepthImp depth_imp; /* only when building NA imputer */
|
433
|
+
WeighImpRows weigh_imp_rows; /* only when building NA imputer */
|
434
|
+
size_t min_imp_obs; /* only when building NA imputer */
|
435
|
+
} ModelParams;
|
436
|
+
|
437
|
+
typedef struct ImputedData {
|
438
|
+
std::vector<long double> num_sum;
|
439
|
+
std::vector<long double> num_weight;
|
440
|
+
std::vector<std::vector<long double>> cat_sum;
|
441
|
+
std::vector<long double> cat_weight;
|
442
|
+
std::vector<long double> sp_num_sum;
|
443
|
+
std::vector<long double> sp_num_weight;
|
444
|
+
|
445
|
+
std::vector<size_t> missing_num;
|
446
|
+
std::vector<size_t> missing_cat;
|
447
|
+
std::vector<sparse_ix> missing_sp;
|
448
|
+
size_t n_missing_num;
|
449
|
+
size_t n_missing_cat;
|
450
|
+
size_t n_missing_sp;
|
451
|
+
|
452
|
+
ImputedData() {};
|
453
|
+
|
454
|
+
ImputedData(InputData &input_data, size_t row);
|
455
|
+
|
456
|
+
} ImputedData;
|
457
|
+
|
458
|
+
typedef struct {
|
459
|
+
std::vector<size_t> ix_arr;
|
460
|
+
std::vector<size_t> ix_all;
|
461
|
+
RNG_engine rnd_generator;
|
462
|
+
std::uniform_int_distribution<size_t> runif;
|
463
|
+
std::uniform_real_distribution<double> rbin;
|
464
|
+
size_t st;
|
465
|
+
size_t end;
|
466
|
+
size_t st_NA;
|
467
|
+
size_t end_NA;
|
468
|
+
size_t split_ix;
|
469
|
+
std::unordered_map<size_t, double> weights_map;
|
470
|
+
std::vector<double> weights_arr; /* when not ignoring NAs and when using weights as density */
|
471
|
+
double xmin;
|
472
|
+
double xmax;
|
473
|
+
size_t npresent; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
|
474
|
+
bool unsplittable;
|
475
|
+
std::vector<bool> is_repeated;
|
476
|
+
std::vector<char> categs;
|
477
|
+
size_t ncols_tried; /* 'npresent' and 'ncols_tried' are used interchangeable and for unrelated things */
|
478
|
+
int ncat_tried;
|
479
|
+
std::vector<bool> cols_possible;
|
480
|
+
std::vector<double> btree_weights; /* only when using weights for sampling */
|
481
|
+
std::discrete_distribution<size_t> col_sampler; /* columns can get eliminated, keep a copy for each thread */
|
482
|
+
|
483
|
+
/* for split criterion */
|
484
|
+
std::vector<double> buffer_dbl;
|
485
|
+
std::vector<size_t> buffer_szt;
|
486
|
+
std::vector<char> buffer_chr;
|
487
|
+
double prob_split_type;
|
488
|
+
GainCriterion criterion;
|
489
|
+
double this_gain;
|
490
|
+
double this_split_point;
|
491
|
+
int this_categ;
|
492
|
+
std::vector<char> this_split_categ;
|
493
|
+
bool determine_split;
|
494
|
+
|
495
|
+
/* for the extended model */
|
496
|
+
size_t ntry;
|
497
|
+
size_t ntaken;
|
498
|
+
size_t ntaken_best;
|
499
|
+
bool tried_all;
|
500
|
+
size_t col_chosen;
|
501
|
+
ColType col_type;
|
502
|
+
double ext_sd;
|
503
|
+
std::vector<size_t> cols_shuffled;
|
504
|
+
std::vector<double> comb_val;
|
505
|
+
std::vector<size_t> col_take;
|
506
|
+
std::vector<ColType> col_take_type;
|
507
|
+
std::vector<double> ext_offset;
|
508
|
+
std::vector<double> ext_coef;
|
509
|
+
std::vector<double> ext_mean;
|
510
|
+
std::vector<double> ext_fill_val;
|
511
|
+
std::vector<double> ext_fill_new;
|
512
|
+
std::vector<int> chosen_cat;
|
513
|
+
std::vector<std::vector<double>> ext_cat_coef;
|
514
|
+
std::uniform_real_distribution<double> coef_unif;
|
515
|
+
std::normal_distribution<double> coef_norm;
|
516
|
+
|
517
|
+
/* for similarity/distance calculations */
|
518
|
+
std::vector<double> tmat_sep;
|
519
|
+
|
520
|
+
/* when calculating average depth on-the-fly */
|
521
|
+
std::vector<double> row_depths;
|
522
|
+
|
523
|
+
/* when imputing NAs on-the-fly */
|
524
|
+
std::vector<ImputedData> impute_vec;
|
525
|
+
std::unordered_map<size_t, ImputedData> impute_map;
|
526
|
+
|
527
|
+
} WorkerMemory;
|
528
|
+
|
529
|
+
/* Internal scratch space passed by reference to the similarity/distance
 * traversal routines. Member order is unchanged and no default initializers
 * are added, so the type keeps behaving as a plain aggregate. */
struct WorkerForSimilarity {
    std::vector<size_t>  ix_arr;
    size_t               st;
    size_t               end;
    std::vector<double>  weights_arr;
    std::vector<double>  comb_val;
    std::vector<double>  tmat_sep;
    std::vector<double>  rmat;
    size_t               n_from;
    bool                 assume_full_distr; /* doesn't need to have one copy per worker */
};
|
540
|
+
|
541
|
+
/* Internal record whose members mirror a subset of 'WorkerMemory' (same names
 * and types) - presumably a snapshot taken around recursive splits; confirm
 * against the code that fills it. Member order is unchanged and no default
 * initializers are added, so the type keeps behaving as a plain aggregate. */
struct RecursionState {
    size_t               st;
    size_t               st_NA;
    size_t               end_NA;
    size_t               split_ix;
    size_t               end;
    std::vector<size_t>  ix_arr;
    std::unordered_map<size_t, double>  weights_map;
    std::vector<double>  weights_arr;
    std::vector<bool>    cols_possible;
    std::discrete_distribution<size_t>  col_sampler;
};
|
553
|
+
|
554
|
+
/* Function prototypes */
|
555
|
+
|
556
|
+
/* fit_model.cpp */
|
557
|
+
extern bool interrupt_switch;
|
558
|
+
int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
559
|
+
double numeric_data[], size_t ncols_numeric,
|
560
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
561
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
562
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
563
|
+
double sample_weights[], bool with_replacement, bool weight_as_sample,
|
564
|
+
size_t nrows, size_t sample_size, size_t ntrees, size_t max_depth,
|
565
|
+
bool limit_depth, bool penalize_range,
|
566
|
+
bool standardize_dist, double tmat[],
|
567
|
+
double output_depths[], bool standardize_depth,
|
568
|
+
double col_weights[], bool weigh_by_kurt,
|
569
|
+
double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
|
570
|
+
double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
|
571
|
+
double min_gain, MissingAction missing_action,
|
572
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
573
|
+
bool all_perm, Imputer *imputer, size_t min_imp_obs,
|
574
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows, bool impute_at_fit,
|
575
|
+
uint64_t random_seed, int nthreads);
|
576
|
+
int add_tree(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
577
|
+
double numeric_data[], size_t ncols_numeric,
|
578
|
+
int categ_data[], size_t ncols_categ, int ncat[],
|
579
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
580
|
+
size_t ndim, size_t ntry, CoefType coef_type, bool coef_by_prop,
|
581
|
+
double sample_weights[], size_t nrows, size_t max_depth,
|
582
|
+
bool limit_depth, bool penalize_range,
|
583
|
+
double col_weights[], bool weigh_by_kurt,
|
584
|
+
double prob_pick_by_gain_avg, double prob_split_by_gain_avg,
|
585
|
+
double prob_pick_by_gain_pl, double prob_split_by_gain_pl,
|
586
|
+
double min_gain, MissingAction missing_action,
|
587
|
+
CategSplit cat_split_type, NewCategAction new_cat_action,
|
588
|
+
UseDepthImp depth_imp, WeighImpRows weigh_imp_rows,
|
589
|
+
bool all_perm, std::vector<ImputeNode> *impute_nodes, size_t min_imp_obs,
|
590
|
+
uint64_t random_seed);
|
591
|
+
void fit_itree(std::vector<IsoTree> *tree_root,
|
592
|
+
std::vector<IsoHPlane> *hplane_root,
|
593
|
+
WorkerMemory &workspace,
|
594
|
+
InputData &input_data,
|
595
|
+
ModelParams &model_params,
|
596
|
+
std::vector<ImputeNode> *impute_nodes,
|
597
|
+
size_t tree_num);
|
598
|
+
|
599
|
+
/* isoforest.cpp */
|
600
|
+
void split_itree_recursive(std::vector<IsoTree> &trees,
|
601
|
+
WorkerMemory &workspace,
|
602
|
+
InputData &input_data,
|
603
|
+
ModelParams &model_params,
|
604
|
+
std::vector<ImputeNode> *impute_nodes,
|
605
|
+
size_t curr_depth);
|
606
|
+
|
607
|
+
/* extended.cpp */
|
608
|
+
void split_hplane_recursive(std::vector<IsoHPlane> &hplanes,
|
609
|
+
WorkerMemory &workspace,
|
610
|
+
InputData &input_data,
|
611
|
+
ModelParams &model_params,
|
612
|
+
std::vector<ImputeNode> *impute_nodes,
|
613
|
+
size_t curr_depth);
|
614
|
+
void add_chosen_column(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params,
|
615
|
+
std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s);
|
616
|
+
void shrink_to_fit_hplane(IsoHPlane &hplane, bool clear_vectors);
|
617
|
+
void simplify_hplane(IsoHPlane &hplane, WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
|
618
|
+
|
619
|
+
|
620
|
+
/* predict.cpp */
|
621
|
+
void predict_iforest(double numeric_data[], int categ_data[],
|
622
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
623
|
+
double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
624
|
+
size_t nrows, int nthreads, bool standardize,
|
625
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
626
|
+
double output_depths[], sparse_ix tree_num[]);
|
627
|
+
void traverse_itree_no_recurse(std::vector<IsoTree> &tree,
|
628
|
+
IsoForest &model_outputs,
|
629
|
+
PredictionData &prediction_data,
|
630
|
+
double &output_depth,
|
631
|
+
sparse_ix *restrict tree_num,
|
632
|
+
size_t row);
|
633
|
+
double traverse_itree(std::vector<IsoTree> &tree,
|
634
|
+
IsoForest &model_outputs,
|
635
|
+
PredictionData &prediction_data,
|
636
|
+
std::vector<ImputeNode> *impute_nodes,
|
637
|
+
ImputedData *imputed_data,
|
638
|
+
double curr_weight,
|
639
|
+
size_t row,
|
640
|
+
sparse_ix *restrict tree_num,
|
641
|
+
size_t curr_lev);
|
642
|
+
void traverse_hplane_fast(std::vector<IsoHPlane> &hplane,
|
643
|
+
ExtIsoForest &model_outputs,
|
644
|
+
PredictionData &prediction_data,
|
645
|
+
double &output_depth,
|
646
|
+
sparse_ix *restrict tree_num,
|
647
|
+
size_t row);
|
648
|
+
void traverse_hplane(std::vector<IsoHPlane> &hplane,
|
649
|
+
ExtIsoForest &model_outputs,
|
650
|
+
PredictionData &prediction_data,
|
651
|
+
double &output_depth,
|
652
|
+
std::vector<ImputeNode> *impute_nodes,
|
653
|
+
ImputedData *imputed_data,
|
654
|
+
sparse_ix *restrict tree_num,
|
655
|
+
size_t row);
|
656
|
+
double extract_spC(PredictionData &prediction_data, size_t row, size_t col_num);
|
657
|
+
double extract_spR(PredictionData &prediction_data, sparse_ix *row_st, sparse_ix *row_end, size_t col_num);
|
658
|
+
void get_num_nodes(IsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads);
|
659
|
+
void get_num_nodes(ExtIsoForest &model_outputs, sparse_ix *restrict n_nodes, sparse_ix *restrict n_terminal, int nthreads);
|
660
|
+
|
661
|
+
/* dist.cpp */
|
662
|
+
void calc_similarity(double numeric_data[], int categ_data[],
|
663
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
664
|
+
size_t nrows, int nthreads, bool assume_full_distr, bool standardize_dist,
|
665
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
666
|
+
double tmat[], double rmat[], size_t n_from);
|
667
|
+
void traverse_tree_sim(WorkerForSimilarity &workspace,
|
668
|
+
PredictionData &prediction_data,
|
669
|
+
IsoForest &model_outputs,
|
670
|
+
std::vector<IsoTree> &trees,
|
671
|
+
size_t curr_tree);
|
672
|
+
void traverse_hplane_sim(WorkerForSimilarity &workspace,
|
673
|
+
PredictionData &prediction_data,
|
674
|
+
ExtIsoForest &model_outputs,
|
675
|
+
std::vector<IsoHPlane> &hplanes,
|
676
|
+
size_t curr_tree);
|
677
|
+
void gather_sim_result(std::vector<WorkerForSimilarity> *worker_memory,
|
678
|
+
std::vector<WorkerMemory> *worker_memory_m,
|
679
|
+
PredictionData *prediction_data, InputData *input_data,
|
680
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
681
|
+
double *restrict tmat, double *restrict rmat, size_t n_from,
|
682
|
+
size_t ntrees, bool assume_full_distr,
|
683
|
+
bool standardize_dist, int nthreads);
|
684
|
+
void initialize_worker_for_sim(WorkerForSimilarity &workspace,
|
685
|
+
PredictionData &prediction_data,
|
686
|
+
IsoForest *model_outputs,
|
687
|
+
ExtIsoForest *model_outputs_ext,
|
688
|
+
size_t n_from,
|
689
|
+
bool assume_full_distr);
|
690
|
+
|
691
|
+
/* impute.cpp */
|
692
|
+
void impute_missing_values(double numeric_data[], int categ_data[],
|
693
|
+
double Xr[], sparse_ix Xr_ind[], sparse_ix Xr_indptr[],
|
694
|
+
size_t nrows, int nthreads,
|
695
|
+
IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
696
|
+
Imputer &imputer);
|
697
|
+
void initialize_imputer(Imputer &imputer, InputData &input_data, size_t ntrees, int nthreads);
|
698
|
+
void build_impute_node(ImputeNode &imputer, WorkerMemory &workspace,
|
699
|
+
InputData &input_data, ModelParams &model_params,
|
700
|
+
std::vector<ImputeNode> &imputer_tree,
|
701
|
+
size_t curr_depth, size_t min_imp_obs);
|
702
|
+
void shrink_impute_node(ImputeNode &imputer);
|
703
|
+
void drop_nonterminal_imp_node(std::vector<ImputeNode> &imputer_tree,
|
704
|
+
std::vector<IsoTree> *trees,
|
705
|
+
std::vector<IsoHPlane> *hplanes);
|
706
|
+
void combine_imp_single(ImputedData &imp_addfrom, ImputedData &imp_addto);
|
707
|
+
void combine_tree_imputations(WorkerMemory &workspace,
|
708
|
+
std::vector<ImputedData> &impute_vec,
|
709
|
+
std::unordered_map<size_t, ImputedData> &impute_map,
|
710
|
+
std::vector<char> &has_missing,
|
711
|
+
int nthreads);
|
712
|
+
void add_from_impute_node(ImputeNode &imputer, ImputedData &imputed_data, double w);
|
713
|
+
void add_from_impute_node(ImputeNode &imputer, WorkerMemory &workspace, InputData &input_data);
|
714
|
+
template <class imp_arr>
|
715
|
+
void apply_imputation_results(imp_arr &impute_vec,
|
716
|
+
Imputer &imputer,
|
717
|
+
InputData &input_data,
|
718
|
+
int nthreads);
|
719
|
+
void apply_imputation_results(std::vector<ImputedData> &impute_vec,
|
720
|
+
std::unordered_map<size_t, ImputedData> &impute_map,
|
721
|
+
Imputer &imputer,
|
722
|
+
InputData &input_data,
|
723
|
+
int nthreads);
|
724
|
+
void apply_imputation_results(PredictionData &prediction_data,
|
725
|
+
ImputedData &imp,
|
726
|
+
Imputer &imputer,
|
727
|
+
size_t row);
|
728
|
+
void initialize_impute_calc(ImputedData &imp, InputData &input_data, size_t row);
|
729
|
+
void initialize_impute_calc(ImputedData &imp, PredictionData &prediction_data, Imputer &imputer, size_t row);
|
730
|
+
void allocate_imp_vec(std::vector<ImputedData> &impute_vec, InputData &input_data, int nthreads);
|
731
|
+
void allocate_imp_map(std::unordered_map<size_t, ImputedData> &impute_map, InputData &input_data);
|
732
|
+
void allocate_imp(InputData &input_data,
|
733
|
+
std::vector<ImputedData> &impute_vec,
|
734
|
+
std::unordered_map<size_t, ImputedData> &impute_map,
|
735
|
+
int nthreads);
|
736
|
+
void check_for_missing(InputData &input_data,
|
737
|
+
std::vector<ImputedData> &impute_vec,
|
738
|
+
std::unordered_map<size_t, ImputedData> &impute_map,
|
739
|
+
int nthreads);
|
740
|
+
size_t check_for_missing(PredictionData &prediction_data,
|
741
|
+
Imputer &imputer,
|
742
|
+
size_t ix_arr[],
|
743
|
+
int nthreads);
|
744
|
+
|
745
|
+
/* helpers_iforest.cpp */
|
746
|
+
void decide_column(size_t ncols_numeric, size_t ncols_categ, size_t &col_chosen, ColType &col_type,
|
747
|
+
RNG_engine &rnd_generator, std::uniform_int_distribution<size_t> &runif,
|
748
|
+
std::discrete_distribution<size_t> &col_sampler);
|
749
|
+
void add_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data);
|
750
|
+
void add_unsplittable_col(WorkerMemory &workspace, InputData &input_data);
|
751
|
+
bool check_is_not_unsplittable_col(WorkerMemory &workspace, IsoTree &tree, InputData &input_data);
|
752
|
+
void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params, IsoTree &tree);
|
753
|
+
void get_split_range(WorkerMemory &workspace, InputData &input_data, ModelParams &model_params);
|
754
|
+
int choose_cat_from_present(WorkerMemory &workspace, InputData &input_data, size_t col_num);
|
755
|
+
void update_col_sampler(WorkerMemory &workspace, InputData &input_data);
|
756
|
+
bool is_col_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
|
757
|
+
InputData &input_data, size_t col_num, ColType col_type);
|
758
|
+
void set_col_as_taken(std::vector<bool> &col_is_taken, std::unordered_set<size_t> &col_is_taken_s,
|
759
|
+
InputData &input_data, size_t col_num, ColType col_type);
|
760
|
+
void add_separation_step(WorkerMemory &workspace, InputData &input_data, double remainder);
|
761
|
+
void add_remainder_separation_steps(WorkerMemory &workspace, InputData &input_data, long double sum_weight);
|
762
|
+
void remap_terminal_trees(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
763
|
+
PredictionData &prediction_data, sparse_ix *restrict tree_num, int nthreads);
|
764
|
+
void backup_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state);
|
765
|
+
void restore_recursion_state(WorkerMemory &workspace, RecursionState &recursion_state);
|
766
|
+
|
767
|
+
|
768
|
+
/* utils.cpp */
|
769
|
+
size_t log2ceil(size_t x);
|
770
|
+
double harmonic(size_t n);
|
771
|
+
double harmonic_recursive(double a, double b);
|
772
|
+
double expected_avg_depth(size_t sample_size);
|
773
|
+
double expected_avg_depth(long double approx_sample_size);
|
774
|
+
double expected_separation_depth(size_t n);
|
775
|
+
double expected_separation_depth_hotstart(double curr, size_t n_curr, size_t n_final);
|
776
|
+
double expected_separation_depth(long double n);
|
777
|
+
void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n, double counter[], double exp_remainder);
|
778
|
+
void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
|
779
|
+
double *restrict counter, double *restrict weights, double exp_remainder);
|
780
|
+
void increase_comb_counter(size_t ix_arr[], size_t st, size_t end, size_t n,
|
781
|
+
double counter[], std::unordered_map<size_t, double> &weights, double exp_remainder);
|
782
|
+
void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
|
783
|
+
double counter[], double exp_remainder);
|
784
|
+
void increase_comb_counter_in_groups(size_t ix_arr[], size_t st, size_t end, size_t split_ix, size_t n,
|
785
|
+
double *restrict counter, double *restrict weights, double exp_remainder);
|
786
|
+
void tmat_to_dense(double *restrict tmat, double *restrict dmat, size_t n, bool diag_to_one);
|
787
|
+
double calc_sd_raw(size_t cnt, long double sum, long double sum_sq);
|
788
|
+
long double calc_sd_raw_l(size_t cnt, long double sum, long double sum_sq);
|
789
|
+
void build_btree_sampler(std::vector<double> &btree_weights, double *restrict sample_weights,
|
790
|
+
size_t nrows, size_t &log2_n, size_t &btree_offset);
|
791
|
+
void sample_random_rows(std::vector<size_t> &ix_arr, size_t nrows, bool with_replacement,
|
792
|
+
RNG_engine &rnd_generator, std::vector<size_t> &ix_all,
|
793
|
+
double sample_weights[], std::vector<double> &btree_weights,
|
794
|
+
size_t log2_n, size_t btree_offset, std::vector<bool> &is_repeated);
|
795
|
+
void weighted_shuffle(size_t *restrict outp, size_t n, double *restrict weights, double *restrict buffer_arr, RNG_engine &rnd_generator);
|
796
|
+
size_t divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point);
|
797
|
+
void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point,
|
798
|
+
MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
799
|
+
void divide_subset_split(size_t ix_arr[], size_t st, size_t end, size_t col_num,
|
800
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[], double split_point,
|
801
|
+
MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
802
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char split_categ[],
|
803
|
+
MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
804
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char split_categ[],
|
805
|
+
int ncat, MissingAction missing_action, NewCategAction new_cat_action,
|
806
|
+
bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
807
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_categ,
|
808
|
+
MissingAction missing_action, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
809
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end,
|
810
|
+
MissingAction missing_action, NewCategAction new_cat_action,
|
811
|
+
bool move_new_to_left, size_t &st_NA, size_t &end_NA, size_t &split_ix);
|
812
|
+
void get_range(size_t ix_arr[], double x[], size_t st, size_t end,
|
813
|
+
MissingAction missing_action, double &xmin, double &xmax, bool &unsplittable);
|
814
|
+
void get_range(size_t ix_arr[], size_t st, size_t end, size_t col_num,
|
815
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
816
|
+
MissingAction missing_action, double &xmin, double &xmax, bool &unsplittable);
|
817
|
+
void get_categs(size_t ix_arr[], int x[], size_t st, size_t end, int ncat,
|
818
|
+
MissingAction missing_action, char categs[], size_t &npresent, bool &unsplittable);
|
819
|
+
long double calculate_sum_weights(std::vector<size_t> &ix_arr, size_t st, size_t end, size_t curr_depth,
|
820
|
+
std::vector<double> &weights_arr, std::unordered_map<size_t, double> &weights_map);
|
821
|
+
void set_interrup_global_variable(int s);
|
822
|
+
int return_EXIT_SUCCESS();
|
823
|
+
int return_EXIT_FAILURE();
|
824
|
+
|
825
|
+
|
826
|
+
|
827
|
+
size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, double x[]);
|
828
|
+
size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, size_t col_num, double Xc[], size_t Xc_ind[], size_t Xc_indptr[]);
|
829
|
+
size_t move_NAs_to_front(size_t ix_arr[], size_t st, size_t end, int x[]);
|
830
|
+
size_t center_NAs(size_t *restrict ix_arr, size_t st_left, size_t st, size_t curr_pos);
|
831
|
+
void todense(size_t ix_arr[], size_t st, size_t end,
|
832
|
+
size_t col_num, double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
833
|
+
double *restrict buffer_arr);
|
834
|
+
|
835
|
+
/* mult.cpp */
|
836
|
+
void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double *restrict x,
|
837
|
+
MissingAction missing_action, double &x_sd, double &x_mean);
|
838
|
+
void calc_mean_and_sd(size_t ix_arr[], size_t st, size_t end, size_t col_num,
|
839
|
+
double *restrict Xc, sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
840
|
+
double &x_sd, double &x_mean);
|
841
|
+
void add_linear_comb(size_t ix_arr[], size_t st, size_t end, double *restrict res,
|
842
|
+
double *restrict x, double &coef, double x_sd, double x_mean, double &fill_val,
|
843
|
+
MissingAction missing_action, double *restrict buffer_arr,
|
844
|
+
size_t *restrict buffer_NAs, bool first_run);
|
845
|
+
void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, size_t col_num, double *restrict res,
|
846
|
+
double *restrict Xc, sparse_ix *restrict Xc_ind, sparse_ix *restrict Xc_indptr,
|
847
|
+
double &coef, double x_sd, double x_mean, double &fill_val, MissingAction missing_action,
|
848
|
+
double *restrict buffer_arr, size_t *restrict buffer_NAs, bool first_run);
|
849
|
+
void add_linear_comb(size_t *restrict ix_arr, size_t st, size_t end, double *restrict res,
|
850
|
+
int x[], int ncat, double *restrict cat_coef, double single_cat_coef, int chosen_cat,
|
851
|
+
double &fill_val, double &fill_new, size_t *restrict buffer_cnt, size_t *restrict buffer_pos,
|
852
|
+
NewCategAction new_cat_action, MissingAction missing_action, CategSplit cat_split_type, bool first_run);
|
853
|
+
|
854
|
+
/* crit.cpp */
|
855
|
+
double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, double x[], MissingAction missing_action);
|
856
|
+
double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, size_t col_num,
|
857
|
+
double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
858
|
+
MissingAction missing_action);
|
859
|
+
double calc_kurtosis(size_t ix_arr[], size_t st, size_t end, int x[], int ncat, size_t buffer_cnt[], double buffer_prob[],
|
860
|
+
MissingAction missing_action, CategSplit cat_split_type, RNG_engine &rnd_generator);
|
861
|
+
double expected_sd_cat(double p[], size_t n, size_t pos[]);
|
862
|
+
double expected_sd_cat(size_t counts[], double p[], size_t n, size_t pos[]);
|
863
|
+
double expected_sd_cat_single(size_t counts[], double p[], size_t n, size_t pos[], size_t cat_exclude, size_t cnt);
|
864
|
+
double numeric_gain(size_t cnt_left, size_t cnt_right,
|
865
|
+
long double sum_left, long double sum_right,
|
866
|
+
long double sum_sq_left, long double sum_sq_right,
|
867
|
+
double sd_full, long double cnt);
|
868
|
+
double numeric_gain_no_div(size_t cnt_left, size_t cnt_right,
|
869
|
+
long double sum_left, long double sum_right,
|
870
|
+
long double sum_sq_left, long double sum_sq_right,
|
871
|
+
double sd_full, long double cnt);
|
872
|
+
double categ_gain(size_t cnt_left, size_t cnt_right,
|
873
|
+
long double s_left, long double s_right,
|
874
|
+
long double base_info, long double cnt);
|
875
|
+
double eval_guided_crit(double *restrict x, size_t n, GainCriterion criterion, double min_gain,
|
876
|
+
double &split_point, double &xmin, double &xmax);
|
877
|
+
double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x,
|
878
|
+
size_t &split_ix, double &split_point, double &xmin, double &xmax,
|
879
|
+
GainCriterion criterion, double min_gain, MissingAction missing_action);
|
880
|
+
double eval_guided_crit(size_t ix_arr[], size_t st, size_t end,
|
881
|
+
size_t col_num, double Xc[], sparse_ix Xc_ind[], sparse_ix Xc_indptr[],
|
882
|
+
double buffer_arr[], size_t buffer_pos[],
|
883
|
+
double &split_point, double &xmin, double &xmax,
|
884
|
+
GainCriterion criterion, double min_gain, MissingAction missing_action);
|
885
|
+
double eval_guided_crit(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int ncat,
|
886
|
+
size_t *restrict buffer_cnt, size_t *restrict buffer_pos, double *restrict buffer_prob,
|
887
|
+
int &chosen_cat, char *restrict split_categ, char *restrict buffer_split,
|
888
|
+
GainCriterion criterion, double min_gain, bool all_perm, MissingAction missing_action, CategSplit cat_split_type);
|
889
|
+
|
890
|
+
/* merge_models.cpp */
|
891
|
+
void merge_models(IsoForest* model, IsoForest* other,
|
892
|
+
ExtIsoForest* ext_model, ExtIsoForest* ext_other,
|
893
|
+
Imputer* imputer, Imputer* iother);
|
894
|
+
|
895
|
+
#ifdef _ENABLE_CEREAL
|
896
|
+
/* serialize.cpp */
|
897
|
+
void serialize_isoforest(IsoForest &model, std::ostream &output);
|
898
|
+
void serialize_isoforest(IsoForest &model, const char *output_file_path);
|
899
|
+
std::string serialize_isoforest(IsoForest &model);
|
900
|
+
void deserialize_isoforest(IsoForest &output_obj, std::istream &serialized);
|
901
|
+
void deserialize_isoforest(IsoForest &output_obj, const char *input_file_path);
|
902
|
+
void deserialize_isoforest(IsoForest &output_obj, std::string &serialized, bool move_str);
|
903
|
+
void serialize_ext_isoforest(ExtIsoForest &model, std::ostream &output);
|
904
|
+
void serialize_ext_isoforest(ExtIsoForest &model, const char *output_file_path);
|
905
|
+
std::string serialize_ext_isoforest(ExtIsoForest &model);
|
906
|
+
void deserialize_ext_isoforest(ExtIsoForest &output_obj, std::istream &serialized);
|
907
|
+
void deserialize_ext_isoforest(ExtIsoForest &output_obj, const char *input_file_path);
|
908
|
+
void deserialize_ext_isoforest(ExtIsoForest &output_obj, std::string &serialized, bool move_str);
|
909
|
+
void serialize_imputer(Imputer &imputer, std::ostream &output);
|
910
|
+
void serialize_imputer(Imputer &imputer, const char *output_file_path);
|
911
|
+
std::string serialize_imputer(Imputer &imputer);
|
912
|
+
void deserialize_imputer(Imputer &output_obj, std::istream &serialized);
|
913
|
+
void deserialize_imputer(Imputer &output_obj, const char *input_file_path);
|
914
|
+
void deserialize_imputer(Imputer &output_obj, std::string &serialized, bool move_str);
|
915
|
+
#ifdef _MSC_VER
|
916
|
+
void serialize_isoforest(IsoForest &model, const wchar_t *output_file_path);
|
917
|
+
void deserialize_isoforest(IsoForest &output_obj, const wchar_t *input_file_path);
|
918
|
+
void serialize_ext_isoforest(ExtIsoForest &model, const wchar_t *output_file_path);
|
919
|
+
void deserialize_ext_isoforest(ExtIsoForest &output_obj, const wchar_t *input_file_path);
|
920
|
+
void serialize_imputer(Imputer &imputer, const wchar_t *output_file_path);
|
921
|
+
void deserialize_imputer(Imputer &output_obj, const wchar_t *input_file_path);
|
922
|
+
#endif /* _MSC_VER */
|
923
|
+
bool has_msvc();
|
924
|
+
#endif /* _ENABLE_CEREAL */
|
925
|
+
|
926
|
+
/* dealloc.cpp */
|
927
|
+
void dealloc_IsoForest(IsoForest &model_outputs);
|
928
|
+
void dealloc_IsoExtForest(ExtIsoForest &model_outputs_ext);
|
929
|
+
void dealloc_Imputer(Imputer &imputer);
|