outliertree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,758 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
10
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
11
+ * arXiv preprint arXiv:2001.00636 (2020).
12
+ *
13
+ *
14
+ * Copyright 2020 David Cortes.
15
+ *
16
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
+ * such as R or Python.
18
+ *
19
+ * This file is part of OutlierTree.
20
+ *
21
+ * OutlierTree is free software: you can redistribute it and/or modify
22
+ * it under the terms of the GNU General Public License as published by
23
+ * the Free Software Foundation, either version 3 of the License, or
24
+ * (at your option) any later version.
25
+ *
26
+ * OutlierTree is distributed in the hope that it will be useful,
27
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
+ * GNU General Public License for more details.
30
+ *
31
+ * You should have received a copy of the GNU General Public License
32
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
33
+ ********************************************************************************************************************/
34
+
35
+ /***********************
36
+ Standard headers
37
+ ************************/
38
+ #include <iostream>
39
+ #include <vector>
40
+ #include <memory>
41
+ #include <algorithm>
42
+ #include <numeric>
43
+ #include <unordered_set>
44
+ #include <math.h>
45
+ #include <cmath>
46
+ #include <stddef.h>
47
+ #include <limits.h>
48
+ #include <stdlib.h>
49
+ #include <stddef.h>
50
+ #include <string.h>
51
+ #ifdef _OPENMP
52
+ #include <omp.h>
53
+ #endif
54
+
55
+ /************************
56
+ Short Functions
57
+ *************************/
58
+ #define extract_bit(number, bit) (((number) >> (bit)) & 1) /* yields 0 or 1: 'number' shifted right by 'bit' positions, masked to its lowest bit; see https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
59
+ #define pow2(n) ( ((size_t) 1) << (n) ) /* 2^n as a 'size_t', obtained by shifting 1 left by 'n'; see https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
60
+ #define avg_between(a, b) (((a) + (b)) * 0.5) /* midpoint of 'a' and 'b'; the multiplication by 0.5 makes the result floating point */
61
+ #define square(x) ((x) * (x)) /* note: the argument is expanded (and thus evaluated) twice */
62
+ #ifndef isinf /* only alias when no 'isinf' macro is already defined */
63
+ #define isinf std::isinf
64
+ #endif
65
+ #ifndef isnan /* only alias when no 'isnan' macro is already defined */
66
+ #define isnan std::isnan
67
+ #endif
68
+ #define is_na_or_inf(x) (isnan(x) || isinf(x)) /* true when 'x' is NaN or infinite; the argument is expanded twice */
69
+
70
+ /* Aliasing for compiler optimizations: maps the C99-style 'restrict' qualifier used throughout the prototypes onto the '__restrict' extension for the compilers listed below, and makes it expand to nothing for any other compiler */
71
+ #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
72
+ #define restrict __restrict
73
+ #else
74
+ #define restrict
75
+ #endif
76
+
77
+ /* MSVC is stuck with an OpenMP version that's 19 years old at the time of writing and does not support unsigned iterators. 'size_t_for' is therefore the type to use for loop counters and row/column indices in parallel loops: a signed 'long long' when OpenMP is older than 3.0 or the build targets Windows, and plain 'size_t' otherwise */
78
+ #ifdef _OPENMP
79
+ #if (_OPENMP < 200801) || defined(_WIN32) || defined(_WIN64) /* OpenMP < 3.0 */
80
+ #define size_t_for long long
81
+ #else
82
+ #define size_t_for size_t
83
+ #endif
84
+ #else
85
+ #define size_t_for size_t
86
+ #endif
87
+
88
+ #ifndef _OPENMP /* without OpenMP, calls to omp_get_thread_num() expand to the constant 0 */
89
+ #define omp_get_thread_num() 0
90
+ #endif
91
+
92
+
93
+ /****************************************************************
94
+ Data types and structs that are returned from this module
95
+ *****************************************************************/
96
/* Kind of a column; 'NoType' is the value used when no kind has been assigned. */
enum ColType {
    Numeric,
    Categorical,
    Ordinal,
    NoType
};
97
/* Branch/condition kinds that a split (or a cluster condition) can take. */
enum SplitType {
    /* for numerical and ordinal */
    LessOrEqual,
    Greater,
    /* will try to simplify to these post-hoc if possible */
    Equal,
    NotEqual,
    /* for categoricals */
    InSubset,
    NotInSubset,
    /* one branch per category of a categorical column */
    SingleCateg,
    SubTrees,
    /* missing-value branch, and the marker for "no split" */
    IsNa,
    Root
};
104
/* transformation to apply to numeric column */
enum ColTransf {
    NoTransf,
    Log,
    Exp
};
105
+
106
+ /*
107
+ * 1-d clusters that define homogeneous groups in which observations can be outliers.
108
+ * Note that these are associated to a tree and define one extra condition from what
109
+ * the tree already specifies. The branch they follow is stored in the cluster, unlike
110
+ * for trees in which it's always left and right branch, as these get discarded more often.
111
+ */
112
+ typedef struct Cluster {
113
+ ColType column_type = NoType;
114
+ size_t col_num = 0; /* numer of the column by which its being split, the target column is given by index of the cluster vector */
115
+ SplitType split_type = Root;
116
+ double split_point = HUGE_VAL; /* numerical */
117
+ std::vector<char> split_subset = std::vector<char>(); /* categorical */
118
+ int split_lev = INT_MAX; /* ordinal */
119
+ bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
120
+
121
+ size_t cluster_size = 0;
122
+ double lower_lim = HUGE_VAL; /* numerical target column */
123
+ double upper_lim = -HUGE_VAL; /* numerical target column */
124
+ double perc_below = HUGE_VAL; /* numerical target column */
125
+ double perc_above = HUGE_VAL; /* numerical target column */
126
+ double display_lim_low = HUGE_VAL; /* numerical target column */
127
+ double display_lim_high = -HUGE_VAL; /* numerical target column */
128
+ double display_mean = -HUGE_VAL; /* numerical target column */
129
+ double display_sd = -HUGE_VAL; /* numerical target column */
130
+ std::vector<char> subset_common = std::vector<char>(); /* categorical or ordinal target column (=0 is common) */
131
+ double perc_in_subset = HUGE_VAL; /* categorical or ordinal target column */
132
+ double perc_next_most_comm = -HUGE_VAL; /* categorical or ordinal target column */ /* TODO */
133
+ int categ_maj = -1; /* when using majority-criterion for categorical outliers */
134
+
135
+ double cluster_mean; /* used to calculate outlier scores at prediction time */
136
+ double cluster_sd; /* used to calculate outlier scores at prediction time */
137
+ std::vector<double> score_categ; /* used to calculate outlier scores at prediction time */
138
+
139
+ /* constructors in order to use C++'s vector emplace */
140
+
141
+ /* full data (no conditions) */
142
+ Cluster(ColType column_type, SplitType split_type)
143
+ {
144
+ this->column_type = column_type;
145
+ this->split_type = split_type;
146
+ }
147
+
148
+ /* numerical split */
149
+ Cluster(ColType column_type, size_t col_num, SplitType split_type, double split_point, bool has_NA_branch = false)
150
+ {
151
+ this->column_type = column_type;
152
+ this->col_num = col_num;
153
+ this->split_type = split_type;
154
+ this->split_point = split_point;
155
+ this->has_NA_branch = has_NA_branch;
156
+ }
157
+
158
+ /* categorical split */
159
+ Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
160
+ {
161
+ this->column_type = column_type;
162
+ this->col_num = col_num;
163
+ this->split_type = split_type;
164
+ if (split_type != IsNa) this->split_subset.assign(split_subset, split_subset + ncat);
165
+ this->split_subset.shrink_to_fit();
166
+ this->has_NA_branch = has_NA_branch;
167
+ }
168
+
169
+ /* categorical split with only one level */
170
+ Cluster(size_t col_num, int cat, int ncat, bool has_NA_branch = false)
171
+ {
172
+ this->column_type = Categorical;
173
+ this->col_num = col_num;
174
+ this->has_NA_branch = has_NA_branch;
175
+ this->split_type = Equal;
176
+ this->split_lev = cat;
177
+ }
178
+
179
+ /* ordinal split */
180
+ Cluster(ColType column_type, size_t col_num, SplitType split_type, int split_lev, bool has_NA_branch = false)
181
+ {
182
+ this->column_type = column_type;
183
+ this->col_num = col_num;
184
+ this->split_type = split_type;
185
+ this->split_lev = split_lev;
186
+ this->has_NA_branch = has_NA_branch;
187
+ }
188
+
189
+ /* this is for serialization with cereal */
190
+ template<class Archive>
191
+ void serialize(Archive &archive)
192
+ {
193
+ archive(
194
+ this->column_type,
195
+ this->col_num,
196
+ this->split_type,
197
+ this->split_point,
198
+ this->split_subset,
199
+ this->split_lev,
200
+ this->has_NA_branch,
201
+ this->cluster_size,
202
+ this->lower_lim,
203
+ this->upper_lim,
204
+ this->perc_below,
205
+ this->perc_above,
206
+ this->display_lim_low,
207
+ this->display_lim_high,
208
+ this->display_mean,
209
+ this->display_sd,
210
+ this->subset_common,
211
+ this->perc_in_subset,
212
+ this->perc_next_most_comm,
213
+ this->cluster_mean,
214
+ this->cluster_sd,
215
+ this->score_categ
216
+ );
217
+ }
218
+
219
+ /* this is for serialization with both cereal and cython auto-pickle */
220
+ Cluster() = default;
221
+
222
+ } Cluster;
223
+
224
+ /*
225
+ * Trees that host the aforementioned clusters. These work as follows:
226
+ * - Each tree contains a split column and condition for splitting.
227
+ * - The trees that follow them are specified in tree_left/right/NA.
228
+ * - If the tree is dropped or not used, that branch gets an index of zero.
229
+ * - The child tree will however remember which branch it took.
230
+ * - At prediction time, the output will tell into which cluster and which tree
231
+ * is each row an outlier (if they fall into any).
232
+ * - The exact conditions are reconstructed by following the trees backwards
233
+ * (i.e. first the cluster, then deepest tree, then follow parent tree until root).
234
+ * This way, all the necessary information can be obtained without storing redundant
235
+ * info, and without needing to reconstruct the conditions as the 'predict'
236
+ * function is being called (which makes it easier to wrap into other languages).
237
+ * - At prediction time, as the observation is passed down trees, all the clusters
238
+ * in all those trees have to be tested for (so if a cluster is discarded, it can
239
+ * keep only one branch of its split in the struct).
240
+ * - As a side effect, in ordinal columns, the trees cannot be simplified to 'Equal'.
241
+ * - All of this is ignored when using 'follow_all', in which case the trees work just
242
+ * like the clusters, with an array 'all_branches' which contains all trees that have
243
+ * to be followed from one particular tree.
244
+ */
245
+ typedef struct ClusterTree {
246
+ size_t parent = 0; /* index in a vector */
247
+ SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
248
+ std::vector<size_t> clusters = std::vector<size_t>(); /* these clusters define additional splits */
249
+
250
+ SplitType split_this_branch = Root; /* when using 'follow_all' */
251
+ std::vector<size_t> all_branches = std::vector<size_t>(); /* when using 'follow_all' */
252
+
253
+ ColType column_type = NoType;
254
+ size_t col_num = 0;
255
+ double split_point = HUGE_VAL;
256
+ std::vector<char> split_subset = std::vector<char>();
257
+ int split_lev = INT_MAX;
258
+
259
+ size_t tree_NA = 0; /* binary splits */
260
+ size_t tree_left = 0; /* binary splits */
261
+ size_t tree_right = 0; /* binary splits */
262
+ std::vector<size_t> binary_branches = std::vector<size_t>(); /* multiple splits (single category or binarized categories) */
263
+
264
+ ClusterTree(size_t parent, SplitType parent_branch)
265
+ {
266
+ this->parent = parent;
267
+ this->parent_branch = parent_branch;
268
+ }
269
+
270
+ /* when using 'follow_all' */
271
+ ClusterTree(size_t parent, size_t col_num, double split_point, SplitType split_this_branch)
272
+ {
273
+ this->parent = parent;
274
+ this->col_num = col_num;
275
+ this->column_type = Numeric;
276
+ this->split_this_branch = split_this_branch;
277
+ this->split_point = split_point;
278
+ }
279
+
280
+ ClusterTree(size_t parent, size_t col_num, int split_lev, SplitType split_this_branch)
281
+ {
282
+ this->parent = parent;
283
+ this->col_num = col_num;
284
+ this->column_type = Ordinal;
285
+ this->split_this_branch = split_this_branch;
286
+ this->split_lev = split_lev;
287
+ }
288
+
289
+ ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
290
+ {
291
+ this->parent = parent;
292
+ this->col_num = col_num;
293
+ this->column_type = Categorical;
294
+ if (split_this_branch != IsNa) {
295
+ this->split_this_branch = split_this_branch;
296
+ this->split_subset.assign(split_subset, split_subset + ncat);
297
+ this->split_subset.shrink_to_fit();
298
+ } else {
299
+ this->split_this_branch = IsNa;
300
+ }
301
+ }
302
+
303
+ ClusterTree(size_t parent, size_t col_num, int cat_chosen)
304
+ {
305
+ this->parent = parent;
306
+ this->col_num = col_num;
307
+ this->column_type = Categorical;
308
+ this->split_this_branch = Equal;
309
+ this->split_lev = cat_chosen;
310
+ }
311
+
312
+ /* this is for serialization with cereal */
313
+ template<class Archive>
314
+ void serialize(Archive &archive)
315
+ {
316
+ archive(
317
+ this->parent,
318
+ this->parent_branch,
319
+ this->clusters,
320
+ this->split_this_branch,
321
+ this->all_branches,
322
+ this->column_type,
323
+ this->col_num,
324
+ this->split_point,
325
+ this->split_subset,
326
+ this->split_lev,
327
+ this->tree_NA,
328
+ this->tree_left,
329
+ this->tree_right,
330
+ this->binary_branches
331
+ );
332
+ }
333
+
334
+ /* this is for serialization with both cereal and cython auto-pickle */
335
+ ClusterTree() = default;
336
+
337
+ } ClusterTree;
338
+
339
+ /* these are needed for prediction time, and are thus returned from the function that fits the model */
340
+ typedef struct ModelOutputs {
341
+ std::vector< std::vector<ClusterTree> > all_trees; /* clusters in which observations can be outliers, required for prediction time */
342
+ std::vector< std::vector<Cluster> > all_clusters; /* decision trees that host the clusters, required for prediction time */
343
+ std::vector<double> outlier_scores_final; /* if an outlier is flagged, this indicates its score (lower is more outlier) as an upper probability bound */
344
+ std::vector<size_t> outlier_clusters_final; /* if an outlier is flagged, this indicates the most suitable cluster in which to flag it as outlier */
345
+ std::vector<size_t> outlier_columns_final; /* if an outlier is flagged, this indicates the column that makes it an outlier */
346
+ std::vector<size_t> outlier_trees_final; /* if an outlier is flagged, this indicates the tree under which the cluster is found */
347
+ std::vector<size_t> outlier_depth_final; /* if an outlier is flagged, this indicates the split depth under which the cluster is found */
348
+ std::vector<int> outlier_decimals_distr; /* if an outlier is flagged, and it's a numeric column, this will indicate how many decimals to print for it */
349
+ std::vector<size_t> start_ix_cat_counts; /* this is to determine where to index the proportions */
350
+ std::vector<long double> prop_categ; /* this is just for statistics to show, it's not used for anything */
351
+ std::vector<ColTransf> col_transf; /* tells whether each numerical columns underwent log/exp transformations */
352
+ std::vector<double> transf_offset; /* value subtracted for log transform, mean subtracted for exp transform */
353
+ std::vector<double> sd_div; /* standard deviation with which exp-transformed columns were standardized */
354
+ std::vector<int> min_decimals_col; /* number of decimals to show for split conditions in numeric columns */
355
+ std::vector<int> ncat; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
356
+ std::vector<int> ncat_ord; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
357
+ size_t ncols_numeric; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
358
+ size_t ncols_categ; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
359
+ size_t ncols_ord; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
360
+ std::vector<double> min_outlier_any_cl; /* redundant info which speeds up prediction */
361
+ std::vector<double> max_outlier_any_cl; /* redundant info which speeds up prediction */
362
+ std::vector<std::vector<bool>> cat_outlier_any_cl; /* redundant info which speeds up prediction */
363
+ size_t max_depth; /* redundant info which speeds up prediction */
364
+
365
+
366
+ /* this is for serialization with cereal */
367
+ template<class Archive>
368
+ void serialize(Archive &archive)
369
+ {
370
+ archive(
371
+ this->all_trees,
372
+ this->all_clusters,
373
+ this->outlier_scores_final,
374
+ this->outlier_clusters_final,
375
+ this->outlier_columns_final,
376
+ this->outlier_trees_final,
377
+ this->outlier_depth_final,
378
+ this->start_ix_cat_counts,
379
+ this->prop_categ,
380
+ this->col_transf,
381
+ this->transf_offset,
382
+ this->sd_div,
383
+ this->ncat,
384
+ this->ncat_ord,
385
+ this->ncols_numeric,
386
+ this->ncols_categ,
387
+ this->ncols_ord,
388
+ this->min_outlier_any_cl,
389
+ this->max_outlier_any_cl,
390
+ this->cat_outlier_any_cl,
391
+ this->max_depth
392
+ );
393
+ }
394
+
395
+ /* this is for serialization with both cereal and cython auto-pickle */
396
+ ModelOutputs() = default;
397
+
398
+ } ModelOutputs;
399
+
400
+ /*
401
+ * Note: the vectors with proportions in these structs are supposed to be all small numbers so 'long double' is an overkill for them
402
+ * and does not make them translate into SIMD instructions in regular x86-64 CPUs, but if setting them as 'double' and then doing casts
403
+ * from/between 'double' and the 'size_t' and 'long double's of other arrays (such as in function 'find_outlier_categories'), comparisons
404
+ * such as '<=' will oftentimes fail even with small counts - this is an example that will fail when mixing the 3 types together:
405
+ * >>> (2 / (88+1)) * 0.5 <= (1 / 89) --> produces FALSE (right answer is TRUE)
406
+ * All due to decimals (in that example) right of the 10th digit, and ends up creating categorical clusters that it should not create.
407
+ * So don't change them back to regular 'double', or if necessary, change every 'long double' to 'double' too.
408
+ */
409
+
410
+ /******************************************
411
+ Prototypes from fit_model.cpp
412
+ (This is the main module from which
413
+ the model is generated)
414
+ *******************************************/
415
+ bool fit_outliers_models(ModelOutputs &model_outputs,
416
+ double *restrict numeric_data, size_t ncols_numeric,
417
+ int *restrict categorical_data, size_t ncols_categ, int *restrict ncat,
418
+ int *restrict ordinal_data, size_t ncols_ord, int *restrict ncat_ord,
419
+ size_t nrows, char *restrict cols_ignore = NULL, int nthreads = 1,
420
+ bool categ_as_bin = true, bool ord_as_bin = true, bool cat_bruteforce_subset = false, bool categ_from_maj = false, bool take_mid = true,
421
+ size_t max_depth = 3, double max_perc_outliers = 0.01, size_t min_size_numeric = 25, size_t min_size_categ = 50,
422
+ double min_gain = 1e-2, bool gain_as_pct = false, bool follow_all = false, double z_norm = 2.67, double z_outlier = 8.0);
423
+
424
+ typedef struct {
425
+
426
+ std::vector<size_t> ix_arr; /* indices from the target column */
427
+ size_t st; /* chunk of the indices to take for current function calls */
428
+ size_t end; /* chunk of the indices to take for current function calls */
429
+ std::vector<double> outlier_scores; /* these hold the model outputs for 1 column before combining them */
430
+ std::vector<size_t> outlier_clusters; /* these hold the model outputs for 1 column before combining them */
431
+ std::vector<size_t> outlier_trees; /* these hold the model outputs for 1 column before combining them */
432
+ std::vector<size_t> outlier_depth; /* these hold the model outputs for 1 column before combining them */
433
+ size_t target_col_num; /* if categorical or ordinal, gets subtracted the number of numeric columns (used to index other arrays) */
434
+ long double sd_y; /* numerical only (standard deviation before splitting) */
435
+ double mean_y; /* numerical only (used to standardize numbers for extra FP precision) */
436
+ long double base_info; /* categorical and ordinal (information before splitting and before binarizing) */
437
+ long double base_info_orig; /* categorical and ordinal (information before splitting and after binarizing if needed) */
438
+ bool log_transf; /* numerical - whether the target variable underwent a logarithmic transformation */
439
+ bool exp_transf; /* numerical - whether the target variable underwent exponentiation on its Z values */
440
+ double *target_numeric_col; /* dynamic pointer */
441
+ int *target_categ_col; /* dynamic pointer */
442
+ std::vector<double> buffer_transf_y; /* if applying logarithm or exponentiation, transformed values are stored here */
443
+ std::vector<int> buffer_bin_y; /* if binarizing, transformed values are stored here */
444
+ std::vector<Cluster> *clusters; /* dynamic pointer, don't change to reference as it otherwise cannot be reassigned */
445
+ std::vector<ClusterTree> *tree; /* dynamic pointer, don't change to reference as it otherwise cannot be reassigned */
446
+ bool has_outliers; /* temporary variable from which the other two are updated */
447
+ bool lev_has_outliers; /* whether the particular depth level has outliers (if so, wil remove them at the end before new split) */
448
+ bool col_has_outliers; /* whether there's any outliers in the column (will later merge them into the outputs) */
449
+ double left_tail; /* approximate value where a long left tail ends */
450
+ double right_tail; /* approximate value where a long right tail ends */
451
+
452
+ bool col_is_bin; /* whether the target categorical/ordinal column has 2 categories or has been forcibly binarized */
453
+ long double *prop_small_this; /* dynamic pointer */
454
+ long double *prior_prob; /* dynamic pointer */
455
+
456
+ double orig_mean; /* value to reconstruct originals from exponentiated */
457
+ double orig_sd; /* value to reconstruct originals from exponentiated */
458
+ double log_minval; /* value to reconstruct originals from logarithms */
459
+ double *orig_target_col; /* column as it was before applying log/exp (dynamic pointer) */
460
+ int *untransf_target_col; /* column as it was before forcibly binarizing (dynamic pointer) */
461
+ int *temp_ptr_x; /* dynamic pointer */
462
+
463
+ std::vector<char> buffer_subset_categ_best; /* categorical split that gave the best gain */
464
+ long double this_gain; /* buffer where to store gain */
465
+ double this_split_point; /* numeric split threshold */
466
+ int this_split_lev; /* ordinal split threshold */
467
+ size_t this_split_ix; /* index at which the data is partitioned */
468
+ size_t this_split_NA; /* index at which the non-NA values start */
469
+ long double best_gain; /* buffer where to store the info of the splitting column that produced the highest gain */
470
+ ColType column_type_best; /* buffer where to store the info of the splitting column that produced the highest gain */
471
+ double split_point_best; /* buffer where to store the info of the splitting column that produced the highest gain */
472
+ int split_lev_best; /* buffer where to store the info of the splitting column that produced the highest gain */
473
+ size_t col_best; /* buffer where to store the info of the splitting column that produced the highest gain */
474
+
475
+ std::vector<long double> buffer_cat_sum; /* buffer arrays where to allocate values required by functions and not used outside them */
476
+ std::vector<long double> buffer_cat_sum_sq; /* buffer arrays where to allocate values required by functions and not used outside them */
477
+ std::vector<size_t> buffer_crosstab; /* buffer arrays where to allocate values required by functions and not used outside them */
478
+ std::vector<size_t> buffer_cat_cnt; /* buffer arrays where to allocate values required by functions and not used outside them */
479
+ std::vector<size_t> buffer_cat_sorted; /* buffer arrays where to allocate values required by functions and not used outside them */
480
+ std::vector<char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
481
+ std::vector<char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
482
+ std::vector<long double> buffer_sd; /* used for a more numerically-stable two-pass gain calculation */
483
+
484
+ bool drop_cluster; /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
485
+ bool already_split_main; /* when binarizing categoricals/ordinals, avoid attempting the same split with numerical and ordinals that take the non-binarized data */
486
+ bool target_col_is_ord; /* whether the target column is ordinal (rest is the same as in categoricals) */
487
+ int ncat_this; /* number of categories in the target column */
488
+
489
+ } Workspace;
490
+
491
+ /* info holders to shorten function call arguments */
492
/* Bundle of fitting parameters; member order and types are unchanged. */
struct ModelParams {
    /* flags */
    bool categ_as_bin;
    bool ord_as_bin;
    bool cat_bruteforce_subset;
    bool categ_from_maj;
    bool take_mid;

    /* size / depth limits */
    size_t max_depth;
    double max_perc_outliers;
    size_t min_size_numeric;
    size_t min_size_categ;

    /* gain criteria */
    double min_gain;
    bool gain_as_pct;
    bool follow_all;

    /* z thresholds */
    double z_norm;
    double z_outlier;
    double z_tail;

    /* this is not a parameter, but a shared array determined from the parameters and data */
    std::vector<long double> prop_small;
};
510
+
511
+ /* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
512
+ typedef struct {
513
+ double *restrict numeric_data; size_t ncols_numeric;
514
+ int *restrict categorical_data; size_t ncols_categ; int *restrict ncat;
515
+ int *restrict ordinal_data; size_t ncols_ord; int *restrict ncat_ord;
516
+ size_t nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ;
517
+ std::vector<size_t> cat_counts;
518
+ } InputData;
519
+
520
+
521
+ void process_numeric_col(std::vector<Cluster> &cluster_root,
522
+ std::vector<ClusterTree> &tree_root,
523
+ size_t target_col_num,
524
+ Workspace &workspace,
525
+ InputData &input_data,
526
+ ModelParams &model_params,
527
+ ModelOutputs &model_outputs);
528
+ void recursive_split_numeric(Workspace &workspace,
529
+ InputData &input_data,
530
+ ModelParams &model_params,
531
+ size_t curr_depth, bool is_NA_branch);
532
+ void process_categ_col(std::vector<Cluster> &cluster_root,
533
+ std::vector<ClusterTree> &tree_root,
534
+ size_t target_col_num, bool is_ord,
535
+ Workspace &workspace,
536
+ InputData &input_data,
537
+ ModelParams &model_params,
538
+ ModelOutputs &model_outputs);
539
+ void recursive_split_categ(Workspace &workspace,
540
+ InputData &input_data,
541
+ ModelParams &model_params,
542
+ size_t curr_depth, bool is_NA_branch);
543
+
544
+
545
+ /*******************************************
546
+ Prototypes from predict.cpp
547
+ (This is the module from which
548
+ new data can be flagged as outliers)
549
+ ********************************************/
550
+ typedef struct {
551
+ double *restrict numeric_data;
552
+ int *restrict categorical_data;
553
+ int *restrict ordinal_data;
554
+ size_t nrows;
555
+ } PredictionData;
556
+
557
+ bool find_new_outliers(double *restrict numeric_data,
558
+ int *restrict categorical_data,
559
+ int *restrict ordinal_data,
560
+ size_t nrows, int nthreads, ModelOutputs &model_outputs);
561
+ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, size_t curr_tree, size_t curr_depth,
562
+ size_t_for row, size_t_for col, bool col_is_num, double num_val_this, int cat_val_this);
563
+ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr_depth, size_t curr_tree,
564
+ ModelOutputs &model_outputs, PredictionData &prediction_data, size_t_for row, size_t_for col,
565
+ bool col_is_num, double num_val_this, int cat_val_this);
566
+
567
+
568
+ /********************************
569
+ Prototypes from split.cpp
570
+ *********************************/
571
+ #define SD_REG 1e-5 /* Regularization for standard deviation estimation */
572
+
573
/* Count, sum and sum of squares for one branch of a split. */
struct NumericBranch {
    size_t cnt;
    long double sum;
    long double sum_sq;
};

/*
 * The three branches of a split, each starting out zeroed.
 * Declared as a named struct: an unnamed struct that only gets its name from a typedef
 * may not have default member initializers (Clang diagnoses it as a non-C-compatible
 * type named by typedef), and this struct relies on them.
 */
struct NumericSplit {
    NumericBranch NA_branch    = {0, 0, 0};
    NumericBranch left_branch  = {0, 0, 0};
    NumericBranch right_branch = {0, 0, 0};
};
584
+
585
+ typedef struct {
586
+ size_t *restrict NA_branch; /* array of counts of the target variable's categories */
587
+ size_t *restrict left_branch; /* array of counts of the target variable's categories */
588
+ size_t *restrict right_branch; /* array of counts of the target variable's categories */
589
+ size_t ncat; /* number of categories/entries in the arrays above */
590
+ size_t tot; /* size_NA + size_left + size_right */
591
+ size_t size_NA = 0;
592
+ size_t size_left = 0;
593
+ size_t size_right = 0;
594
+ } CategSplit;
595
+
596
+ void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, bool onehot[]); /* Declarations only from here down to split_categx_categy_subset: the bodies are not in this header. Judging by the names, this one presumably sets onehot[] (length n_tot) from the n_true indices in ix_arr - confirm against the definition */
597
+ size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x); /* meaning of the returned size_t is not visible from the prototype - see the definition */
598
+ void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
599
+ long double calc_sd(size_t cnt, long double sum, long double sum_sq); /* calc_sd overload 1 of 3: takes a count, a sum and a sum of squares */
600
+ long double calc_sd(NumericBranch &branch); /* overload 2 of 3: takes a NumericBranch by non-const reference */
601
+ long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean); /* overload 3 of 3: takes an index array with st/end bounds plus the data array x; 'mean' is a non-const pointer, presumably an output - confirm */
602
+ long double numeric_gain(NumericSplit &split_info, long double tot_sd); /* numeric_gain overload taking a NumericSplit by reference */
603
+ long double numeric_gain(long double tot_sd, long double info_left, long double info_right, long double info_NA, long double cnt); /* numeric_gain overload taking the individual components; note tot_sd comes first here but last in the overload above */
604
+ long double total_info(size_t categ_counts[], size_t ncat); /* total_info overload 1 of 3: category counts only */
605
+ long double total_info(size_t categ_counts[], size_t ncat, size_t tot); /* overload 2 of 3: additionally takes 'tot' (presumably the sum of the counts - confirm) */
606
+ long double total_info(size_t *restrict ix_arr, int *restrict x, size_t st, size_t end, size_t ncat, size_t *restrict buffer_cat_cnt); /* overload 3 of 3: takes row indices and the raw int column plus a caller-supplied count buffer */
607
+ long double categ_gain(CategSplit split_info, long double base_info); /* NOTE(review): CategSplit is passed by value here, whereas numeric_gain takes NumericSplit by reference */
608
+ long double categ_gain(size_t *restrict categ_counts, size_t ncat, size_t *restrict ncat_col, size_t maxcat, long double base_info, size_t tot);
609
+ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size_t st, size_t st_non_na, size_t split_ix, size_t end,
610
+ size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
611
+ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y, /* split_<x type>_<y type> family starts here: all return void and take non-const pointers such as gain, split_point, split_left, split_NA or split_subset, presumably as outputs - confirm against the definitions */
612
+ long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
613
+ long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
614
+ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
615
+ bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
616
+ long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
617
+ bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point); /* takes both split_subset and split_point, alongside the x_is_ordinal flag; 'gain' is not restrict-qualified here, unlike in split_numericx_numericy */
618
+ void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
619
+ size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
620
+ bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
621
+ size_t *restrict split_left, size_t *restrict split_NA);
622
+ void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
623
+ size_t ncat_y, size_t ncat_x, long double base_info, /* NOTE(review): ncat_y precedes ncat_x here, while the split_categx_categy_* prototypes below take ncat_x first - easy to swap at call sites */
624
+ size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
625
+ bool has_na, size_t min_size, long double *gain, int *split_point);
626
+ void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
627
+ size_t ncat_x, long double base_info, /* no ncat_y parameter in this prototype */
628
+ size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
629
+ bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
630
+ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
631
+ size_t ncat_x, size_t ncat_y, long double base_info,
632
+ size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
633
+ bool has_na, size_t min_size, long double *gain); /* only 'gain' is passed as a non-const pointer: this variant takes no split_subset or split_point */
634
+ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
635
+ size_t ncat_x, size_t ncat_y, long double base_info,
636
+ size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
637
+ bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
638
+
639
+
640
+
641
+ /***********************************
642
+ Prototypes from clusters.cpp
643
+ ************************************/
644
+ #define calculate_max_outliers(n, perc) ( (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - (perc)) ) + (long double)1 ) /* n*perc + 2*sqrt(n*perc*(1-perc)) + 1, i.e. it has the form of a binomial mean plus two standard deviations, plus one; every use of an argument is parenthesized so expression arguments such as a+b expand with the intended precedence */
645
+ #define z_score(x, mu, sd) ( ((x) - (mu)) / (sd) ) /* signed distance of x from mu, expressed in units of sd */
646
+ #define chebyshyov_bound(sd) (1.0 / square(sd)) /* 1 / sd^2; relies on square() being provided elsewhere in the project (assumed to parenthesize its own argument - confirm) */
647
+
648
+ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end, /* Declaration only (this section is headed "Prototypes from clusters.cpp"); what the bool result signals is not visible from the prototype - see the definition */
649
+ double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
650
+ size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
651
+ bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
652
+ double left_tail, double right_tail, double *restrict orig_x, /* takes both x and orig_x; presumably x is the transformed column when is_log_transf / is_exp_transf is set - confirm */
653
+ double max_perc_outliers, double z_norm, double z_outlier);
654
+ void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, /* unlike the two define_*_cluster prototypes around it, this one returns void and takes no clusters vector nor cluster/tree numbers */
655
+ double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
656
+ size_t *restrict outlier_depth, Cluster &cluster,
657
+ size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
658
+ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj, /* by_maj presumably selects the majority-based criterion (compare find_outlier_categories_by_maj declared later in this header) - confirm */
659
+ double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
660
+ size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
661
+ size_t cluster_num, size_t tree_num, size_t tree_depth,
662
+ double max_perc_outliers, double z_norm, double z_outlier,
663
+ long double *restrict perc_threshold, long double *restrict prop_prior,
664
+ size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct, /* the buffer_* pointers are presumably caller-provided scratch space - confirm required sizes in the definition */
665
+ size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
666
+ bool *restrict drop_cluster); /* non-const bool pointer, presumably an output flag - confirm */
667
+ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]); /* overload operating on a vector of Cluster */
668
+ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]); /* overload operating on a vector of ClusterTree */
669
+ #ifdef TEST_MODE_DEFINE
670
+ void prune_unused_trees(std::vector<ClusterTree> &trees); /* only declared when TEST_MODE_DEFINE is defined */
671
+ #endif
672
+ bool check_tree_is_not_needed(ClusterTree &tree);
673
+ void calculate_cluster_minimums(ModelOutputs &model_outputs, size_t col);
674
+ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size_t col_rel); /* takes both col and col_rel; how the two indices relate is not visible here - see the definition */
675
+
676
+
677
+ /**************************************
678
+ Prototypes from cat_outlier.cpp
679
+ ***************************************/
680
+ #define calculate_max_cat_outliers(n, perc, z_norm) ((long double)1 + ((n) * (perc) / (z_norm))) /* 1 + n*perc/z_norm, with every argument parenthesized so expression arguments expand with the intended precedence. Note: this is a heuristic; it is neither probabilistic nor based on provable bounds */
681
+ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers, /* Declaration only (this section is headed "Prototypes from cat_outlier.cpp"); returns void and takes several non-const pointers/arrays, presumably outputs - confirm against the definition */
682
+ long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
683
+ double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
684
+ void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers, /* variant taking prior_prob[] and z_outlier instead of perc_threshold[], the buffers and z_norm */
685
+ long double prior_prob[], double z_outlier, char is_outlier[],
686
+ bool *found_outliers, bool *new_is_outlier, int *categ_maj);
687
+ bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot, /* the only one of the three that returns bool, and it has no found_outliers pointer; presumably the result plays that role - confirm */
688
+ char is_outlier[], double *next_most_comm);
689
+
690
+
691
+
692
+ /*************************************************
693
+ Prototypes from misc.cpp and other structs
694
+ **************************************************/
695
+
696
+ /* an inefficient workaround for coding up option 'follow_all' */
697
+ typedef struct { /* Plain aggregate of saved values; it is the type of the state_backup argument of backup_recursion_state() and restore_recursion_state() declared later in this header. What each field mirrors in Workspace is not visible here - see those two functions */
698
+ double gain_restore; /* NOTE(review): held as double, while the split_* prototypes in this header pass gain through long double pointers - confirm the narrowing is intended */
699
+ double gain_best_restore;
700
+ double split_point_restore;
701
+ int split_lev_restore;
702
+ std::vector<char> split_subset_restore; /* std::vector member, so the struct holds its own storage for these chars */
703
+ size_t ix1_restore; /* ix1..ix4: meaning of the four saved indices is not visible from this declaration */
704
+ size_t ix2_restore;
705
+ size_t ix3_restore;
706
+ size_t ix4_restore;
707
+ int * temp_ptr_x; /* raw pointer; presumably non-owning (nothing in this struct releases it) - confirm */
708
+ size_t col_best_restore;
709
+ ColType col_type_best_rememer; /* NOTE(review): "rememer" looks like a typo for "remember" and breaks the *_restore naming pattern; renaming it would require updating every user of the field */
710
+ double split_point_best_restore;
711
+ int split_lev_best_restore;
712
+ std::vector<char> split_subset_best_restore;
713
+ long double base_info_restore;
714
+ long double base_info_orig_restore;
715
+ double sd_y_restore; /* NOTE(review): double here, while the split_*_numericy prototypes take sd_y as long double - confirm */
716
+ bool has_outliers_restore;
717
+ bool lev_has_outliers_restore;
718
+ } RecursionState;
719
+
720
+
721
+ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ = 0); /* Declarations only from here down (this section is headed "Prototypes from misc.cpp and other structs"); max_categ defaults to 0, and the meaning of the int result is not visible here - see the definition */
722
+ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
723
+ int categorical_data[], size_t ncols, size_t nrows,
724
+ bool has_NA[], bool skip_col[], int nthreads); /* nthreads: presumably the number of threads to use - confirm in the definition */
725
+ void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
726
+ size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads);
727
+ void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
728
+ size_t start_ix_cat_counts[], size_t cat_counts[],
729
+ size_t ncols, size_t nrows, double z_norm, double z_tail);
730
+ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
731
+ bool skip_col[], int min_decimals[], int nthreads);
732
+ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central); /* returns void; mean_central and sd_central are non-const pointers, presumably the outputs - confirm */
733
+ void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
734
+ double z_norm, double max_perc_outliers,
735
+ double *restrict buffer_x, double mean, double sd,
736
+ double *restrict left_tail, double *restrict right_tail,
737
+ bool *exp_transf, bool *log_transf); /* non-const bool pointers, presumably output flags - confirm */
738
+ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t st, size_t end); /* meaning of the returned size_t (for this and the move_NAs_to_front overloads) is not visible here - see the definitions */
739
+ size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA); /* overload for a double column; has an inf_as_NA flag */
740
+ size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end); /* overload for an int column; no inf_as_NA flag */
741
+ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right); /* divide_subset_split overload 1 of 3: double column with a double split_point */
742
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right); /* overload 2 of 3: int column with a char subset_categ[] array and ncat */
743
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right); /* overload 3 of 3: int column with a single int split_lev */
744
+ bool check_workspace_is_allocated(Workspace &workspace);
745
+ void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
746
+ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup); /* pairs with restore_recursion_state below; both take the same (Workspace, RecursionState) references */
747
+ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
748
+ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
749
+ void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col); /* set_tree_as_categorical overload 1 of 3 */
750
+ void set_tree_as_categorical(ClusterTree &tree, size_t col); /* overload 2 of 3 */
751
+ void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat); /* overload 3 of 3. NOTE(review): this one takes (col, ncat) while the four-parameter overload takes ncat before col - easy to mix up at call sites */
752
+ void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
753
+ void forget_row_outputs(ModelOutputs &model_outputs);
754
+ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth);
755
+ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[]); /* too_few_values[] is a non-const char array, presumably the per-column output - confirm */
756
+ void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
757
+ int decimals_diff(double val1, double val2);
758
+ void dealloc_ModelOutputs(ModelOutputs &model_outputs);