outliertree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,758 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
10
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
11
+ * arXiv preprint arXiv:2001.00636 (2020).
12
+ *
13
+ *
14
+ * Copyright 2020 David Cortes.
15
+ *
16
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
+ * such as R or Python.
18
+ *
19
+ * This file is part of OutlierTree.
20
+ *
21
+ * OutlierTree is free software: you can redistribute it and/or modify
22
+ * it under the terms of the GNU General Public License as published by
23
+ * the Free Software Foundation, either version 3 of the License, or
24
+ * (at your option) any later version.
25
+ *
26
+ * OutlierTree is distributed in the hope that it will be useful,
27
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
+ * GNU General Public License for more details.
30
+ *
31
+ * You should have received a copy of the GNU General Public License
32
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
33
+ ********************************************************************************************************************/
34
+
35
+ /***********************
36
+ Standard headers
37
+ ************************/
38
+ #include <iostream>
39
+ #include <vector>
40
+ #include <memory>
41
+ #include <algorithm>
42
+ #include <numeric>
43
+ #include <unordered_set>
44
+ #include <math.h>
45
+ #include <cmath>
46
+ #include <stddef.h>
47
+ #include <limits.h>
48
+ #include <stdlib.h>
49
+ #include <stddef.h>
50
+ #include <string.h>
51
+ #ifdef _OPENMP
52
+ #include <omp.h>
53
+ #endif
54
+
55
+ /************************
56
+ Short Functions
57
+ *************************/
58
/* Value (0 or 1) of the bit at position 'pos' in the integer 'value'.
   https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
#define extract_bit(value, pos) (((value) >> (pos)) & 1)

/* Two raised to 'exponent', obtained by shifting a size_t one.
   https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
#define pow2(exponent) ( ((size_t) 1) << (exponent) )

/* Midpoint between two numbers (the result is always a 'double') */
#define avg_between(lo, hi) (((lo) + (hi)) * 0.5)

/* Note: the argument is evaluated twice, so it must not have side effects */
#define square(v) ((v) * (v))

/* When the C-style macros are not available, forward to the overloads in namespace std */
#if !defined(isinf)
    #define isinf std::isinf
#endif
#if !defined(isnan)
    #define isnan std::isnan
#endif

/* True for NaN and for +/- infinity */
#define is_na_or_inf(v) (isnan(v) || isinf(v))
69
+
70
/* Aliasing for compiler optimizations: 'restrict' maps to the '__restrict' extension
   on the compilers listed below, and expands to nothing on any other compiler */
#if !(defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
    #define restrict
#else
    #define restrict __restrict
#endif

/* MSVC is stuck with an OpenMP version that's 19 years old at the time of writing and does not
   support unsigned iterators, so loop counters fall back to 'long long' on OpenMP < 3.0 and on Windows */
#if defined(_OPENMP) && ((_OPENMP < 200801) || defined(_WIN32) || defined(_WIN64))
    #define size_t_for long long
#else
    #define size_t_for size_t
#endif

/* Without OpenMP there is only one thread, whose number is zero */
#if !defined(_OPENMP)
    #define omp_get_thread_num() 0
#endif
91
+
92
+
93
+ /****************************************************************
94
+ Data types and structs that are returned from this module
95
+ *****************************************************************/
96
+ typedef enum ColType {Numeric, Categorical, Ordinal, NoType} ColType;
97
+ typedef enum SplitType {
98
+ LessOrEqual, Greater, /* for numerical and ordinal */
99
+ Equal, NotEqual, /* will try to simplify to these post-hoc if possible */
100
+ InSubset, NotInSubset, /* for categoricals */
101
+ SingleCateg, SubTrees, /* one branch per category of a categorical column */
102
+ IsNa, Root
103
+ } SplitType;
104
+ typedef enum ColTransf {NoTransf, Log, Exp} ColTransf; /* transformation to apply to numeric column */
105
+
106
+ /*
107
+ * 1-d clusters that define homogeneous groups in which observations can be outliers.
108
+ * Note that these are associated to a tree and define one extra condition from what
109
+ * the tree already specifies. The branch they follow is stored in the cluster, unlike
110
+ * for trees in which it's always left and right branch, as these get discarded more often.
111
+ */
112
+ typedef struct Cluster {
113
+ ColType column_type = NoType;
114
+ size_t col_num = 0; /* numer of the column by which its being split, the target column is given by index of the cluster vector */
115
+ SplitType split_type = Root;
116
+ double split_point = HUGE_VAL; /* numerical */
117
+ std::vector<char> split_subset = std::vector<char>(); /* categorical */
118
+ int split_lev = INT_MAX; /* ordinal */
119
+ bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
120
+
121
+ size_t cluster_size = 0;
122
+ double lower_lim = HUGE_VAL; /* numerical target column */
123
+ double upper_lim = -HUGE_VAL; /* numerical target column */
124
+ double perc_below = HUGE_VAL; /* numerical target column */
125
+ double perc_above = HUGE_VAL; /* numerical target column */
126
+ double display_lim_low = HUGE_VAL; /* numerical target column */
127
+ double display_lim_high = -HUGE_VAL; /* numerical target column */
128
+ double display_mean = -HUGE_VAL; /* numerical target column */
129
+ double display_sd = -HUGE_VAL; /* numerical target column */
130
+ std::vector<char> subset_common = std::vector<char>(); /* categorical or ordinal target column (=0 is common) */
131
+ double perc_in_subset = HUGE_VAL; /* categorical or ordinal target column */
132
+ double perc_next_most_comm = -HUGE_VAL; /* categorical or ordinal target column */ /* TODO */
133
+ int categ_maj = -1; /* when using majority-criterion for categorical outliers */
134
+
135
+ double cluster_mean; /* used to calculate outlier scores at prediction time */
136
+ double cluster_sd; /* used to calculate outlier scores at prediction time */
137
+ std::vector<double> score_categ; /* used to calculate outlier scores at prediction time */
138
+
139
+ /* constructors in order to use C++'s vector emplace */
140
+
141
+ /* full data (no conditions) */
142
+ Cluster(ColType column_type, SplitType split_type)
143
+ {
144
+ this->column_type = column_type;
145
+ this->split_type = split_type;
146
+ }
147
+
148
+ /* numerical split */
149
+ Cluster(ColType column_type, size_t col_num, SplitType split_type, double split_point, bool has_NA_branch = false)
150
+ {
151
+ this->column_type = column_type;
152
+ this->col_num = col_num;
153
+ this->split_type = split_type;
154
+ this->split_point = split_point;
155
+ this->has_NA_branch = has_NA_branch;
156
+ }
157
+
158
+ /* categorical split */
159
+ Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
160
+ {
161
+ this->column_type = column_type;
162
+ this->col_num = col_num;
163
+ this->split_type = split_type;
164
+ if (split_type != IsNa) this->split_subset.assign(split_subset, split_subset + ncat);
165
+ this->split_subset.shrink_to_fit();
166
+ this->has_NA_branch = has_NA_branch;
167
+ }
168
+
169
+ /* categorical split with only one level */
170
+ Cluster(size_t col_num, int cat, int ncat, bool has_NA_branch = false)
171
+ {
172
+ this->column_type = Categorical;
173
+ this->col_num = col_num;
174
+ this->has_NA_branch = has_NA_branch;
175
+ this->split_type = Equal;
176
+ this->split_lev = cat;
177
+ }
178
+
179
+ /* ordinal split */
180
+ Cluster(ColType column_type, size_t col_num, SplitType split_type, int split_lev, bool has_NA_branch = false)
181
+ {
182
+ this->column_type = column_type;
183
+ this->col_num = col_num;
184
+ this->split_type = split_type;
185
+ this->split_lev = split_lev;
186
+ this->has_NA_branch = has_NA_branch;
187
+ }
188
+
189
+ /* this is for serialization with cereal */
190
+ template<class Archive>
191
+ void serialize(Archive &archive)
192
+ {
193
+ archive(
194
+ this->column_type,
195
+ this->col_num,
196
+ this->split_type,
197
+ this->split_point,
198
+ this->split_subset,
199
+ this->split_lev,
200
+ this->has_NA_branch,
201
+ this->cluster_size,
202
+ this->lower_lim,
203
+ this->upper_lim,
204
+ this->perc_below,
205
+ this->perc_above,
206
+ this->display_lim_low,
207
+ this->display_lim_high,
208
+ this->display_mean,
209
+ this->display_sd,
210
+ this->subset_common,
211
+ this->perc_in_subset,
212
+ this->perc_next_most_comm,
213
+ this->cluster_mean,
214
+ this->cluster_sd,
215
+ this->score_categ
216
+ );
217
+ }
218
+
219
+ /* this is for serialization with both cereal and cython auto-pickle */
220
+ Cluster() = default;
221
+
222
+ } Cluster;
223
+
224
+ /*
225
+ * Trees that host the aforementioned clusters. These work as follows:
226
+ * - Each tree contains a split column and condition for splitting.
227
+ * - The trees that follow them are specified in tree_left/right/NA.
228
+ * - If the tree is dropped or not used, that branch gets an index of zero.
229
+ * - The child tree will however remember which branch it took.
230
+ * - At prediction time, the output will tell into which cluster and which tree
231
+ * is each row an outlier (if they fall into any).
232
+ * - The exact conditions are reconstructed by following the trees backwards
233
+ * (i.e. first the cluster, then deepest tree, then follow parent tree until root).
234
+ * This way, all the necessary information can be obtained without storing redundant
235
+ * info, and without needing to reconstruct the conditions as the 'predict'
236
+ * function is being called (which makes it easier to wrap into other languages).
237
+ * - At prediction time, as the observation is passed down trees, all the clusters
238
+ * in all those trees have to be tested for (so if a cluster is discarded, it can
239
+ * keep only one branch of its split in the struct).
240
+ * - As a side effect, in ordinal columns, the trees cannot be simplified to 'Equal'.
241
+ * - All of this is ignored when using 'follow_all', in which case the trees work just
242
+ * like the clusters, with an array 'all_branches' which contains all trees that have
243
+ * to be followed from one particular tree.
244
+ */
245
+ typedef struct ClusterTree {
246
+ size_t parent = 0; /* index in a vector */
247
+ SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
248
+ std::vector<size_t> clusters = std::vector<size_t>(); /* these clusters define additional splits */
249
+
250
+ SplitType split_this_branch = Root; /* when using 'follow_all' */
251
+ std::vector<size_t> all_branches = std::vector<size_t>(); /* when using 'follow_all' */
252
+
253
+ ColType column_type = NoType;
254
+ size_t col_num = 0;
255
+ double split_point = HUGE_VAL;
256
+ std::vector<char> split_subset = std::vector<char>();
257
+ int split_lev = INT_MAX;
258
+
259
+ size_t tree_NA = 0; /* binary splits */
260
+ size_t tree_left = 0; /* binary splits */
261
+ size_t tree_right = 0; /* binary splits */
262
+ std::vector<size_t> binary_branches = std::vector<size_t>(); /* multiple splits (single category or binarized categories) */
263
+
264
+ ClusterTree(size_t parent, SplitType parent_branch)
265
+ {
266
+ this->parent = parent;
267
+ this->parent_branch = parent_branch;
268
+ }
269
+
270
+ /* when using 'follow_all' */
271
+ ClusterTree(size_t parent, size_t col_num, double split_point, SplitType split_this_branch)
272
+ {
273
+ this->parent = parent;
274
+ this->col_num = col_num;
275
+ this->column_type = Numeric;
276
+ this->split_this_branch = split_this_branch;
277
+ this->split_point = split_point;
278
+ }
279
+
280
+ ClusterTree(size_t parent, size_t col_num, int split_lev, SplitType split_this_branch)
281
+ {
282
+ this->parent = parent;
283
+ this->col_num = col_num;
284
+ this->column_type = Ordinal;
285
+ this->split_this_branch = split_this_branch;
286
+ this->split_lev = split_lev;
287
+ }
288
+
289
+ ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
290
+ {
291
+ this->parent = parent;
292
+ this->col_num = col_num;
293
+ this->column_type = Categorical;
294
+ if (split_this_branch != IsNa) {
295
+ this->split_this_branch = split_this_branch;
296
+ this->split_subset.assign(split_subset, split_subset + ncat);
297
+ this->split_subset.shrink_to_fit();
298
+ } else {
299
+ this->split_this_branch = IsNa;
300
+ }
301
+ }
302
+
303
+ ClusterTree(size_t parent, size_t col_num, int cat_chosen)
304
+ {
305
+ this->parent = parent;
306
+ this->col_num = col_num;
307
+ this->column_type = Categorical;
308
+ this->split_this_branch = Equal;
309
+ this->split_lev = cat_chosen;
310
+ }
311
+
312
+ /* this is for serialization with cereal */
313
+ template<class Archive>
314
+ void serialize(Archive &archive)
315
+ {
316
+ archive(
317
+ this->parent,
318
+ this->parent_branch,
319
+ this->clusters,
320
+ this->split_this_branch,
321
+ this->all_branches,
322
+ this->column_type,
323
+ this->col_num,
324
+ this->split_point,
325
+ this->split_subset,
326
+ this->split_lev,
327
+ this->tree_NA,
328
+ this->tree_left,
329
+ this->tree_right,
330
+ this->binary_branches
331
+ );
332
+ }
333
+
334
+ /* this is for serialization with both cereal and cython auto-pickle */
335
+ ClusterTree() = default;
336
+
337
+ } ClusterTree;
338
+
339
+ /* these are needed for prediction time, and are thus returned from the function that fits the model */
340
+ typedef struct ModelOutputs {
341
+ std::vector< std::vector<ClusterTree> > all_trees; /* clusters in which observations can be outliers, required for prediction time */
342
+ std::vector< std::vector<Cluster> > all_clusters; /* decision trees that host the clusters, required for prediction time */
343
+ std::vector<double> outlier_scores_final; /* if an outlier is flagged, this indicates its score (lower is more outlier) as an upper probability bound */
344
+ std::vector<size_t> outlier_clusters_final; /* if an outlier is flagged, this indicates the most suitable cluster in which to flag it as outlier */
345
+ std::vector<size_t> outlier_columns_final; /* if an outlier is flagged, this indicates the column that makes it an outlier */
346
+ std::vector<size_t> outlier_trees_final; /* if an outlier is flagged, this indicates the tree under which the cluster is found */
347
+ std::vector<size_t> outlier_depth_final; /* if an outlier is flagged, this indicates the split depth under which the cluster is found */
348
+ std::vector<int> outlier_decimals_distr; /* if an outlier is flagged, and it's a numeric column, this will indicate how many decimals to print for it */
349
+ std::vector<size_t> start_ix_cat_counts; /* this is to determine where to index the proportions */
350
+ std::vector<long double> prop_categ; /* this is just for statistics to show, it's not used for anything */
351
+ std::vector<ColTransf> col_transf; /* tells whether each numerical columns underwent log/exp transformations */
352
+ std::vector<double> transf_offset; /* value subtracted for log transform, mean subtracted for exp transform */
353
+ std::vector<double> sd_div; /* standard deviation with which exp-transformed columns were standardized */
354
+ std::vector<int> min_decimals_col; /* number of decimals to show for split conditions in numeric columns */
355
+ std::vector<int> ncat; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
356
+ std::vector<int> ncat_ord; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
357
+ size_t ncols_numeric; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
358
+ size_t ncols_categ; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
359
+ size_t ncols_ord; /* copied from the inputs, used to determine at prediction time if a category is out-of-range and skip */
360
+ std::vector<double> min_outlier_any_cl; /* redundant info which speeds up prediction */
361
+ std::vector<double> max_outlier_any_cl; /* redundant info which speeds up prediction */
362
+ std::vector<std::vector<bool>> cat_outlier_any_cl; /* redundant info which speeds up prediction */
363
+ size_t max_depth; /* redundant info which speeds up prediction */
364
+
365
+
366
+ /* this is for serialization with cereal */
367
+ template<class Archive>
368
+ void serialize(Archive &archive)
369
+ {
370
+ archive(
371
+ this->all_trees,
372
+ this->all_clusters,
373
+ this->outlier_scores_final,
374
+ this->outlier_clusters_final,
375
+ this->outlier_columns_final,
376
+ this->outlier_trees_final,
377
+ this->outlier_depth_final,
378
+ this->start_ix_cat_counts,
379
+ this->prop_categ,
380
+ this->col_transf,
381
+ this->transf_offset,
382
+ this->sd_div,
383
+ this->ncat,
384
+ this->ncat_ord,
385
+ this->ncols_numeric,
386
+ this->ncols_categ,
387
+ this->ncols_ord,
388
+ this->min_outlier_any_cl,
389
+ this->max_outlier_any_cl,
390
+ this->cat_outlier_any_cl,
391
+ this->max_depth
392
+ );
393
+ }
394
+
395
+ /* this is for serialization with both cereal and cython auto-pickle */
396
+ ModelOutputs() = default;
397
+
398
+ } ModelOutputs;
399
+
400
+ /*
401
+ * Note: the vectors with proportions in these structs are supposed to be all small numbers so 'long double' is an overkill for them
402
+ * and does not make them translate into SIMD instructions in regular x86-64 CPUs, but if setting them as 'double' and then doing casts
403
+ * from/between 'double' and the 'size_t' and 'long double's of other arrays (such as in function 'find_outlier_categories'), comparisons
404
+ * such as '<=' will oftentimes fail even with small counts - this is an example that will fail when mixing the 3 types together:
405
+ * >>> (2 / (88+1)) * 0.5 <= (1 / 89) --> produces FALSE (right answer is TRUE)
406
+ * All due to decimals (in that example) right of the 10th digit, and ends up creating categorical clusters that it should not create.
407
+ * So don't change them back to regular 'double', or if necessary, change every 'long double' to 'double' too.
408
+ */
409
+
410
+ /******************************************
411
+ Prototypes from fit_model.cpp
412
+ (This is the main module from which
413
+ the model is generated)
414
+ *******************************************/
415
+ bool fit_outliers_models(ModelOutputs &model_outputs,
416
+ double *restrict numeric_data, size_t ncols_numeric,
417
+ int *restrict categorical_data, size_t ncols_categ, int *restrict ncat,
418
+ int *restrict ordinal_data, size_t ncols_ord, int *restrict ncat_ord,
419
+ size_t nrows, char *restrict cols_ignore = NULL, int nthreads = 1,
420
+ bool categ_as_bin = true, bool ord_as_bin = true, bool cat_bruteforce_subset = false, bool categ_from_maj = false, bool take_mid = true,
421
+ size_t max_depth = 3, double max_perc_outliers = 0.01, size_t min_size_numeric = 25, size_t min_size_categ = 50,
422
+ double min_gain = 1e-2, bool gain_as_pct = false, bool follow_all = false, double z_norm = 2.67, double z_outlier = 8.0);
423
+
424
+ typedef struct {
425
+
426
+ std::vector<size_t> ix_arr; /* indices from the target column */
427
+ size_t st; /* chunk of the indices to take for current function calls */
428
+ size_t end; /* chunk of the indices to take for current function calls */
429
+ std::vector<double> outlier_scores; /* these hold the model outputs for 1 column before combining them */
430
+ std::vector<size_t> outlier_clusters; /* these hold the model outputs for 1 column before combining them */
431
+ std::vector<size_t> outlier_trees; /* these hold the model outputs for 1 column before combining them */
432
+ std::vector<size_t> outlier_depth; /* these hold the model outputs for 1 column before combining them */
433
+ size_t target_col_num; /* if categorical or ordinal, gets subtracted the number of numeric columns (used to index other arrays) */
434
+ long double sd_y; /* numerical only (standard deviation before splitting) */
435
+ double mean_y; /* numerical only (used to standardize numbers for extra FP precision) */
436
+ long double base_info; /* categorical and ordinal (information before splitting and before binarizing) */
437
+ long double base_info_orig; /* categorical and ordinal (information before splitting and after binarizing if needed) */
438
+ bool log_transf; /* numerical - whether the target variable underwent a logarithmic transformation */
439
+ bool exp_transf; /* numerical - whether the target variable underwent exponentiation on its Z values */
440
+ double *target_numeric_col; /* dynamic pointer */
441
+ int *target_categ_col; /* dynamic pointer */
442
+ std::vector<double> buffer_transf_y; /* if applying logarithm or exponentiation, transformed values are stored here */
443
+ std::vector<int> buffer_bin_y; /* if binarizing, transformed values are stored here */
444
+ std::vector<Cluster> *clusters; /* dynamic pointer, don't change to reference as it otherwise cannot be reassigned */
445
+ std::vector<ClusterTree> *tree; /* dynamic pointer, don't change to reference as it otherwise cannot be reassigned */
446
+ bool has_outliers; /* temporary variable from which the other two are updated */
447
+ bool lev_has_outliers; /* whether the particular depth level has outliers (if so, wil remove them at the end before new split) */
448
+ bool col_has_outliers; /* whether there's any outliers in the column (will later merge them into the outputs) */
449
+ double left_tail; /* approximate value where a long left tail ends */
450
+ double right_tail; /* approximate value where a long right tail ends */
451
+
452
+ bool col_is_bin; /* whether the target categorical/ordinal column has 2 categories or has been forcibly binarized */
453
+ long double *prop_small_this; /* dynamic pointer */
454
+ long double *prior_prob; /* dynamic pointer */
455
+
456
+ double orig_mean; /* value to reconstruct originals from exponentiated */
457
+ double orig_sd; /* value to reconstruct originals from exponentiated */
458
+ double log_minval; /* value to reconstruct originals from logarithms */
459
+ double *orig_target_col; /* column as it was before applying log/exp (dynamic pointer) */
460
+ int *untransf_target_col; /* column as it was before forcibly binarizing (dynamic pointer) */
461
+ int *temp_ptr_x; /* dynamic pointer */
462
+
463
+ std::vector<char> buffer_subset_categ_best; /* categorical split that gave the best gain */
464
+ long double this_gain; /* buffer where to store gain */
465
+ double this_split_point; /* numeric split threshold */
466
+ int this_split_lev; /* ordinal split threshold */
467
+ size_t this_split_ix; /* index at which the data is partitioned */
468
+ size_t this_split_NA; /* index at which the non-NA values start */
469
+ long double best_gain; /* buffer where to store the info of the splitting column that produced the highest gain */
470
+ ColType column_type_best; /* buffer where to store the info of the splitting column that produced the highest gain */
471
+ double split_point_best; /* buffer where to store the info of the splitting column that produced the highest gain */
472
+ int split_lev_best; /* buffer where to store the info of the splitting column that produced the highest gain */
473
+ size_t col_best; /* buffer where to store the info of the splitting column that produced the highest gain */
474
+
475
+ std::vector<long double> buffer_cat_sum; /* buffer arrays where to allocate values required by functions and not used outside them */
476
+ std::vector<long double> buffer_cat_sum_sq; /* buffer arrays where to allocate values required by functions and not used outside them */
477
+ std::vector<size_t> buffer_crosstab; /* buffer arrays where to allocate values required by functions and not used outside them */
478
+ std::vector<size_t> buffer_cat_cnt; /* buffer arrays where to allocate values required by functions and not used outside them */
479
+ std::vector<size_t> buffer_cat_sorted; /* buffer arrays where to allocate values required by functions and not used outside them */
480
+ std::vector<char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
481
+ std::vector<char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
482
+ std::vector<long double> buffer_sd; /* used for a more numerically-stable two-pass gain calculation */
483
+
484
+ bool drop_cluster; /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
485
+ bool already_split_main; /* when binarizing categoricals/ordinals, avoid attempting the same split with numerical and ordinals that take the non-binarized data */
486
+ bool target_col_is_ord; /* whether the target column is ordinal (rest is the same as in categoricals) */
487
+ int ncat_this; /* number of categories in the target column */
488
+
489
+ } Workspace;
490
+
491
+ /* info holders to shorten function call arguments */
492
/* Bundle of fitting parameters, kept together to shorten function call arguments.
   Member order is kept as-is and no default values are set, so brace-initialization
   in declaration order keeps working. */
typedef struct ModelParams {
    /* how categorical/ordinal columns are handled */
    bool   categ_as_bin;
    bool   ord_as_bin;
    bool   cat_bruteforce_subset;
    bool   categ_from_maj;
    bool   take_mid;

    /* tree size and minimum sample sizes */
    size_t max_depth;
    double max_perc_outliers;
    size_t min_size_numeric;
    size_t min_size_categ;

    /* split acceptance */
    double min_gain;
    bool   gain_as_pct;
    bool   follow_all;

    /* z-value thresholds */
    double z_norm;
    double z_outlier;
    double z_tail;

    /* this is not a parameter, but a shared array determined from the parameters and data */
    std::vector<long double> prop_small;
} ModelParams;
510
+
511
+ /* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
512
+ typedef struct {
513
+ double *restrict numeric_data; size_t ncols_numeric;
514
+ int *restrict categorical_data; size_t ncols_categ; int *restrict ncat;
515
+ int *restrict ordinal_data; size_t ncols_ord; int *restrict ncat_ord;
516
+ size_t nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ;
517
+ std::vector<size_t> cat_counts;
518
+ } InputData;
519
+
520
+
521
+ void process_numeric_col(std::vector<Cluster> &cluster_root,
522
+ std::vector<ClusterTree> &tree_root,
523
+ size_t target_col_num,
524
+ Workspace &workspace,
525
+ InputData &input_data,
526
+ ModelParams &model_params,
527
+ ModelOutputs &model_outputs);
528
+ void recursive_split_numeric(Workspace &workspace,
529
+ InputData &input_data,
530
+ ModelParams &model_params,
531
+ size_t curr_depth, bool is_NA_branch);
532
+ void process_categ_col(std::vector<Cluster> &cluster_root,
533
+ std::vector<ClusterTree> &tree_root,
534
+ size_t target_col_num, bool is_ord,
535
+ Workspace &workspace,
536
+ InputData &input_data,
537
+ ModelParams &model_params,
538
+ ModelOutputs &model_outputs);
539
+ void recursive_split_categ(Workspace &workspace,
540
+ InputData &input_data,
541
+ ModelParams &model_params,
542
+ size_t curr_depth, bool is_NA_branch);
543
+
544
+
545
+ /*******************************************
546
+ Prototypes from predict.cpp
547
+ (This is the module from which
548
+ new data can be flagged as outliers)
549
+ ********************************************/
550
+ typedef struct {
551
+ double *restrict numeric_data;
552
+ int *restrict categorical_data;
553
+ int *restrict ordinal_data;
554
+ size_t nrows;
555
+ } PredictionData;
556
+
557
+ bool find_new_outliers(double *restrict numeric_data,
558
+ int *restrict categorical_data,
559
+ int *restrict ordinal_data,
560
+ size_t nrows, int nthreads, ModelOutputs &model_outputs);
561
+ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, size_t curr_tree, size_t curr_depth,
562
+ size_t_for row, size_t_for col, bool col_is_num, double num_val_this, int cat_val_this);
563
+ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr_depth, size_t curr_tree,
564
+ ModelOutputs &model_outputs, PredictionData &prediction_data, size_t_for row, size_t_for col,
565
+ bool col_is_num, double num_val_this, int cat_val_this);
566
+
567
+
568
+ /********************************
569
+ Prototypes from split.cpp
570
+ *********************************/
571
#define SD_REG 1e-5 /* Regularization for standard deviation estimation */

/* Running totals for the observations that fall in one branch of a split.
   Kept as a plain aggregate so that it can be brace-initialized as {cnt, sum, sum_sq}. */
typedef struct NumericBranch {
    size_t cnt;
    long double sum;
    long double sum_sq;
} NumericBranch;

/* Totals for the three branches of a split, all starting at zero.
   The struct carries a tag name because an unnamed struct that is only named through
   a typedef is not allowed to have default member initializers. */
typedef struct NumericSplit {
    NumericBranch NA_branch = {0, 0, 0};
    NumericBranch left_branch = {0, 0, 0};
    NumericBranch right_branch = {0, 0, 0};
} NumericSplit;
584
+
585
+ typedef struct {
586
+ size_t *restrict NA_branch; /* array of counts of the target variable's categories */
587
+ size_t *restrict left_branch; /* array of counts of the target variable's categories */
588
+ size_t *restrict right_branch; /* array of counts of the target variable's categories */
589
+ size_t ncat; /* number of categories/entries in the arrays above */
590
+ size_t tot; /* size_NA + size_left + size_right */
591
+ size_t size_NA = 0;
592
+ size_t size_left = 0;
593
+ size_t size_right = 0;
594
+ } CategSplit;
595
+
596
+ void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, bool onehot[]);
597
+ size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x);
598
+ void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
599
+ long double calc_sd(size_t cnt, long double sum, long double sum_sq);
600
+ long double calc_sd(NumericBranch &branch);
601
+ long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean);
602
+ long double numeric_gain(NumericSplit &split_info, long double tot_sd);
603
+ long double numeric_gain(long double tot_sd, long double info_left, long double info_right, long double info_NA, long double cnt);
604
+ long double total_info(size_t categ_counts[], size_t ncat);
605
+ long double total_info(size_t categ_counts[], size_t ncat, size_t tot);
606
+ long double total_info(size_t *restrict ix_arr, int *restrict x, size_t st, size_t end, size_t ncat, size_t *restrict buffer_cat_cnt);
607
+ long double categ_gain(CategSplit split_info, long double base_info);
608
+ long double categ_gain(size_t *restrict categ_counts, size_t ncat, size_t *restrict ncat_col, size_t maxcat, long double base_info, size_t tot);
609
+ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size_t st, size_t st_non_na, size_t split_ix, size_t end,
610
+ size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
611
/* Split-search declarations. The function names encode the type of the splitting column (x) and of the
 * target column (y): numeric, categorical ("categ"), ordinal ("ord") or binary ("bin").
 * Shared parameters, as visible in the signatures:
 *   ix_arr, st, end   - index array and the range of it to work on;
 *   x, y              - column data (double* for numeric columns, int* for categorical/ordinal ones);
 *   sd_y / base_info  - a baseline quantity for the target passed in by the caller;
 *   buffer_*          - caller-provided work arrays;
 *   has_na, min_size  - missing-value flag and a minimum size constraint.
 * All functions return void. NOTE(review): the pointer parameters gain, split_point, split_subset,
 * split_left and split_NA are presumably outputs describing the best split found - confirm against split.cpp.
 * split_categx_categy_separate takes only `gain` out of that group. */
+ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
612
+ long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
613
+ long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
614
+ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
615
+ bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
616
+ long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
617
+ bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point);
618
+ void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
619
+ size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
620
+ bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
621
+ size_t *restrict split_left, size_t *restrict split_NA);
622
+ void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
623
+ size_t ncat_y, size_t ncat_x, long double base_info,
624
+ size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
625
+ bool has_na, size_t min_size, long double *gain, int *split_point);
626
+ void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
627
+ size_t ncat_x, long double base_info,
628
+ size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
629
+ bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
630
+ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
631
+ size_t ncat_x, size_t ncat_y, long double base_info,
632
+ size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
633
+ bool has_na, size_t min_size, long double *gain);
634
+ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
635
+ size_t ncat_x, size_t ncat_y, long double base_info,
636
+ size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
637
+ bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
638
+
639
+
640
+
641
+ /***********************************
642
+ Prototypes from clusters.cpp
643
+ ************************************/
644
/* calculate_max_outliers(n, perc)
 *   Expands to  n*perc + 2*sqrtl(n*perc*(1 - perc)) + 1  evaluated in long double, i.e. the expected
 *   count n*perc plus two binomial standard deviations plus one.
 *   Every use of each argument is parenthesized so that expression arguments (e.g. `a + b`)
 *   expand with the intended precedence. Arguments are evaluated more than once, so they must
 *   not have side effects. Requires `sqrtl` (from <cmath> / <math.h>) to be visible at the point of use. */
#define calculate_max_outliers(n, perc) ( (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - (perc)) ) + (long double)1 )

/* z_score(x, mu, sd): standardized distance of x from mu in units of sd, (x - mu) / sd. */
#define z_score(x, mu, sd) ( ((x) - (mu)) / (sd) )

/* chebyshyov_bound(sd): 1 / square(sd). Relies on a `square` helper that must already be defined
 * wherever this macro is expanded. The spelling of the macro name is kept as is because it is
 * referenced from other files. */
#define chebyshyov_bound(sd) (1.0 / square(sd))
647
+
648
/* Cluster-definition declarations.
 * Shared parameters, as visible in the signatures:
 *   x, ix_arr, st, end                 - column data, index array and the range of it to work on;
 *   outlier_scores / outlier_clusters /
 *   outlier_trees / outlier_depth      - per-row arrays (NOTE(review): presumably updated in place when a
 *                                        row is flagged - confirm against clusters.cpp);
 *   cluster, clusters                  - the cluster being defined and the vector holding the clusters;
 *   cluster_num, tree_num, tree_depth  - identifiers of the cluster/tree and the tree depth;
 *   max_perc_outliers, z_norm, z_outlier - thresholds supplied by the caller.
 * define_numerical_cluster additionally receives flags and parameters describing log/exp transformations
 * of the column plus the untransformed data (orig_x).
 * define_categ_cluster_no_cond is the variant without tree/cluster-number arguments and returns void.
 * NOTE(review): the meaning of the bool return values is not visible here - confirm. */
+ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
649
+ double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
650
+ size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
651
+ bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
652
+ double left_tail, double right_tail, double *restrict orig_x,
653
+ double max_perc_outliers, double z_norm, double z_outlier);
654
+ void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
655
+ double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
656
+ size_t *restrict outlier_depth, Cluster &cluster,
657
+ size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
658
+ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
659
+ double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
660
+ size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
661
+ size_t cluster_num, size_t tree_num, size_t tree_depth,
662
+ double max_perc_outliers, double z_norm, double z_outlier,
663
+ long double *restrict perc_threshold, long double *restrict prop_prior,
664
+ size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
665
+ size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
666
+ bool *restrict drop_cluster);
667
/* Post-processing declarations for clusters and trees.
 * simplify_when_equal_cond is overloaded for a vector of Cluster and for a vector of ClusterTree;
 * both also take the `ncat_ord` array.
 * prune_unused_trees is only declared when TEST_MODE_DEFINE is defined.
 * calculate_cluster_minimums / calculate_cluster_poss_categs operate on a ModelOutputs object for a
 * given column index (the latter also takes a second index, col_rel).
 * NOTE(review): what each routine modifies is not visible from the declarations - confirm against clusters.cpp. */
+ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]);
668
+ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]);
669
+ #ifdef TEST_MODE_DEFINE
670
+ void prune_unused_trees(std::vector<ClusterTree> &trees);
671
+ #endif
672
+ bool check_tree_is_not_needed(ClusterTree &tree);
673
+ void calculate_cluster_minimums(ModelOutputs &model_outputs, size_t col);
674
+ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size_t col_rel);
675
+
676
+
677
+ /**************************************
678
+ Prototypes from cat_outlier.cpp
679
+ ***************************************/
680
/* calculate_max_cat_outliers(n, perc, z_norm)
 *   Expands to  1 + (n * perc / z_norm)  evaluated in long double.
 *   Note: this is not in any way probabilistic, nor based on provable bounds.
 *   Every argument is parenthesized so that expression arguments (e.g. `a + b` passed as z_norm)
 *   expand with the intended precedence. */
#define calculate_max_cat_outliers(n, perc, z_norm) ( (long double)1 + ((n) * (perc) / (z_norm)) )
681
/* Categorical-outlier search declarations. All three take a per-category count array
 * (categ_counts, length ncateg), the total count `tot` and a per-category `is_outlier` array.
 *   - find_outlier_categories: additionally takes per-category thresholds (perc_threshold),
 *     two work buffers (buffer_ix, buffer_perc) and z_norm.
 *   - find_outlier_categories_by_maj: additionally takes prior probabilities and z_outlier, and has
 *     a `categ_maj` pointer parameter.
 *   - find_outlier_categories_no_cond: the variant with no thresholds; returns bool.
 * NOTE(review): found_outliers, new_is_outlier, next_most_comm and categ_maj are pointers and are
 * presumably outputs - confirm against cat_outlier.cpp. */
+ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
682
+ long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
683
+ double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
684
+ void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
685
+ long double prior_prob[], double z_outlier, char is_outlier[],
686
+ bool *found_outliers, bool *new_is_outlier, int *categ_maj);
687
+ bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
688
+ char is_outlier[], double *next_most_comm);
689
+
690
+
691
+
692
+ /*************************************************
693
+ Prototypes from misc.cpp and other structs
694
+ **************************************************/
695
+
696
/* RecursionState: plain bundle of values used together with a Workspace by
 * backup_recursion_state() and restore_recursion_state(), which are declared further down in this header.
 * NOTE(review): judging by the "_restore" suffixes, each member presumably mirrors a Workspace field
 * that is saved before recursing and written back afterwards - confirm against misc.cpp.
 * The two std::vector<char> members own their storage, so copying a RecursionState copies those buffers;
 * temp_ptr_x is a raw pointer (NOTE(review): presumably non-owning - confirm). */
+ /* an inefficient workaround for coding up option 'follow_all' */
697
+ typedef struct {
698
+ double gain_restore;
699
+ double gain_best_restore;
700
+ double split_point_restore;
701
+ int split_lev_restore;
702
+ std::vector<char> split_subset_restore;
703
+ size_t ix1_restore;
704
+ size_t ix2_restore;
705
+ size_t ix3_restore;
706
+ size_t ix4_restore;
707
+ int * temp_ptr_x;
708
+ size_t col_best_restore;
/* NOTE(review): "rememer" looks like a typo for "remember"; the name is kept because code outside
 * this section may reference the member. */
709
+ ColType col_type_best_rememer;
710
+ double split_point_best_restore;
711
+ int split_lev_best_restore;
712
+ std::vector<char> split_subset_best_restore;
713
+ long double base_info_restore;
714
+ long double base_info_orig_restore;
715
+ double sd_y_restore;
716
+ bool has_outliers_restore;
717
+ bool lev_has_outliers_restore;
718
+ } RecursionState;
719
+
720
+
721
/* Data-preparation declarations.
 *   - calculate_category_indices: takes per-column category counts (ncat) and a skip mask; `max_categ`
 *     defaults to 0 when omitted. Returns int.
 *     NOTE(review): presumably fills start_ix_cat_counts with per-column offsets and returns the largest
 *     category count - confirm against misc.cpp.
 *   - calculate_all_cat_counts / check_cat_col_unsplittable: work on the flattened cat_counts array
 *     addressed through start_ix_cat_counts; both take an `nthreads` argument.
 *   - calculate_lowerlim_proportion: takes two long double arrays (prop_small, prop) plus z_norm and z_tail.
 *   - check_missing_no_variance: scans numeric columns; takes has_NA, skip_col and min_decimals arrays.
 *   - calc_central_mean_and_sd: has two double* parameters (mean_central, sd_central), presumably outputs.
 *   - check_for_tails: has pointer parameters left_tail, right_tail, exp_transf and log_transf,
 *     presumably outputs (NOTE(review): confirm). */
+ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ = 0);
722
+ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
723
+ int categorical_data[], size_t ncols, size_t nrows,
724
+ bool has_NA[], bool skip_col[], int nthreads);
725
+ void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
726
+ size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads);
727
+ void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
728
+ size_t start_ix_cat_counts[], size_t cat_counts[],
729
+ size_t ncols, size_t nrows, double z_norm, double z_tail);
730
+ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
731
+ bool skip_col[], int min_decimals[], int nthreads);
732
+ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central);
733
+ void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
734
+ double z_norm, double max_perc_outliers,
735
+ double *restrict buffer_x, double mean, double sd,
736
+ double *restrict left_tail, double *restrict right_tail,
737
+ bool *exp_transf, bool *log_transf);
738
/* Index-array partitioning declarations; all operate on ix_arr between st and end.
 *   - move_outliers_to_front / move_NAs_to_front return a size_t.
 *     NOTE(review): presumably the position where the remaining rows start - confirm against misc.cpp.
 *     move_NAs_to_front is overloaded for double data (with an inf_as_NA flag) and for int data.
 *   - divide_subset_split has three overloads, selected by the kind of split description:
 *       double split_point            - threshold on a numeric column;
 *       char subset_categ[], int ncat - category subset on an int column;
 *       int split_lev                 - level threshold on an int column.
 *     Each has two size_t* parameters, split_NA and st_right (NOTE(review): presumably outputs - confirm). */
+ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t st, size_t end);
739
+ size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA);
740
+ size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end);
741
+ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right);
742
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
743
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right);
744
/* Workspace, tree and model-output management declarations.
 *   - check_workspace_is_allocated / allocate_thread_workspace: query and set up a Workspace
 *     (the latter sized by nrows and max_categ).
 *   - backup_recursion_state / restore_recursion_state: take a Workspace and a RecursionState by reference.
 *   - set_tree_as_numeric / set_tree_as_categorical / set_tree_as_ordinal: configure a ClusterTree for a
 *     column `col`; set_tree_as_categorical has three overloads (with ncat and split_subset, with col only,
 *     and with col and ncat - note the different argument order between the first and the last).
 *   - forget_row_outputs / allocate_row_outputs / dealloc_ModelOutputs: manage storage held by a ModelOutputs.
 *     NOTE(review): exactly which members are released or allocated is not visible here - confirm.
 *   - check_more_two_values: takes a numeric array (nrows x ncols) and a per-column char array too_few_values.
 *   - calc_min_decimals_to_print / decimals_diff: helpers related to how many decimals to print
 *     (NOTE(review): inferred from the names - confirm). */
+ bool check_workspace_is_allocated(Workspace &workspace);
745
+ void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
746
+ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup);
747
+ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
748
+ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
749
+ void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col);
750
+ void set_tree_as_categorical(ClusterTree &tree, size_t col);
751
+ void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat);
752
+ void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
753
+ void forget_row_outputs(ModelOutputs &model_outputs);
754
+ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth);
755
+ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[]);
756
+ void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
757
+ int decimals_diff(double val1, double val2);
758
+ void dealloc_ModelOutputs(ModelOutputs &model_outputs);