outliertree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,685 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
10
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
11
+ * arXiv preprint arXiv:2001.00636 (2020).
12
+ *
13
+ *
14
+ * Copyright 2020 David Cortes.
15
+ *
16
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
+ * such as R or Python.
18
+ *
19
+ * This file is part of OutlierTree.
20
+ *
21
+ * OutlierTree is free software: you can redistribute it and/or modify
22
+ * it under the terms of the GNU General Public License as published by
23
+ * the Free Software Foundation, either version 3 of the License, or
24
+ * (at your option) any later version.
25
+ *
26
+ * OutlierTree is distributed in the hope that it will be useful,
27
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
+ * GNU General Public License for more details.
30
+ *
31
+ * You should have received a copy of the GNU General Public License
32
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
33
+ ********************************************************************************************************************/
34
+ #include "outlier_tree.hpp"
35
+
36
+
37
/*
 * Fill 'start_ix_cat_counts' with the offset at which each column's category counts
 * will begin inside one contiguous counts array. Each column reserves ncat[col] + 1
 * slots, the extra slot being used for the column's NA count. Columns with fewer
 * than two categories are flagged as unsplittable through 'skip_col'.
 * Returns the largest category count seen, starting from the 'max_categ' passed in.
 */
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ)
{
    for (size_t col = 0; col < ncols; col++) {
        int ncat_col = ncat[col];
        if (ncat_col > max_categ) max_categ = ncat_col;
        /* one extra slot per column for the NA counter */
        start_ix_cat_counts[col + 1] = start_ix_cat_counts[col] + ncat_col + 1;
        if (ncat_col < 2) skip_col[col] = true;
    }
    return max_categ;
}
52
+
53
+ /* Save the counts of each category for each column in the array determined above */
54
+ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
55
+ int categorical_data[], size_t ncols, size_t nrows,
56
+ bool has_NA[], bool skip_col[], int nthreads)
57
+ {
58
+ size_t col_st_offset;
59
+ size_t col_stop;
60
+
61
+ #pragma omp parallel for schedule(static, 1) num_threads(nthreads) private(col_st_offset, col_stop)
62
+ for (size_t_for col = 0; col < ncols; col++) {
63
+
64
+ if (skip_col[col]) continue;
65
+
66
+ col_st_offset = start_ix_cat_counts[col];
67
+ col_stop = (col + 1) * nrows;
68
+ for (size_t row = (col * nrows); row < col_stop; row++) {
69
+
70
+ if (categorical_data[row] >= 0) {
71
+ cat_counts[ categorical_data[row] + col_st_offset ]++;
72
+ } else {
73
+ cat_counts[ ncat[col] + col_st_offset ]++;
74
+ has_NA[col] = true;
75
+ }
76
+
77
+ }
78
+ }
79
+ }
80
+
81
+ /* Check if some column has a large majority that would make any split fail to meet minimum sizes */
82
+ void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
83
+ size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads)
84
+ {
85
+ size_t largest_cnt;
86
+ #pragma omp parallel for num_threads(nthreads) private(largest_cnt) shared(ncols, nrows, ncat, cat_counts, start_ix_cat_counts, min_conditioned_size, skip_col)
87
+ for (size_t_for col = 0; col < ncols; col++) {
88
+
89
+ largest_cnt = 0;
90
+ for (int cat = 0; cat <= ncat[col]; cat++) {
91
+ largest_cnt = std::max(largest_cnt, cat_counts[ cat + start_ix_cat_counts[col] ]);
92
+ }
93
+ if (largest_cnt > (nrows - min_conditioned_size)) skip_col[col] = true;
94
+ if (largest_cnt <= 1) skip_col[col] = true;
95
+
96
+ }
97
+ }
98
+
99
+ /* Calculate the maxmimum proportions in a subset below which a category *can* be considered as outlier (must meet other conditions too) */
100
+ void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
101
+ size_t start_ix_cat_counts[], size_t cat_counts[],
102
+ size_t ncols, size_t nrows, double z_norm, double z_tail)
103
+ {
104
+ /* TODO: come up with some way of flagging unseen categories as outliers */
105
+ long double mean;
106
+ long double sd;
107
+ long double nrows_dbl = (long double) nrows;
108
+ for (size_t col = 0; col < ncols; col++) {
109
+
110
+ for (size_t cat = start_ix_cat_counts[col]; cat < (start_ix_cat_counts[col + 1] - 1); cat++) {
111
+
112
+ if (cat_counts[cat] > 0) {
113
+ mean = (long double) cat_counts[cat] / nrows_dbl;
114
+ sd = sqrtl( mean * (1.0 - mean) / nrows_dbl );
115
+ prop_small[cat] = fminl(mean - z_norm * sd, mean * 0.5);
116
+ prop[cat] = mean;
117
+ } else {
118
+ prop_small[cat] = 0;
119
+ prop[cat] = 0;
120
+ }
121
+
122
+ }
123
+
124
+ }
125
+ }
126
+
127
+
128
+ /* Check if a numerical column has no variance (i.e. will not be splittable).
129
+ Along the way, also record the number of decimals to display for this column. */
130
+ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
131
+ bool skip_col[], int min_decimals[], int nthreads)
132
+ {
133
+ long double running_mean;
134
+ long double mean_prev;
135
+ long double running_ssq;
136
+ size_t cnt;
137
+ size_t col_stop;
138
+ double xval;
139
+ double min_val;
140
+ double max_val;
141
+ int min_decimals_col;
142
+
143
+ #pragma omp parallel for schedule(static) num_threads(nthreads) \
144
+ shared(nrows, ncols, numeric_data, has_NA, skip_col, min_decimals) \
145
+ private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
146
+ for (size_t_for col = 0; col < ncols; col++) {
147
+ running_mean = 0;
148
+ mean_prev = 0;
149
+ running_ssq = 0;
150
+ min_val = HUGE_VAL;
151
+ max_val = -HUGE_VAL;
152
+ cnt = 0;
153
+ col_stop = (col + 1) * nrows;
154
+ for (size_t row = col * nrows; row < col_stop; row++) {
155
+ xval = numeric_data[row];
156
+ if (!is_na_or_inf(xval)) {
157
+ running_mean += (xval - running_mean) / (long double)(++cnt);
158
+ running_ssq += (xval - running_mean) * (xval - mean_prev);
159
+ mean_prev = running_mean;
160
+ min_val = fmin(min_val, xval);
161
+ max_val = fmax(max_val, xval);
162
+ } else {
163
+ has_NA[col] = true;
164
+ }
165
+ }
166
+ if ( (running_ssq / (long double)(cnt - 1)) < 1e-6 ) skip_col[col] = true;
167
+ if (cnt > 1) {
168
+ min_decimals_col = 0;
169
+ min_decimals_col = std::max(min_decimals_col, decimals_diff(running_mean, min_val));
170
+ min_decimals_col = std::max(min_decimals_col, decimals_diff(running_mean, max_val));
171
+ min_decimals_col = std::max(min_decimals_col, decimals_diff(0., sqrtl((running_ssq / (long double)(cnt - 1)))));
172
+ min_decimals[col] = min_decimals_col;
173
+ }
174
+ }
175
+ }
176
+
177
/* Compute mean and standard deviation over the central half of the (sorted) data —
   positions [st + size_quarter, end - size_quarter] — then inflate the SD by a
   heuristic factor of 2.5. When 'ix_arr' is NULL, 'x' is read directly by position;
   otherwise positions are mapped through 'ix_arr'. Results go to the out-pointers. */
void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
{
    long double mean = 0;
    long double mean_old = 0;
    long double ssq = 0;
    size_t first = st + size_quarter;
    size_t last  = end - size_quarter;

    /* Welford-style single pass over the central half only */
    for (size_t row = first; row <= last; row++) {
        double v = (ix_arr == NULL)? x[row] : x[ix_arr[row]];
        mean += (v - mean) / (long double)(row - first + 1);
        ssq  += (v - mean) * (v - mean_old);
        mean_old = mean;
    }

    *mean_central = (double) mean;
    *sd_central   = 2.5 * sqrtl(ssq / (long double)(end - st - 2 * size_quarter));
}
203
+
204
+
205
/* Check whether a numerical column has long tails, and whether a transformation is appropriate
 *
 * Will check if there are too many observations with large Z values at either side. If found, will
 * see if applying a transformation (exponentiation for left tail, logarithm for right tail) would
 * solve the problem. If not, will report approximate values for where the tails end/start. If it
 * has tails at both sides, will not process the column.
 *
 * Parameters:
 * - ix_arr[n] (in)
 *       Indices by which the 'x' variable would be sorted in ascending order. (Must be already sorted!!!)
 * - st (in)
 *       Position at which ix_arr starts (inclusive).
 * - end (in)
 *       Position at which ix_arr ends (inclusive).
 * - x[n] (in)
 *       Column with the original values to check for tails.
 * - z_norm (in)
 *       Model parameter. Default is 2.67. (The tail threshold used here is z_tail = 2 * z_norm.)
 * - max_perc_outliers (in)
 *       Model parameter. Default is 0.01.
 * - buffer_x[n] (temp)
 *       Array where to store the transformed values of 'x' (will ignore ix_arr).
 * - mean (in)
 *       Mean to use for transforming to Z scores before exponentiating.
 * - sd (in)
 *       Standard deviation to use for transforming to Z scores before exponentiating.
 * - left_tail (out)
 *       Approximate value at which the left tail ends: stays -HUGE_VAL when no left tail was
 *       detected, becomes HUGE_VAL when a tail was detected but exponentiation solves it,
 *       otherwise holds the boundary value on the untransformed data.
 * - right_tail (out)
 *       Approximate value at which the right tail starts: stays HUGE_VAL when no right tail
 *       was detected or when a log transform solves it, otherwise holds the boundary value.
 * - exp_transf (out)
 *       Whether to apply an exponential transformation (on the Z values!!!) to solve the problem of having a long left tail.
 * - log_transf (out)
 *       Whether to apply a log transform to solve the problem of having a long right tail.
 */
void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
                     double z_norm, double max_perc_outliers,
                     double *restrict buffer_x, double mean, double sd,
                     double *restrict left_tail, double *restrict right_tail,
                     bool *exp_transf, bool *log_transf)
{
    size_t size_quarter = (end - st + 1) / 4;
    size_t tail_ix;
    /* NOTE(review): 'median' is an offset relative to 'st' (2 quarters in) but is compared
       below against absolute positions 'tail_ix' — this looks like it assumes st == 0;
       confirm against callers. */
    size_t median = 2 * size_quarter;
    double z_tail = 2 * z_norm;
    double const_add_log;          /* shift applied so all values are positive before log() */
    *left_tail = -HUGE_VAL;        /* default: no left tail */
    *right_tail = HUGE_VAL;        /* default: no right tail */
    /* maximum number of observations that may acceptably exceed the z_tail threshold */
    size_t max_norm_tail = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
    double mean_central, sd_central;
    calc_central_mean_and_sd(ix_arr, st, end, x, size_quarter, &mean_central, &sd_central);
    *exp_transf = false;
    *log_transf = false;
    /* '1' is used as a temporary "tail detected" flag until the real boundary/sentinel is computed */
    if ( z_score(x[ix_arr[st + max_norm_tail]], mean_central, sd_central) < (-z_tail) ) *left_tail = 1;
    if ( z_score(x[ix_arr[end - max_norm_tail]], mean_central, sd_central) > z_tail ) *right_tail = 1;

    /* check for left tail (too many low values) */
    if (*left_tail == 1) {

        /* check if exponentiation would help */
        for (size_t row = (st + size_quarter); row <= (end - size_quarter); row++)
            buffer_x[row] = exp(z_score(x[ix_arr[row]], mean, sd));
        calc_central_mean_and_sd(NULL, st, end, buffer_x, size_quarter, &mean_central, &sd_central);
        buffer_x[st + max_norm_tail] = exp(z_score(x[ix_arr[st + max_norm_tail]], mean, sd));
        if (z_score(buffer_x[st + max_norm_tail], mean_central, sd_central) >= -z_tail)
        {
            *left_tail = HUGE_VAL;
            *exp_transf = true;
        }

        /* if exponentiation doesn't help, determine where does the tail lie on the untransformed data */
        else {

            *exp_transf = false;
            /* walk up from the lowest value until observations stop looking extreme */
            for (tail_ix = st; tail_ix <= median; tail_ix++) {
                if (z_score(x[ix_arr[tail_ix]], mean_central, sd_central) > (-z_tail)) break;
            }
            *left_tail = x[ix_arr[tail_ix]];

        }

    }

    /* check for right tail (too many high values) */
    if (*right_tail == 1 ) {

        /* pick a shift so the smallest value maps to a positive number before log() */
        if (x[ix_arr[st]] == 0) {
            const_add_log = +1;
        } else {
            const_add_log = - x[ix_arr[st]] + 1e-3;
        }

        /* check if a log transform would help */
        for (size_t row = (st + size_quarter); row <= (end - size_quarter); row++)
            buffer_x[row] = log(x[ix_arr[row]] + const_add_log);
        calc_central_mean_and_sd(NULL, st, end, buffer_x, size_quarter, &mean_central, &sd_central);
        buffer_x[end - max_norm_tail] = log(x[ix_arr[end - max_norm_tail]] + const_add_log);
        if (z_score(buffer_x[end - max_norm_tail], mean_central, sd_central) <= z_tail)
        {
            *right_tail = HUGE_VAL;
            *log_transf = true;
        }

        /* if log transform doesn't help, determine where does the tail lie on the untransformed data */
        else {
            /* walk down from the highest value until observations stop looking extreme */
            for (tail_ix = end; tail_ix >= median; tail_ix--) {
                if (z_score(x[ix_arr[tail_ix]], mean_central, sd_central) < z_tail) break;
            }
            *right_tail = x[ix_arr[tail_ix]];
        }

    }

}
319
+
320
/* Partition ix_arr[st:end] in place so that rows already flagged as outliers
   (outlier score strictly below 1) come first; returns the position at which
   the non-outliers start. */
size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t st, size_t end)
{
    size_t pos = st;

    for (size_t i = st; i <= end; i++) {
        if (outlier_scores[ix_arr[i]] < 1.0) {
            size_t swp = ix_arr[pos];
            ix_arr[pos] = ix_arr[i];
            ix_arr[i]   = swp;
            pos++;
        }
    }
    return pos;
}
337
+
338
+ /* Move missing values of a numeric variable to the front of the indices array and return the position at which non-missing ones start */
339
+ size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA)
340
+ {
341
+ size_t st_non_na = st;
342
+ size_t temp;
343
+
344
+ if (inf_as_NA) {
345
+ for (size_t i = st; i <= end; i++) {
346
+ if (is_na_or_inf(x[ix_arr[i]])) {
347
+ temp = ix_arr[st_non_na];
348
+ ix_arr[st_non_na] = ix_arr[i];
349
+ ix_arr[i] = temp;
350
+ st_non_na++;
351
+ }
352
+ }
353
+ } else {
354
+ for (size_t i = st; i <= end; i++) {
355
+ if (isnan(x[ix_arr[i]])) {
356
+ temp = ix_arr[st_non_na];
357
+ ix_arr[st_non_na] = ix_arr[i];
358
+ ix_arr[i] = temp;
359
+ st_non_na++;
360
+ }
361
+ }
362
+ }
363
+
364
+ return st_non_na;
365
+ }
366
+
367
/* Move indices of missing categorical values (encoded as negative integers) to
   the front of ix_arr[st:end]; returns the position where valid entries start. */
size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end)
{
    size_t pos = st;

    for (size_t i = st; i <= end; i++) {
        if (x[ix_arr[i]] < 0) {   /* negative integer == categorical NA */
            std::swap(ix_arr[pos], ix_arr[i]);
            pos++;
        }
    }
    return pos;
}
383
+
384
+ /* for numerical */
385
+ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right)
386
+ {
387
+ size_t temp;
388
+
389
+ if (has_NA) {
390
+ *split_NA = move_NAs_to_front(ix_arr, x, st, end, false);
391
+ st = *split_NA;
392
+ } else { *split_NA = st; }
393
+ for (size_t row = st; row <= end; row++) {
394
+
395
+ /* move to the left if the category is there */
396
+ if (x[ix_arr[row]] <= split_point) {
397
+ temp = ix_arr[st];
398
+ ix_arr[st] = ix_arr[row];
399
+ ix_arr[row] = temp;
400
+ st++;
401
+ }
402
+ }
403
+
404
+ *st_right = st;
405
+ }
406
+
407
+ /* for categorical */
408
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
409
+ {
410
+ size_t temp;
411
+
412
+ if (has_NA) {
413
+ *split_NA = move_NAs_to_front(ix_arr, x, st, end);
414
+ st = *split_NA;
415
+ } else { *split_NA = st; }
416
+ for (size_t row = st; row <= end; row++) {
417
+
418
+ /* move to the left if the category is there */
419
+ if (subset_categ[ x[ix_arr[row]] ] != 0) {
420
+ temp = ix_arr[st];
421
+ ix_arr[st] = ix_arr[row];
422
+ ix_arr[row] = temp;
423
+ st++;
424
+ }
425
+ }
426
+
427
+ *st_right = st;
428
+ }
429
+
430
+ /* for ordinal */
431
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right)
432
+ {
433
+ size_t temp;
434
+
435
+ if (has_NA) {
436
+ *split_NA = move_NAs_to_front(ix_arr, x, st, end);
437
+ st = *split_NA;
438
+ } else { *split_NA = st; }
439
+ for (size_t row = st; row <= end; row++) {
440
+
441
+ /* move to the left if the category is there */
442
+ if (x[ix_arr[row]] <= split_lev) {
443
+ temp = ix_arr[st];
444
+ ix_arr[st] = ix_arr[row];
445
+ ix_arr[row] = temp;
446
+ st++;
447
+ }
448
+ }
449
+
450
+ *st_right = st;
451
+ }
452
+
453
+ /* thread-local memory where intermediate outputs and buffers are stored */
454
+ bool check_workspace_is_allocated(Workspace &workspace)
455
+ {
456
+ return workspace.ix_arr.size() > 0;
457
+ }
458
+
459
/* Allocate the thread-local buffers used while fitting: per-row arrays for targets,
   indices and outlier outputs, plus per-category scratch space. 'max_categ' is the
   largest category count among all columns; '+ 1' slots account for the NA category. */
void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ)
{
    workspace.buffer_transf_y.resize(nrows);
    workspace.buffer_bin_y.resize(nrows);
    workspace.ix_arr.resize(nrows);
    /* start from the identity permutation of row indices */
    for (size_t i = 0; i < nrows; i++) workspace.ix_arr[i] = i;

    /* per-row outlier outputs */
    workspace.outlier_scores.resize(nrows);
    workspace.outlier_clusters.resize(nrows);
    workspace.outlier_trees.resize(nrows);
    workspace.outlier_depth.resize(nrows);
    workspace.buffer_sd.resize(nrows);

    /* per-category scratch buffers (sized for the widest column) */
    workspace.buffer_cat_sum.resize(max_categ + 1);
    workspace.buffer_cat_sum_sq.resize(max_categ + 1);
    workspace.buffer_cat_cnt.resize( (max_categ + 1) * 3);
    workspace.buffer_cat_sorted.resize(max_categ);
    workspace.buffer_subset_categ.resize(max_categ);

    workspace.buffer_subset_categ_best.resize(max_categ);
    workspace.buffer_crosstab.resize(square(max_categ + 1));
    workspace.buffer_subset_outlier.resize(max_categ);
}
482
+
483
/*
 * This was a quick way of coding up the option 'follow_all' - it basically backs up the modifyable data that
 * is looked at during a recursion, in a rather un-optimal manner. It can be optimized further by not copying
 * everything, as it doesn't really need to always copy all variables (same for the restore function below).
 * For example, at a given point, only one of buffer_subset_categ/this_split_point is used.
 */
void backup_recursion_state(Workspace &workspace, RecursionState &state_backup)
{
    /* NOTE: this field list must remain the exact mirror of restore_recursion_state()
       below; omitting a field would silently corrupt the recursion under 'follow_all' */
    state_backup.gain_restore = workspace.this_gain;
    state_backup.gain_best_restore = workspace.best_gain;
    state_backup.split_point_restore = workspace.this_split_point;
    state_backup.split_lev_restore = workspace.this_split_lev;
    state_backup.split_subset_restore = workspace.buffer_subset_categ;   /* vector copy, not a reference */
    state_backup.ix1_restore = workspace.st;
    state_backup.ix2_restore = workspace.this_split_NA;
    state_backup.ix3_restore = workspace.this_split_ix;
    state_backup.ix4_restore = workspace.end;
    state_backup.col_best_restore = workspace.col_best;
    state_backup.col_type_best_rememer = workspace.column_type_best;
    state_backup.split_point_best_restore = workspace.split_point_best;
    state_backup.split_lev_best_restore = workspace.split_lev_best;
    state_backup.split_subset_best_restore = workspace.buffer_subset_categ_best;
    state_backup.base_info_restore = workspace.base_info;
    state_backup.base_info_orig_restore = workspace.base_info_orig;
    state_backup.sd_y_restore = workspace.sd_y;
    state_backup.has_outliers_restore = workspace.has_outliers;
    state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
    state_backup.temp_ptr_x = workspace.temp_ptr_x;
}
512
+
513
/* Inverse of backup_recursion_state(): copies every backed-up field back into the
   workspace. The field list must remain the exact mirror of the backup function. */
void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
{
    workspace.this_gain = state_backup.gain_restore;
    workspace.best_gain = state_backup.gain_best_restore;
    workspace.this_split_point = state_backup.split_point_restore;
    workspace.this_split_lev = state_backup.split_lev_restore;
    workspace.buffer_subset_categ = state_backup.split_subset_restore;   /* vector copy-assign */
    workspace.st = state_backup.ix1_restore;
    workspace.this_split_NA = state_backup.ix2_restore;
    workspace.this_split_ix = state_backup.ix3_restore;
    workspace.end = state_backup.ix4_restore;
    workspace.col_best = state_backup.col_best_restore;
    workspace.column_type_best = state_backup.col_type_best_rememer;
    workspace.split_point_best = state_backup.split_point_best_restore;
    workspace.split_lev_best = state_backup.split_lev_best_restore;
    workspace.buffer_subset_categ_best = state_backup.split_subset_best_restore;
    workspace.base_info = state_backup.base_info_restore;
    workspace.base_info_orig = state_backup.base_info_orig_restore;
    workspace.sd_y = state_backup.sd_y_restore;
    workspace.has_outliers = state_backup.has_outliers_restore;
    workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
    workspace.temp_ptr_x = state_backup.temp_ptr_x;
}
536
+
537
+ /* Next split on the trees is only decided after they are already initialized */
538
+ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col)
539
+ {
540
+ tree.column_type = Numeric;
541
+ tree.split_point = split_point;
542
+ tree.col_num = col;
543
+ }
544
+
545
+ void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
546
+ {
547
+ tree.column_type = Categorical;
548
+ tree.col_num = col;
549
+ tree.split_subset.assign(split_subset, split_subset + ncat);
550
+ tree.split_subset.shrink_to_fit();
551
+ }
552
+
553
+ void set_tree_as_categorical(ClusterTree &tree, size_t col)
554
+ {
555
+ tree.column_type = Categorical;
556
+ tree.col_num = col;
557
+ tree.split_subset.resize(2);
558
+ tree.split_subset[0] = 1;
559
+ tree.split_subset[1] = 0;
560
+ tree.split_subset.shrink_to_fit();
561
+ }
562
+
563
+ void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat)
564
+ {
565
+ tree.column_type = Categorical;
566
+ tree.col_num = col;
567
+ tree.binary_branches.resize(ncat, 0);
568
+ tree.binary_branches.shrink_to_fit();
569
+ tree.split_subset.shrink_to_fit();
570
+ }
571
+
572
+ void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col)
573
+ {
574
+ tree.column_type = Ordinal;
575
+ tree.split_lev = split_lev;
576
+ tree.col_num = col;
577
+ }
578
+
579
+
580
+
581
+ /* After presenting outliers, it's not necessary to retain their details about column/cluster/tree/etc. */
582
+ void forget_row_outputs(ModelOutputs &model_outputs)
583
+ {
584
+ model_outputs.outlier_scores_final.clear();
585
+ model_outputs.outlier_clusters_final.clear();
586
+ model_outputs.outlier_columns_final.clear();
587
+ model_outputs.outlier_trees_final.clear();
588
+ model_outputs.outlier_depth_final.clear();
589
+ model_outputs.outlier_decimals_distr.clear();
590
+
591
+ model_outputs.outlier_scores_final.shrink_to_fit();
592
+ model_outputs.outlier_clusters_final.shrink_to_fit();
593
+ model_outputs.outlier_columns_final.shrink_to_fit();
594
+ model_outputs.outlier_trees_final.shrink_to_fit();
595
+ model_outputs.outlier_depth_final.shrink_to_fit();
596
+ model_outputs.outlier_decimals_distr.shrink_to_fit();
597
+ }
598
+
599
/* (Re-)allocate the per-row output arrays, one entry per row, with their sentinel
   defaults: score 1.0 means "no outlier found" (flagged rows get scores < 1),
   and depth max_depth + 2 means "not assigned to any tree level". Existing
   contents are discarded first via forget_row_outputs(). */
void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
{
    forget_row_outputs(model_outputs);
    model_outputs.outlier_scores_final.resize(nrows, 1.0);
    model_outputs.outlier_clusters_final.resize(nrows, 0);
    model_outputs.outlier_columns_final.resize(nrows);
    model_outputs.outlier_trees_final.resize(nrows);
    model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
    model_outputs.outlier_decimals_distr.resize(nrows, 0);

    /* trim any spare capacity left over from previous allocations */
    model_outputs.outlier_scores_final.shrink_to_fit();
    model_outputs.outlier_clusters_final.shrink_to_fit();
    model_outputs.outlier_columns_final.shrink_to_fit();
    model_outputs.outlier_trees_final.shrink_to_fit();
    model_outputs.outlier_depth_final.shrink_to_fit();
    model_outputs.outlier_decimals_distr.shrink_to_fit();
}
616
+
617
+ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
618
+ {
619
+ std::vector<std::unordered_set<double>> seen_values(ncols);
620
+
621
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(arr_num, nrows, ncols, too_few_values, seen_values)
622
+ for (size_t_for col = 0; col < ncols; col++) {
623
+ for (size_t row = 0; row < nrows; row++) {
624
+ if (!isnan(arr_num[row + col * nrows]))
625
+ seen_values[col].insert(arr_num[row + col * nrows]);
626
+ if (seen_values[col].size() > 2) break;
627
+ }
628
+ if (seen_values[col].size() <= 2)too_few_values[col] = true;
629
+ }
630
+ }
631
+
632
/* For each row flagged as an outlier in a *numeric* column, compute how many decimal
   digits are needed to visually distinguish the row's value from the cluster's
   displayed mean and from the violated cluster bound, and store it in
   'outlier_decimals_distr'. No-op when there is no numeric data. */
void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads)
{
    if (numeric_data == NULL) return;

    double val_this;
    double val_comp;
    int min_decimals;
    size_t col_this;
    Cluster *cluster_this;
    size_t nrows = model_outputs.outlier_columns_final.size();

    #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
            shared(model_outputs, nrows, numeric_data) \
            private(val_this, val_comp, min_decimals, col_this, cluster_this)
    for (size_t_for row = 0; row < nrows; row++) {
        /* score < 1.0 marks a flagged outlier; numeric columns come first in the
           column numbering, so this filters to numeric-column outliers only */
        if (model_outputs.outlier_scores_final[row] < 1.0 &&
            model_outputs.outlier_columns_final[row] < model_outputs.ncols_numeric
            ) {

            col_this = model_outputs.outlier_columns_final[row];
            cluster_this = &model_outputs.all_clusters[col_this][model_outputs.outlier_clusters_final[row]];
            val_this = numeric_data[row + nrows * col_this];
            val_comp = cluster_this->display_mean;
            /* decimals needed vs. the cluster's displayed mean */
            min_decimals = std::max(0, decimals_diff(val_this, val_comp));

            /* decimals needed vs. whichever display bound the value violates */
            if (val_this >= cluster_this->upper_lim)
                val_comp = cluster_this->display_lim_high;
            else
                val_comp = cluster_this->display_lim_low;
            min_decimals = std::max(min_decimals, decimals_diff(val_this, val_comp));

            model_outputs.outlier_decimals_distr[row] = min_decimals;
        }
    }
}
667
+
668
+ int decimals_diff(double val1, double val2)
669
+ {
670
+ double res = ceil(-log10(fabs(val1 - val2)));
671
+ if (is_na_or_inf(res)) res = 0.;
672
+ return (int) res;
673
+ }
674
+
675
+
676
/* Reason behind this function: Cython (as of v0.29) will not auto-deallocate
   structs which are part of a cdef'd class, which produces a memory leak
   but can be force-destructed. Unfortunately, Cython itself doesn't even
   allow calling destructors for structs, so it has to be done externally.
   This function should otherwise have no reason to exist.

   NOTE(review): explicitly invoking the destructor is only safe if the caller
   guarantees it will not run again on the same object — confirm the Cython
   wrapper upholds this. */
void dealloc_ModelOutputs(ModelOutputs &model_outputs)
{
    model_outputs.~ModelOutputs();
}