outliertree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,685 @@
+ /********************************************************************************************************************
+ * Explainable outlier detection
+ *
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
+ * to "predict" (will not generate a score for each observation).
+ * Splits are based on gain, while outlierness is based on confidence intervals.
+ * Similar in spirit to the GritBot software developed by RuleQuest Research. Reference article is:
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
+ * arXiv preprint arXiv:2001.00636 (2020).
+ *
+ *
+ * Copyright 2020 David Cortes.
+ *
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
+ * such as R or Python.
+ *
+ * This file is part of OutlierTree.
+ *
+ * OutlierTree is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * OutlierTree is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
+ ********************************************************************************************************************/
+ #include "outlier_tree.hpp"
+
+
+ /*
+ * Calculate, in a contiguous array that holds the start indices of the categories of each column in sequence,
+ * the position at which the counts for a given column will start. Note that NAs are stored as the last index in
+ * each column, so each column needs one extra category slot.
+ */
+ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ)
+ {
+     for (size_t col = 0; col < ncols; col++) {
+         max_categ = std::max(ncat[col], max_categ);
+         start_ix_cat_counts[col + 1] = start_ix_cat_counts[col] + ncat[col] + 1;
+         if (ncat[col] < 2) skip_col[col] = true;
+     }
+
+     return max_categ;
+ }
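+
+ /* Example (assuming start_ix_cat_counts[0] starts out as zero): with ncat = {3, 2}, the function
+    above fills start_ix_cat_counts = {0, 4, 7}, so category 'c' of column 'col' is counted at
+    cat_counts[start_ix_cat_counts[col] + c] and its NAs at cat_counts[start_ix_cat_counts[col] + ncat[col]]. */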
+
+ /* Save the counts of each category for each column in the array determined above */
+ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
+                               int categorical_data[], size_t ncols, size_t nrows,
+                               bool has_NA[], bool skip_col[], int nthreads)
+ {
+     size_t col_st_offset;
+     size_t col_stop;
+
+     #pragma omp parallel for schedule(static, 1) num_threads(nthreads) private(col_st_offset, col_stop)
+     for (size_t_for col = 0; col < ncols; col++) {
+
+         if (skip_col[col]) continue;
+
+         col_st_offset = start_ix_cat_counts[col];
+         col_stop = (col + 1) * nrows;
+         for (size_t row = (col * nrows); row < col_stop; row++) {
+
+             if (categorical_data[row] >= 0) {
+                 cat_counts[ categorical_data[row] + col_st_offset ]++;
+             } else {
+                 cat_counts[ ncat[col] + col_st_offset ]++;
+                 has_NA[col] = true;
+             }
+
+         }
+     }
+ }
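+
+ /* Note: 'categorical_data' is stored column-major (ncols blocks of nrows entries), categories are
+    0-based integers, and negative values encode missing entries, which go into the extra slot at
+    position ncat[col] of each column's segment of 'cat_counts'. The 'size_t_for' loop counter type
+    is presumably defined in outlier_tree.hpp for compatibility with OpenMP versions that only
+    accept signed loop indices. */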
+
+ /* Check if some column has a large majority that would make any split fail to meet minimum sizes */
+ void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
+                                 size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads)
+ {
+     size_t largest_cnt;
+     #pragma omp parallel for num_threads(nthreads) private(largest_cnt) shared(ncols, nrows, ncat, cat_counts, start_ix_cat_counts, min_conditioned_size, skip_col)
+     for (size_t_for col = 0; col < ncols; col++) {
+
+         largest_cnt = 0;
+         for (int cat = 0; cat <= ncat[col]; cat++) {
+             largest_cnt = std::max(largest_cnt, cat_counts[ cat + start_ix_cat_counts[col] ]);
+         }
+         if (largest_cnt > (nrows - min_conditioned_size)) skip_col[col] = true;
+         if (largest_cnt <= 1) skip_col[col] = true;
+
+     }
+ }
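+
+ /* Reasoning: if a single category (or the NA slot) accounts for more than nrows - min_conditioned_size
+    rows, every split on this column would leave the remaining branches below the minimum conditioned
+    size, so the column is marked as unsplittable. */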
+
+ /* Calculate the maximum proportions in a subset below which a category *can* be considered as outlier (must meet other conditions too) */
+ void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
+                                    size_t start_ix_cat_counts[], size_t cat_counts[],
+                                    size_t ncols, size_t nrows, double z_norm, double z_tail)
+ {
+     /* TODO: come up with some way of flagging unseen categories as outliers */
+     long double mean;
+     long double sd;
+     long double nrows_dbl = (long double) nrows;
+     for (size_t col = 0; col < ncols; col++) {
+
+         for (size_t cat = start_ix_cat_counts[col]; cat < (start_ix_cat_counts[col + 1] - 1); cat++) {
+
+             if (cat_counts[cat] > 0) {
+                 mean = (long double) cat_counts[cat] / nrows_dbl;
+                 sd = sqrtl( mean * (1.0 - mean) / nrows_dbl );
+                 prop_small[cat] = fminl(mean - z_norm * sd, mean * 0.5);
+                 prop[cat] = mean;
+             } else {
+                 prop_small[cat] = 0;
+                 prop[cat] = 0;
+             }
+
+         }
+
+     }
+ }
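+
+ /* Note: prop_small is a lower confidence bound on each category's overall proportion under a normal
+    approximation to the binomial (mean - z_norm * sd, further capped at half the observed proportion);
+    per the comment above, a category's proportion within a branch must fall below this bound before it
+    can even be considered as a potential outlier. */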
+
+
+ /* Check if a numerical column has no variance (i.e. will not be splittable).
+    Along the way, also record the number of decimals to display for this column. */
+ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
+                                bool skip_col[], int min_decimals[], int nthreads)
+ {
+     long double running_mean;
+     long double mean_prev;
+     long double running_ssq;
+     size_t cnt;
+     size_t col_stop;
+     double xval;
+     double min_val;
+     double max_val;
+     int min_decimals_col;
+
+     #pragma omp parallel for schedule(static) num_threads(nthreads) \
+             shared(nrows, ncols, numeric_data, has_NA, skip_col, min_decimals) \
+             private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
+     for (size_t_for col = 0; col < ncols; col++) {
+         running_mean = 0;
+         mean_prev = 0;
+         running_ssq = 0;
+         min_val = HUGE_VAL;
+         max_val = -HUGE_VAL;
+         cnt = 0;
+         col_stop = (col + 1) * nrows;
+         for (size_t row = col * nrows; row < col_stop; row++) {
+             xval = numeric_data[row];
+             if (!is_na_or_inf(xval)) {
+                 running_mean += (xval - running_mean) / (long double)(++cnt);
+                 running_ssq += (xval - running_mean) * (xval - mean_prev);
+                 mean_prev = running_mean;
+                 min_val = fmin(min_val, xval);
+                 max_val = fmax(max_val, xval);
+             } else {
+                 has_NA[col] = true;
+             }
+         }
+         if ( (running_ssq / (long double)(cnt - 1)) < 1e-6 ) skip_col[col] = true;
+         if (cnt > 1) {
+             min_decimals_col = 0;
+             min_decimals_col = std::max(min_decimals_col, decimals_diff(running_mean, min_val));
+             min_decimals_col = std::max(min_decimals_col, decimals_diff(running_mean, max_val));
+             min_decimals_col = std::max(min_decimals_col, decimals_diff(0., sqrtl((running_ssq / (long double)(cnt - 1)))));
+             min_decimals[col] = min_decimals_col;
+         }
+     }
+ }
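+
+ /* Note: the running_mean / running_ssq updates above (and in calc_central_mean_and_sd below) are
+    Welford's online algorithm, so running_ssq / (cnt - 1) is the sample variance of the non-missing
+    values seen so far. */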
+
+ /* Calculate mean and standard deviation from the central half of the data, and adjust SD heuristically by x2.5 */
+ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
+ {
+     long double running_mean = 0;
+     long double mean_prev = 0;
+     long double running_ssq = 0;
+     double xval;
+     size_t st_offset = st + size_quarter;
+     if (ix_arr != NULL) {
+         for (size_t row = st_offset; row <= (end - size_quarter); row++) {
+             xval = x[ix_arr[row]];
+             running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
+             running_ssq += (xval - running_mean) * (xval - mean_prev);
+             mean_prev = running_mean;
+         }
+     } else {
+         for (size_t row = st_offset; row <= (end - size_quarter); row++) {
+             xval = x[row];
+             running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
+             running_ssq += (xval - running_mean) * (xval - mean_prev);
+             mean_prev = running_mean;
+         }
+     }
+     *mean_central = (double) running_mean;
+     *sd_central = 2.5 * sqrtl(running_ssq / (long double)(end - st - 2 * size_quarter));
+ }
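+
+ /* Note on the x2.5 factor: the SD is computed from the central half of the sorted values only, which
+    underestimates the full-sample SD (for Gaussian data the interquartile region has an SD of roughly
+    0.38 of the overall SD, i.e. the exact correction would be about x2.6), so multiplying by 2.5
+    approximately undoes the truncation while remaining robust to extreme values in the tails. */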
+
+
+ /* Check whether a numerical column has long tails, and whether a transformation is appropriate
+ *
+ * Will check if there are too many observations with large Z values at either side. If found, will
+ * see if applying a transformation (exponentiation for left tail, logarithm for right tail) would
+ * solve the problem. If not, will report approximate values for where the tails end/start. If it
+ * has tails at both sides, will not process the column.
+ *
+ * Parameters:
+ * - ix_arr[n] (in)
+ *       Indices by which the 'x' variable would be sorted in ascending order. (Must be already sorted!!!)
+ * - st (in)
+ *       Position at which ix_arr starts (inclusive).
+ * - end (in)
+ *       Position at which ix_arr ends (inclusive).
+ * - x[n] (in)
+ *       Column with the original values to check for tails.
+ * - z_norm (in)
+ *       Model parameter. Default is 2.67.
+ * - max_perc_outliers (in)
+ *       Model parameter. Default is 0.01.
+ * - buffer_x[n] (temp)
+ *       Array where to store the transformed values of 'x' (will ignore ix_arr).
+ * - mean (in)
+ *       Mean to use for transforming to Z scores before exponentiating.
+ * - sd (in)
+ *       Standard deviation to use for transforming to Z scores before exponentiating.
+ * - left_tail (out)
+ *       Approximate value at which the tail is considered to end (if found and not solvable by transforming).
+ * - right_tail (out)
+ *       Approximate value at which the tail is considered to start (if found and not solvable by transforming).
+ * - exp_transf (out)
+ *       Whether to apply an exponential transformation (on the Z values!!!) to solve the problem of having a long left tail.
+ * - log_transf (out)
+ *       Whether to apply a log transform to solve the problem of having a long right tail.
+ */
+ void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
+                      double z_norm, double max_perc_outliers,
+                      double *restrict buffer_x, double mean, double sd,
+                      double *restrict left_tail, double *restrict right_tail,
+                      bool *exp_transf, bool *log_transf)
+ {
+     size_t size_quarter = (end - st + 1) / 4;
+     size_t tail_ix;
+     size_t median = 2 * size_quarter;
+     double z_tail = 2 * z_norm;
+     double const_add_log;
+     *left_tail = -HUGE_VAL;
+     *right_tail = HUGE_VAL;
+     size_t max_norm_tail = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
+     double mean_central, sd_central;
+     calc_central_mean_and_sd(ix_arr, st, end, x, size_quarter, &mean_central, &sd_central);
+     *exp_transf = false;
+     *log_transf = false;
+     if ( z_score(x[ix_arr[st + max_norm_tail]], mean_central, sd_central) < (-z_tail) ) *left_tail = 1;
+     if ( z_score(x[ix_arr[end - max_norm_tail]], mean_central, sd_central) > z_tail ) *right_tail = 1;
+
+     /* check for left tail (too many low values) */
+     if (*left_tail == 1) {
+
+         /* check if exponentiation would help */
+         for (size_t row = (st + size_quarter); row <= (end - size_quarter); row++)
+             buffer_x[row] = exp(z_score(x[ix_arr[row]], mean, sd));
+         calc_central_mean_and_sd(NULL, st, end, buffer_x, size_quarter, &mean_central, &sd_central);
+         buffer_x[st + max_norm_tail] = exp(z_score(x[ix_arr[st + max_norm_tail]], mean, sd));
+         if (z_score(buffer_x[st + max_norm_tail], mean_central, sd_central) >= -z_tail)
+         {
+             *left_tail = HUGE_VAL;
+             *exp_transf = true;
+         }
+
+         /* if exponentiation doesn't help, determine where the tail lies on the untransformed data */
+         else {
+
+             *exp_transf = false;
+             for (tail_ix = st; tail_ix <= median; tail_ix++) {
+                 if (z_score(x[ix_arr[tail_ix]], mean_central, sd_central) > (-z_tail)) break;
+             }
+             *left_tail = x[ix_arr[tail_ix]];
+
+         }
+
+     }
+
+     /* check for right tail (too many high values) */
+     if (*right_tail == 1) {
+
+         if (x[ix_arr[st]] == 0) {
+             const_add_log = +1;
+         } else {
+             const_add_log = - x[ix_arr[st]] + 1e-3;
+         }
+
+         /* check if a log transform would help */
+         for (size_t row = (st + size_quarter); row <= (end - size_quarter); row++)
+             buffer_x[row] = log(x[ix_arr[row]] + const_add_log);
+         calc_central_mean_and_sd(NULL, st, end, buffer_x, size_quarter, &mean_central, &sd_central);
+         buffer_x[end - max_norm_tail] = log(x[ix_arr[end - max_norm_tail]] + const_add_log);
+         if (z_score(buffer_x[end - max_norm_tail], mean_central, sd_central) <= z_tail)
+         {
+             *right_tail = HUGE_VAL;
+             *log_transf = true;
+         }
+
+         /* if the log transform doesn't help, determine where the tail lies on the untransformed data */
+         else {
+             for (tail_ix = end; tail_ix >= median; tail_ix--) {
+                 if (z_score(x[ix_arr[tail_ix]], mean_central, sd_central) < z_tail) break;
+             }
+             *right_tail = x[ix_arr[tail_ix]];
+         }
+
+     }
+
+ }
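+
+ /* Note: within this function, *left_tail and *right_tail briefly take the sentinel value 1 to flag
+    that a tail was detected; on return they hold either an infinite value (no problematic tail, or
+    one that the exp/log transformation resolves, as signaled by *exp_transf / *log_transf) or the
+    approximate value at which the tail ends/starts. */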
+
+ /* Move identified outliers for a given column to the beginning of the indices array,
+    and return the position at which the non-outliers start */
+ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t st, size_t end)
+ {
+     size_t st_non_na = st;
+     size_t temp;
+
+     for (size_t i = st; i <= end; i++) {
+         if (outlier_scores[ix_arr[i]] < 1.0) {
+             temp = ix_arr[st_non_na];
+             ix_arr[st_non_na] = ix_arr[i];
+             ix_arr[i] = temp;
+             st_non_na++;
+         }
+     }
+     return st_non_na;
+ }
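+
+ /* Note: a score strictly below 1.0 marks a row that has already been flagged as outlier (scores are
+    initialized to 1.0 in allocate_row_outputs further below), so this partitions ix_arr[st..end] into
+    flagged rows first and the rest after, returning the position where the non-outliers begin. */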
+
+ /* Move missing values of a numeric variable to the front of the indices array and return the position at which non-missing ones start */
+ size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA)
+ {
+     size_t st_non_na = st;
+     size_t temp;
+
+     if (inf_as_NA) {
+         for (size_t i = st; i <= end; i++) {
+             if (is_na_or_inf(x[ix_arr[i]])) {
+                 temp = ix_arr[st_non_na];
+                 ix_arr[st_non_na] = ix_arr[i];
+                 ix_arr[i] = temp;
+                 st_non_na++;
+             }
+         }
+     } else {
+         for (size_t i = st; i <= end; i++) {
+             if (isnan(x[ix_arr[i]])) {
+                 temp = ix_arr[st_non_na];
+                 ix_arr[st_non_na] = ix_arr[i];
+                 ix_arr[i] = temp;
+                 st_non_na++;
+             }
+         }
+     }
+
+     return st_non_na;
+ }
+
+ /* Move missing values of a categorical variable to the front of the indices array and return the position at which non-missing ones start */
+ size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end)
+ {
+     size_t st_non_na = st;
+     size_t temp;
+
+     for (size_t i = st; i <= end; i++) {
+         if (x[ix_arr[i]] < 0) { /* categorical NAs are represented as negative integers */
+             temp = ix_arr[st_non_na];
+             ix_arr[st_non_na] = ix_arr[i];
+             ix_arr[i] = temp;
+             st_non_na++;
+         }
+     }
+     return st_non_na;
+ }
+
+ /* for numerical */
+ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right)
+ {
+     size_t temp;
+
+     if (has_NA) {
+         *split_NA = move_NAs_to_front(ix_arr, x, st, end, false);
+         st = *split_NA;
+     } else { *split_NA = st; }
+     for (size_t row = st; row <= end; row++) {
+
+         /* move to the left if the value falls at or below the split point */
+         if (x[ix_arr[row]] <= split_point) {
+             temp = ix_arr[st];
+             ix_arr[st] = ix_arr[row];
+             ix_arr[row] = temp;
+             st++;
+         }
+     }
+
+     *st_right = st;
+ }
+
+ /* for categorical */
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
+ {
+     size_t temp;
+
+     if (has_NA) {
+         *split_NA = move_NAs_to_front(ix_arr, x, st, end);
+         st = *split_NA;
+     } else { *split_NA = st; }
+     for (size_t row = st; row <= end; row++) {
+
+         /* move to the left if the category is in the left-hand subset */
+         if (subset_categ[ x[ix_arr[row]] ] != 0) {
+             temp = ix_arr[st];
+             ix_arr[st] = ix_arr[row];
+             ix_arr[row] = temp;
+             st++;
+         }
+     }
+
+     *st_right = st;
+ }
+
+ /* for ordinal */
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right)
+ {
+     size_t temp;
+
+     if (has_NA) {
+         *split_NA = move_NAs_to_front(ix_arr, x, st, end);
+         st = *split_NA;
+     } else { *split_NA = st; }
+     for (size_t row = st; row <= end; row++) {
+
+         /* move to the left if the level is at or below the split level */
+         if (x[ix_arr[row]] <= split_lev) {
+             temp = ix_arr[st];
+             ix_arr[st] = ix_arr[row];
+             ix_arr[row] = temp;
+             st++;
+         }
+     }
+
+     *st_right = st;
+ }
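+
+ /* Note: all three divide_subset_split overloads leave ix_arr[st..end] arranged as three consecutive
+    blocks: missing values in [st, *split_NA), the left branch in [*split_NA, *st_right), and the
+    right branch in [*st_right, end]. */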
+
+ /* thread-local memory where intermediate outputs and buffers are stored */
+ bool check_workspace_is_allocated(Workspace &workspace)
+ {
+     return workspace.ix_arr.size() > 0;
+ }
+
+ void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ)
+ {
+     workspace.buffer_transf_y.resize(nrows);
+     workspace.buffer_bin_y.resize(nrows);
+     workspace.ix_arr.resize(nrows);
+     for (size_t i = 0; i < nrows; i++) workspace.ix_arr[i] = i;
+
+     workspace.outlier_scores.resize(nrows);
+     workspace.outlier_clusters.resize(nrows);
+     workspace.outlier_trees.resize(nrows);
+     workspace.outlier_depth.resize(nrows);
+     workspace.buffer_sd.resize(nrows);
+
+     workspace.buffer_cat_sum.resize(max_categ + 1);
+     workspace.buffer_cat_sum_sq.resize(max_categ + 1);
+     workspace.buffer_cat_cnt.resize( (max_categ + 1) * 3);
+     workspace.buffer_cat_sorted.resize(max_categ);
+     workspace.buffer_subset_categ.resize(max_categ);
+
+     workspace.buffer_subset_categ_best.resize(max_categ);
+     workspace.buffer_crosstab.resize(square(max_categ + 1));
+     workspace.buffer_subset_outlier.resize(max_categ);
+ }
+
+ /*
+ * This was a quick way of coding up the option 'follow_all': it basically backs up the modifiable data
+ * that is looked at during a recursion, in a rather suboptimal manner. It can be optimized further by not
+ * copying everything, as it doesn't really need to always copy all variables (same for the restore function
+ * below). For example, at a given point, only one of buffer_subset_categ/this_split_point is used.
+ */
+ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup)
+ {
+     state_backup.gain_restore = workspace.this_gain;
+     state_backup.gain_best_restore = workspace.best_gain;
+     state_backup.split_point_restore = workspace.this_split_point;
+     state_backup.split_lev_restore = workspace.this_split_lev;
+     state_backup.split_subset_restore = workspace.buffer_subset_categ;
+     state_backup.ix1_restore = workspace.st;
+     state_backup.ix2_restore = workspace.this_split_NA;
+     state_backup.ix3_restore = workspace.this_split_ix;
+     state_backup.ix4_restore = workspace.end;
+     state_backup.col_best_restore = workspace.col_best;
+     state_backup.col_type_best_rememer = workspace.column_type_best;
+     state_backup.split_point_best_restore = workspace.split_point_best;
+     state_backup.split_lev_best_restore = workspace.split_lev_best;
+     state_backup.split_subset_best_restore = workspace.buffer_subset_categ_best;
+     state_backup.base_info_restore = workspace.base_info;
+     state_backup.base_info_orig_restore = workspace.base_info_orig;
+     state_backup.sd_y_restore = workspace.sd_y;
+     state_backup.has_outliers_restore = workspace.has_outliers;
+     state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
+     state_backup.temp_ptr_x = workspace.temp_ptr_x;
+ }
+
+ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
+ {
+     workspace.this_gain = state_backup.gain_restore;
+     workspace.best_gain = state_backup.gain_best_restore;
+     workspace.this_split_point = state_backup.split_point_restore;
+     workspace.this_split_lev = state_backup.split_lev_restore;
+     workspace.buffer_subset_categ = state_backup.split_subset_restore;
+     workspace.st = state_backup.ix1_restore;
+     workspace.this_split_NA = state_backup.ix2_restore;
+     workspace.this_split_ix = state_backup.ix3_restore;
+     workspace.end = state_backup.ix4_restore;
+     workspace.col_best = state_backup.col_best_restore;
+     workspace.column_type_best = state_backup.col_type_best_rememer;
+     workspace.split_point_best = state_backup.split_point_best_restore;
+     workspace.split_lev_best = state_backup.split_lev_best_restore;
+     workspace.buffer_subset_categ_best = state_backup.split_subset_best_restore;
+     workspace.base_info = state_backup.base_info_restore;
+     workspace.base_info_orig = state_backup.base_info_orig_restore;
+     workspace.sd_y = state_backup.sd_y_restore;
+     workspace.has_outliers = state_backup.has_outliers_restore;
+     workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
+     workspace.temp_ptr_x = state_backup.temp_ptr_x;
+ }
+
+ /* Next split on the trees is only decided after they are already initialized */
+ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col)
+ {
+     tree.column_type = Numeric;
+     tree.split_point = split_point;
+     tree.col_num = col;
+ }
+
+ void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
+ {
+     tree.column_type = Categorical;
+     tree.col_num = col;
+     tree.split_subset.assign(split_subset, split_subset + ncat);
+     tree.split_subset.shrink_to_fit();
+ }
+
+ void set_tree_as_categorical(ClusterTree &tree, size_t col)
+ {
+     tree.column_type = Categorical;
+     tree.col_num = col;
+     tree.split_subset.resize(2);
+     tree.split_subset[0] = 1;
+     tree.split_subset[1] = 0;
+     tree.split_subset.shrink_to_fit();
+ }
+
+ void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat)
+ {
+     tree.column_type = Categorical;
+     tree.col_num = col;
+     tree.binary_branches.resize(ncat, 0);
+     tree.binary_branches.shrink_to_fit();
+     tree.split_subset.shrink_to_fit();
+ }
+
+ void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col)
+ {
+     tree.column_type = Ordinal;
+     tree.split_lev = split_lev;
+     tree.col_num = col;
+ }
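+
+ /* Note: in the set_tree_as_categorical(tree, col) overload above, split_subset = {1, 0} describes a
+    binary categorical split in which category 0 goes to the left branch and category 1 to the right
+    (consistent with how divide_subset_split reads subset_categ). */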
+
+
+
+ /* After presenting outliers, it's not necessary to retain their details about column/cluster/tree/etc. */
+ void forget_row_outputs(ModelOutputs &model_outputs)
+ {
+     model_outputs.outlier_scores_final.clear();
+     model_outputs.outlier_clusters_final.clear();
+     model_outputs.outlier_columns_final.clear();
+     model_outputs.outlier_trees_final.clear();
+     model_outputs.outlier_depth_final.clear();
+     model_outputs.outlier_decimals_distr.clear();
+
+     model_outputs.outlier_scores_final.shrink_to_fit();
+     model_outputs.outlier_clusters_final.shrink_to_fit();
+     model_outputs.outlier_columns_final.shrink_to_fit();
+     model_outputs.outlier_trees_final.shrink_to_fit();
+     model_outputs.outlier_depth_final.shrink_to_fit();
+     model_outputs.outlier_decimals_distr.shrink_to_fit();
+ }
+
+ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
+ {
+     forget_row_outputs(model_outputs);
+     model_outputs.outlier_scores_final.resize(nrows, 1.0);
+     model_outputs.outlier_clusters_final.resize(nrows, 0);
+     model_outputs.outlier_columns_final.resize(nrows);
+     model_outputs.outlier_trees_final.resize(nrows);
+     model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
+     model_outputs.outlier_decimals_distr.resize(nrows, 0);
+
+     model_outputs.outlier_scores_final.shrink_to_fit();
+     model_outputs.outlier_clusters_final.shrink_to_fit();
+     model_outputs.outlier_columns_final.shrink_to_fit();
+     model_outputs.outlier_trees_final.shrink_to_fit();
+     model_outputs.outlier_depth_final.shrink_to_fit();
+     model_outputs.outlier_decimals_distr.shrink_to_fit();
+ }
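+
+ /* check_more_two_values (below) flags numeric columns that contain two or fewer distinct non-missing
+    values, recording them in 'too_few_values' (presumably so the caller can skip or handle them separately). */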
+
+ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
+ {
+     std::vector<std::unordered_set<double>> seen_values(ncols);
+
+     #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(arr_num, nrows, ncols, too_few_values, seen_values)
+     for (size_t_for col = 0; col < ncols; col++) {
+         for (size_t row = 0; row < nrows; row++) {
+             if (!isnan(arr_num[row + col * nrows]))
+                 seen_values[col].insert(arr_num[row + col * nrows]);
+             if (seen_values[col].size() > 2) break;
+         }
+         if (seen_values[col].size() <= 2) too_few_values[col] = true;
+     }
+ }
+
+ void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads)
+ {
+     if (numeric_data == NULL) return;
+
+     double val_this;
+     double val_comp;
+     int min_decimals;
+     size_t col_this;
+     Cluster *cluster_this;
+     size_t nrows = model_outputs.outlier_columns_final.size();
+
+     #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
+             shared(model_outputs, nrows, numeric_data) \
+             private(val_this, val_comp, min_decimals, col_this, cluster_this)
+     for (size_t_for row = 0; row < nrows; row++) {
+         if (model_outputs.outlier_scores_final[row] < 1.0 &&
+             model_outputs.outlier_columns_final[row] < model_outputs.ncols_numeric
+             ) {
+
+             col_this = model_outputs.outlier_columns_final[row];
+             cluster_this = &model_outputs.all_clusters[col_this][model_outputs.outlier_clusters_final[row]];
+             val_this = numeric_data[row + nrows * col_this];
+             val_comp = cluster_this->display_mean;
+             min_decimals = std::max(0, decimals_diff(val_this, val_comp));
+
+             if (val_this >= cluster_this->upper_lim)
+                 val_comp = cluster_this->display_lim_high;
+             else
+                 val_comp = cluster_this->display_lim_low;
+             min_decimals = std::max(min_decimals, decimals_diff(val_this, val_comp));
+
+             model_outputs.outlier_decimals_distr[row] = min_decimals;
+         }
+     }
+ }
+
+ int decimals_diff(double val1, double val2)
+ {
+     double res = ceil(-log10(fabs(val1 - val2)));
+     if (is_na_or_inf(res)) res = 0.;
+     return (int) res;
+ }
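+
+ /* Example: decimals_diff(1.0001, 1.0) = ceil(-log10(1e-4)) = 4, i.e. four decimal places are needed
+    for the two values to print differently, whereas decimals_diff(105., 100.) = 0 since they already
+    differ in the integer part; NaN/infinite results (e.g. equal inputs) fall back to 0. */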
+
+
+ /* Reason behind this function: Cython (as of v0.29) will not auto-deallocate
+    structs which are part of a cdef'd class, which produces a memory leak
+    but can be force-destructed. Unfortunately, Cython itself doesn't even
+    allow calling destructors for structs, so it has to be done externally.
+    This function should otherwise have no reason to exist.
+ */
+ void dealloc_ModelOutputs(ModelOutputs &model_outputs)
+ {
+     model_outputs.~ModelOutputs();
+ }