outliertree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,706 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research.
10
+ *
11
+ *
12
+ * Copyright 2019 David Cortes.
13
+ *
14
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
15
+ * such as R or Python.
16
+ *
17
+ * This file is part of OutlierTree.
18
+ *
19
+ * OutlierTree is free software: you can redistribute it and/or modify
20
+ * it under the terms of the GNU General Public License as published by
21
+ * the Free Software Foundation, either version 3 of the License, or
22
+ * (at your option) any later version.
23
+ *
24
+ * OutlierTree is distributed in the hope that it will be useful,
25
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27
+ * GNU General Public License for more details.
28
+ *
29
+ * You should have received a copy of the GNU General Public License
30
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
31
+ ********************************************************************************************************************/
32
+ #include "outlier_tree.hpp"
33
+
34
+
35
+ /* Check if new data would be outliers according to previously-fit model
36
+ *
37
+ * Note that the new data must follow the exact same column order, and must also be passsed as arrays
38
+ * order by columns (like Fortran arrays, not like C arrays). For data with < 10,000 rows, it's usually
39
+ * faster to run it single-threaded. The outputs are pased in arrays within the 'ModelOutputs' struct,
40
+ * just like when fitting the model. Outpus for rows from previous calls to this function or to the model-fitting
41
+ * function will be overwriten.
42
+ *
43
+ * Parameters:
44
+ * - numeric_data[n * m1] (in)
45
+ * Array with numerical columns in the data. Must be ordered by columns like Fortran arrays.
46
+ * Missing values should be encoded as NaN. Infinite values in most sections are treated as NaN too.
47
+ * If there are no numerical columns, pass NULL.
48
+ * - categorical_data[n * m2] (in)
49
+ * Array with categorical columns in the data. Must be ordered by columns like Fortran arrays.
50
+ * Negative numbers will be interpreted as missing values. Numeration must start at zero and be
51
+ * contiguous (i.e. if there's category 2, must also have category 1).
52
+ * If there are no categorical columns, pass NULL.
53
+ * - ordinal_data[n * m3] (in)
54
+ * Array with ordinal categorical columns in the data. Must be ordered by columns like Fortran arrays.
55
+ * Same rules as for categorical data. Note that the order will only be taken into consideration when
56
+ * producing splits by these columns, but outliers are still detected in the same way as for categoricals.
57
+ * If there are no ordinal columns, pass NULL.
58
+ * - nrows (in)
59
+ * Number of rows (n) in the arrays passed above.
60
+ * - nthreads (in)
61
+ * Number of parallel threads to use.
62
+ * - model_outputs (in, out)
63
+ * Struct containing the data from the fitted model necessary to make new predictions,
64
+ * and buffer vectors where to store the details of the potential outliers found.
65
+ *
66
+ * Returns:
67
+ * Whether there were any outliers identified in the data passed here. Their details will be inside the
68
+ * 'ModelOutputs' struct.
69
+ */
70
+ bool find_new_outliers(double *restrict numeric_data,
71
+ int *restrict categorical_data,
72
+ int *restrict ordinal_data,
73
+ size_t nrows, int nthreads, ModelOutputs &model_outputs)
74
+ {
75
+ size_t tot_cols = model_outputs.ncols_numeric + model_outputs.ncols_categ + model_outputs.ncols_ord;
76
+ double num_val_this;
77
+ int cat_val_this;
78
+ bool col_is_num;
79
+
80
+ bool found_outliers = false;
81
+ if (nrows < (size_t)nthreads)
82
+ nthreads = (int) nrows;
83
+ #if defined(_OPENMP)
84
+ std::vector<char> outliers_thread(nthreads, false);
85
+ #endif
86
+
87
+ /* reset the output data structures */
88
+ allocate_row_outputs(model_outputs, nrows, model_outputs.max_depth);
89
+
90
+ /* put data into a struct and pass it by reference */
91
+ PredictionData prediction_data = {numeric_data, categorical_data, ordinal_data, nrows};
92
+
93
+ /* Note: if parallelizing by columns instead of by rows, need to switch on the `#pragma omp critical`
94
+ in the block that assigns the cluster to an observation */
95
+
96
+ /* see if any value is an outlier */
97
+ // #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(model_outputs, outliers_thread, nrows, tot_cols, prediction_data) private(col_is_num, num_val_this, cat_val_this)
98
+ for (size_t_for col = 0; col < tot_cols; col++) {
99
+
100
+ if (model_outputs.all_trees[col].size() == 0 || model_outputs.all_clusters[col].size() == 0) continue;
101
+ col_is_num = col < model_outputs.ncols_numeric;
102
+
103
+ /* Note: earlier versions of OpenMP (like v2 released in 2000 and still used by MSVC in 2019) don't support max reduction, hence this code */
104
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(model_outputs, outliers_thread, nrows, prediction_data) \
105
+ firstprivate(col_is_num, col) private(num_val_this, cat_val_this)
106
+ for (size_t_for row = 0; row < nrows; row++) {
107
+
108
+ /* first make a pre-check that the value could be flagged as outlier in some cluster */
109
+ if (col < model_outputs.ncols_numeric) {
110
+
111
+ num_val_this = prediction_data.numeric_data[row + col * nrows];
112
+ if (is_na_or_inf(num_val_this)) continue;
113
+ if ((num_val_this < model_outputs.max_outlier_any_cl[col]) && (num_val_this > model_outputs.min_outlier_any_cl[col])) continue;
114
+
115
+ } else if (col < (model_outputs.ncols_numeric + model_outputs.ncols_categ)) {
116
+
117
+ cat_val_this = prediction_data.categorical_data[row + (col - model_outputs.ncols_numeric) * nrows];
118
+ if (cat_val_this < 0) continue;
119
+ if (cat_val_this >= model_outputs.ncat[col - model_outputs.ncols_numeric]) continue;
120
+ if (!model_outputs.cat_outlier_any_cl[col - model_outputs.ncols_numeric][cat_val_this]) continue;
121
+
122
+ } else {
123
+
124
+ cat_val_this = prediction_data.ordinal_data[row + (col - model_outputs.ncols_numeric - model_outputs.ncols_categ) * nrows];
125
+ if (cat_val_this < 0) continue;
126
+ if (cat_val_this >= model_outputs.ncat_ord[col - model_outputs.ncols_numeric - model_outputs.ncols_categ]) continue;
127
+ if (!model_outputs.cat_outlier_any_cl[col - model_outputs.ncols_numeric][cat_val_this]) continue;
128
+
129
+ }
130
+
131
+ #ifdef _OPENMP
132
+ outliers_thread[omp_get_thread_num()] = follow_tree(model_outputs, prediction_data, 0, 0, row, col, col_is_num, num_val_this, cat_val_this)?
133
+ true : outliers_thread[omp_get_thread_num()];
134
+ #else
135
+ found_outliers = std::max(found_outliers, follow_tree(model_outputs, prediction_data, 0, 0, row, col, col_is_num, num_val_this, cat_val_this));
136
+ #endif
137
+
138
+ }
139
+ }
140
+
141
+ #if defined(_OPENMP)
142
+ for (size_t tid = 0; tid < outliers_thread.size(); tid++) {
143
+ if (outliers_thread[tid] != 0) found_outliers = true;
144
+ }
145
+ #endif
146
+
147
+ if (found_outliers)
148
+ calc_min_decimals_to_print(model_outputs, prediction_data.numeric_data, nthreads);
149
+
150
+ return found_outliers;
151
+ }
152
+
153
+ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, size_t curr_tree, size_t curr_depth,
154
+ size_t_for row, size_t_for col, bool col_is_num, double num_val_this, int cat_val_this)
155
+ {
156
+ bool found_outliers = check_is_outlier_in_tree(model_outputs.all_trees[col][curr_tree].clusters,
157
+ curr_depth, curr_tree, model_outputs, prediction_data,
158
+ row, col, col_is_num, num_val_this, cat_val_this);
159
+
160
+ /* if there's outliers at this level and it's in a non-NA branch, there's no point in continuing
161
+ further down the tree as deeper clusters are not preferred */
162
+ if (
163
+ found_outliers &&
164
+ !model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
165
+ ) return true;
166
+
167
+ /* check if the tree is a dead-end */
168
+ if (
169
+ model_outputs.all_trees[col][curr_tree].tree_NA == 0 &&
170
+ model_outputs.all_trees[col][curr_tree].tree_left == 0 &&
171
+ model_outputs.all_trees[col][curr_tree].tree_right == 0 &&
172
+ (
173
+ model_outputs.all_trees[col][curr_tree].binary_branches.size() == 0 ||
174
+ *std::max_element(
175
+ model_outputs.all_trees[col][curr_tree].binary_branches.begin(),
176
+ model_outputs.all_trees[col][curr_tree].binary_branches.end()
177
+ ) == 0
178
+ ) &&
179
+ (
180
+ model_outputs.all_trees[col][curr_tree].all_branches.size() == 0 ||
181
+ *std::max_element(
182
+ model_outputs.all_trees[col][curr_tree].all_branches.begin(),
183
+ model_outputs.all_trees[col][curr_tree].all_branches.end()
184
+ ) == 0
185
+ )
186
+ ) return false;
187
+
188
+ /* try to follow trees according to the value of the columns they look at */
189
+ double num_val_other;
190
+ int cat_val_other;
191
+
192
+ /* if using 'follow_all', follow on all possible branches */
193
+ if (model_outputs.all_trees[col][curr_tree].all_branches.size() > 0) {
194
+
195
+ for (size_t br : model_outputs.all_trees[col][curr_tree].all_branches) {
196
+ if (br > 0) {
197
+ switch(model_outputs.all_trees[col][br].column_type) {
198
+
199
+ case Numeric:
200
+ {
201
+ num_val_other = prediction_data.numeric_data[row + model_outputs.all_trees[col][br].col_num * prediction_data.nrows];
202
+ switch (model_outputs.all_trees[col][br].split_this_branch) {
203
+ case IsNa:
204
+ {
205
+ if (isnan(num_val_other))
206
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
207
+ row, col, col_is_num, num_val_this, cat_val_this)?
208
+ true : found_outliers;
209
+ break;
210
+ }
211
+
212
+ case LessOrEqual:
213
+ {
214
+ if (!isnan(num_val_other) && num_val_other <= model_outputs.all_trees[col][br].split_point)
215
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
216
+ row, col, col_is_num, num_val_this, cat_val_this)?
217
+ true : found_outliers;
218
+ break;
219
+ }
220
+
221
+ case Greater:
222
+ {
223
+ if (!isnan(num_val_other) && num_val_other > model_outputs.all_trees[col][br].split_point)
224
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
225
+ row, col, col_is_num, num_val_this, cat_val_this)?
226
+ true : found_outliers;
227
+ break;
228
+ }
229
+ }
230
+ break;
231
+ }
232
+
233
+ case Categorical:
234
+ {
235
+ cat_val_other = prediction_data.categorical_data[row + model_outputs.all_trees[col][br].col_num * prediction_data.nrows];
236
+ if (cat_val_other >= model_outputs.ncat[model_outputs.all_trees[col][br].col_num]) continue;
237
+ switch (model_outputs.all_trees[col][br].split_this_branch) {
238
+ case IsNa:
239
+ {
240
+ if (cat_val_other < 0)
241
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
242
+ row, col, col_is_num, num_val_this, cat_val_this)?
243
+ true : found_outliers;
244
+ break;
245
+ }
246
+
247
+ case InSubset:
248
+ {
249
+ if (cat_val_other >= 0 && model_outputs.all_trees[col][br].split_subset[cat_val_other] == 1)
250
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
251
+ row, col, col_is_num, num_val_this, cat_val_this)?
252
+ true : found_outliers;
253
+ break;
254
+ }
255
+
256
+ case NotInSubset:
257
+ {
258
+ if (cat_val_other >= 0 && model_outputs.all_trees[col][br].split_subset[cat_val_other] == 0)
259
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
260
+ row, col, col_is_num, num_val_this, cat_val_this)?
261
+ true : found_outliers;
262
+ break;
263
+ }
264
+
265
+ case Equal:
266
+ {
267
+ if (cat_val_other == model_outputs.all_trees[col][br].split_lev)
268
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
269
+ row, col, col_is_num, num_val_this, cat_val_this)?
270
+ true : found_outliers;
271
+ break;
272
+ }
273
+
274
+ case NotEqual:
275
+ {
276
+ if (cat_val_other >= 0 && cat_val_other != model_outputs.all_trees[col][br].split_lev)
277
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
278
+ row, col, col_is_num, num_val_this, cat_val_this)?
279
+ true : found_outliers;
280
+ break;
281
+ }
282
+ }
283
+ break;
284
+ }
285
+
286
+ case Ordinal:
287
+ {
288
+ cat_val_other = prediction_data.ordinal_data[row + model_outputs.all_trees[col][br].col_num * prediction_data.nrows];
289
+ if (cat_val_other >= model_outputs.ncat_ord[model_outputs.all_trees[col][br].col_num]) continue;
290
+ switch (model_outputs.all_trees[col][br].split_this_branch) {
291
+ case IsNa:
292
+ {
293
+ if (cat_val_other < 0)
294
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
295
+ row, col, col_is_num, num_val_this, cat_val_this)?
296
+ true : found_outliers;
297
+ break;
298
+ }
299
+
300
+ case LessOrEqual:
301
+ {
302
+ if (cat_val_other >= 0 && cat_val_other <= model_outputs.all_trees[col][br].split_lev)
303
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
304
+ row, col, col_is_num, num_val_this, cat_val_this)?
305
+ true : found_outliers;
306
+ break;
307
+ }
308
+
309
+ case Greater:
310
+ {
311
+ if (cat_val_other > model_outputs.all_trees[col][br].split_lev)
312
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
313
+ row, col, col_is_num, num_val_this, cat_val_this)?
314
+ true : found_outliers;
315
+ break;
316
+ }
317
+
318
+ case Equal:
319
+ {
320
+ if (cat_val_other == model_outputs.all_trees[col][br].split_lev)
321
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
322
+ row, col, col_is_num, num_val_this, cat_val_this)?
323
+ true : found_outliers;
324
+ break;
325
+ }
326
+
327
+ case NotEqual:
328
+ {
329
+ if (cat_val_other >= 0 && cat_val_other != model_outputs.all_trees[col][br].split_lev)
330
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
331
+ row, col, col_is_num, num_val_this, cat_val_this)?
332
+ true : found_outliers;
333
+ break;
334
+ }
335
+ }
336
+ break;
337
+ }
338
+
339
+ }
340
+ }
341
+ }
342
+ return found_outliers;
343
+ }
344
+
345
+ /* regular case (not using 'follow_all') - follow the corresponding branch */
346
+ switch(model_outputs.all_trees[col][curr_tree].column_type) {
347
+
348
+ case NoType:
349
+ {
350
+ if (model_outputs.all_trees[col][curr_tree].binary_branches.size() > 0) {
351
+ for (size_t tree_follow : model_outputs.all_trees[col][curr_tree].binary_branches) {
352
+ if (tree_follow > 0)
353
+ found_outliers = follow_tree(model_outputs, prediction_data, tree_follow, curr_depth,
354
+ row, col, col_is_num, num_val_this, cat_val_this)?
355
+ true : found_outliers;
356
+ }
357
+ return found_outliers;
358
+ }
359
+ break;
360
+ }
361
+
362
+ case Numeric:
363
+ {
364
+ num_val_other = prediction_data.numeric_data[row + model_outputs.all_trees[col][curr_tree].col_num * prediction_data.nrows];
365
+ if (isnan(num_val_other)) {
366
+
367
+ if (model_outputs.all_trees[col][curr_tree].tree_NA > 0)
368
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_NA, curr_depth + 1,
369
+ row, col, col_is_num, num_val_this, cat_val_this);
370
+
371
+ } else if (num_val_other <= model_outputs.all_trees[col][curr_tree].split_point) {
372
+
373
+ if (model_outputs.all_trees[col][curr_tree].tree_left > 0)
374
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_left, curr_depth + 1,
375
+ row, col, col_is_num, num_val_this, cat_val_this);
376
+
377
+ } else {
378
+
379
+ if (model_outputs.all_trees[col][curr_tree].tree_right > 0)
380
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_right, curr_depth + 1,
381
+ row, col, col_is_num, num_val_this, cat_val_this);
382
+
383
+ }
384
+ break;
385
+ }
386
+
387
+ case Categorical:
388
+ {
389
+ cat_val_other = prediction_data.categorical_data[row + model_outputs.all_trees[col][curr_tree].col_num * prediction_data.nrows];
390
+ if (cat_val_other >= model_outputs.ncat[model_outputs.all_trees[col][curr_tree].col_num]) return false;
391
+ if (cat_val_other < 0) {
392
+ if (model_outputs.all_trees[col][curr_tree].tree_NA > 0)
393
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_NA, curr_depth + 1,
394
+ row, col, col_is_num, num_val_this, cat_val_this);
395
+ else return false;
396
+ }
397
+
398
+
399
+ if (model_outputs.all_trees[col][curr_tree].binary_branches.size() > 0) {
400
+
401
+ if (curr_tree == 0 && model_outputs.all_trees[col][curr_tree].column_type == NoType) {
402
+ /* binarized branches in the main tree */
403
+ for (size_t tree_follow : model_outputs.all_trees[col][curr_tree].binary_branches) {
404
+
405
+ if (tree_follow > 0)
406
+ found_outliers = follow_tree(model_outputs, prediction_data, tree_follow, curr_depth,
407
+ row, col, col_is_num, num_val_this, cat_val_this)?
408
+ true : found_outliers;
409
+ }
410
+ return found_outliers;
411
+
412
+ } else {
413
+
414
+ /* single-category branch in a categorical-by-categorical split */
415
+ if (model_outputs.all_trees[col][curr_tree].binary_branches[cat_val_other] > 0) {
416
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].binary_branches[cat_val_other], curr_depth + 1,
417
+ row, col, col_is_num, num_val_this, cat_val_this);
418
+ }
419
+ }
420
+
421
+ }
422
+
423
+ else if (model_outputs.all_trees[col][curr_tree].split_lev != INT_MAX) {
424
+
425
+ if (model_outputs.all_trees[col][curr_tree].split_lev == cat_val_other) {
426
+
427
+ if (model_outputs.all_trees[col][curr_tree].tree_left > 0)
428
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_left, curr_depth + 1,
429
+ row, col, col_is_num, num_val_this, cat_val_this);
430
+
431
+ } else {
432
+
433
+ if (model_outputs.all_trees[col][curr_tree].tree_right > 0)
434
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_right, curr_depth + 1,
435
+ row, col, col_is_num, num_val_this, cat_val_this);
436
+
437
+ }
438
+
439
+ }
440
+
441
+ else {
442
+
443
+ if (model_outputs.all_trees[col][curr_tree].split_subset[cat_val_other] == 1) {
444
+
445
+ if (model_outputs.all_trees[col][curr_tree].tree_left > 0)
446
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_left, curr_depth + 1,
447
+ row, col, col_is_num, num_val_this, cat_val_this);
448
+
449
+ } else if (model_outputs.all_trees[col][curr_tree].split_subset[cat_val_other] == 0) {
450
+
451
+ if (model_outputs.all_trees[col][curr_tree].tree_right > 0)
452
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_right, curr_depth + 1,
453
+ row, col, col_is_num, num_val_this, cat_val_this);
454
+
455
+ }
456
+
457
+ }
458
+ break;
459
+ }
460
+
461
+ case Ordinal:
462
+ {
463
+ cat_val_other = prediction_data.ordinal_data[row + model_outputs.all_trees[col][curr_tree].col_num * prediction_data.nrows];
464
+ if (cat_val_other >= model_outputs.ncat_ord[model_outputs.all_trees[col][curr_tree].col_num]) return false;
465
+ if (cat_val_other < 0) {
466
+
467
+ if (model_outputs.all_trees[col][curr_tree].tree_NA > 0)
468
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_NA, curr_depth + 1,
469
+ row, col, col_is_num, num_val_this, cat_val_this);
470
+
471
+ } else if (cat_val_other <= model_outputs.all_trees[col][curr_tree].split_lev) {
472
+
473
+ if (model_outputs.all_trees[col][curr_tree].tree_left > 0)
474
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_left, curr_depth + 1,
475
+ row, col, col_is_num, num_val_this, cat_val_this);
476
+
477
+ } else {
478
+
479
+ if (model_outputs.all_trees[col][curr_tree].tree_right > 0)
480
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_right, curr_depth + 1,
481
+ row, col, col_is_num, num_val_this, cat_val_this);
482
+ }
483
+ break;
484
+ }
485
+
486
+ }
487
+
488
+ return false;
489
+ }
490
+
491
+ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr_depth, size_t curr_tree,
492
+ ModelOutputs &model_outputs, PredictionData &prediction_data, size_t_for row, size_t_for col,
493
+ bool col_is_num, double num_val_this, int cat_val_this)
494
+ {
495
+
496
+
497
+ bool tree_has_outliers = false;
498
+ bool flag_this_cluster;
499
+ double outlier_score;
500
+ size_t cluster_size;
501
+ size_t cluster_depth;
502
+ double num_val_other;
503
+ int cat_val_other;
504
+
505
+ if (clusters_in_tree.size() > 0) {
506
+
507
+ /* see if it would be an outlier under any of the clusters from this tree */
508
+ for (const size_t cl : clusters_in_tree) {
509
+
510
+ if (col_is_num) {
511
+ if (
512
+ num_val_this > model_outputs.all_clusters[col][cl].lower_lim &&
513
+ num_val_this < model_outputs.all_clusters[col][cl].upper_lim
514
+ ) continue;
515
+ } else {
516
+ if (model_outputs.all_clusters[col][cl].subset_common[cat_val_this] == 0)
517
+ continue;
518
+ }
519
+
520
+ /* if so, then check if it actually belongs into the cluster */
521
+ flag_this_cluster = false;
522
+ switch(model_outputs.all_clusters[col][cl].column_type) {
523
+
524
+ case NoType:
525
+ {
526
+ flag_this_cluster = true;
527
+ break;
528
+ }
529
+
530
+ case Numeric:
531
+ {
532
+ num_val_other = prediction_data.numeric_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
533
+ switch(model_outputs.all_clusters[col][cl].split_type) {
534
+ case IsNa:
535
+ {
536
+ if (isnan(num_val_other)) flag_this_cluster = true;
537
+ break;
538
+ }
539
+
540
+ case LessOrEqual:
541
+ {
542
+ if (!isnan(num_val_other) && num_val_other <= model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
543
+ break;
544
+ }
545
+
546
+ case Greater:
547
+ {
548
+ if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
549
+ break;
550
+ }
551
+ }
552
+ break;
553
+ }
554
+
555
+ case Categorical:
556
+ {
557
+ cat_val_other = prediction_data.categorical_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
558
+ if (cat_val_other >= model_outputs.ncat[model_outputs.all_clusters[col][cl].col_num]) continue;
559
+ switch(model_outputs.all_clusters[col][cl].split_type) {
560
+ case IsNa:
561
+ {
562
+ if (cat_val_other < 0) flag_this_cluster = true;
563
+ break;
564
+ }
565
+
566
+ case InSubset:
567
+ {
568
+ if (cat_val_other >=0 && model_outputs.all_clusters[col][cl].split_subset[cat_val_other] == 1) flag_this_cluster = true;
569
+ break;
570
+ }
571
+
572
+ case NotInSubset:
573
+ {
574
+ if (cat_val_other >=0 && model_outputs.all_clusters[col][cl].split_subset[cat_val_other] == 0) flag_this_cluster = true;
575
+ break;
576
+ }
577
+
578
+ case Equal:
579
+ {
580
+ if (cat_val_other == model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
581
+ break;
582
+ }
583
+
584
+ case NotEqual:
585
+ {
586
+ if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
587
+ break;
588
+ }
589
+
590
+ /* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
591
+ }
592
+ break;
593
+ }
594
+
595
+ case Ordinal:
596
+ {
597
+ cat_val_other = prediction_data.ordinal_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
598
+ if (cat_val_other >= model_outputs.ncat_ord[model_outputs.all_clusters[col][cl].col_num]) continue;
599
+ switch(model_outputs.all_clusters[col][cl].split_type) {
600
+ case IsNa:
601
+ {
602
+ if (cat_val_other < 0) flag_this_cluster = true;
603
+ break;
604
+ }
605
+
606
+ case LessOrEqual:
607
+ {
608
+ if (cat_val_other >=0 && cat_val_other <= model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
609
+ break;
610
+ }
611
+
612
+ case Greater:
613
+ {
614
+ if (cat_val_other >=0 && cat_val_other > model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
615
+ break;
616
+ }
617
+
618
+ case Equal:
619
+ {
620
+ if (cat_val_other == model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
621
+ break;
622
+ }
623
+
624
+ case NotEqual:
625
+ {
626
+ if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
627
+ break;
628
+ }
629
+ }
630
+ break;
631
+ }
632
+ }
633
+ if (flag_this_cluster) {
634
+
635
+ tree_has_outliers = true;
636
+ cluster_size = model_outputs.all_clusters[col][cl].cluster_size;
637
+ cluster_depth = curr_depth + ((model_outputs.all_clusters[col][cl].column_type == NoType)? 0 : 1);
638
+ if (col_is_num) {
639
+ outlier_score = chebyshyov_bound(z_score(
640
+ (model_outputs.col_transf[col] == NoTransf)? num_val_this :
641
+ (model_outputs.col_transf[col] == Log)?
642
+ log(num_val_this - model_outputs.transf_offset[col]) :
643
+ exp( (num_val_this - model_outputs.transf_offset[col]) / model_outputs.sd_div[col] ),
644
+ model_outputs.all_clusters[col][cl].cluster_mean,
645
+ model_outputs.all_clusters[col][cl].cluster_sd
646
+ )
647
+ );
648
+ } else {
649
+ outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
650
+ }
651
+
652
+ /* if this is the best cluster so far, remember it */
653
+ /* Note: if parallelizing by columns, must turn this into a critical section as the previously-assigned column can change in the meantime */
654
+ // #pragma omp critical
655
+ if (
656
+ model_outputs.outlier_scores_final[row] >= 1.0 ||
657
+ (
658
+ cluster_depth < model_outputs.outlier_depth_final[row] &&
659
+ (
660
+ model_outputs.all_clusters[col][cl].has_NA_branch
661
+ ==
662
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
663
+ )
664
+ ) ||
665
+ (
666
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
667
+ !model_outputs.all_clusters[col][cl].has_NA_branch
668
+ ) ||
669
+ (
670
+ cluster_depth == model_outputs.outlier_depth_final[row] &&
671
+ (
672
+ model_outputs.all_clusters[col][cl].has_NA_branch
673
+ ==
674
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
675
+ ) &&
676
+ cluster_size > model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
677
+ ) ||
678
+ (
679
+ cluster_depth == model_outputs.outlier_depth_final[row] &&
680
+ cluster_size == model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size &&
681
+ (
682
+ model_outputs.all_clusters[col][cl].has_NA_branch
683
+ ==
684
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
685
+ ) &&
686
+ outlier_score < model_outputs.outlier_scores_final[row]
687
+ )
688
+ )
689
+ {
690
+ model_outputs.outlier_columns_final[row] = col;
691
+ model_outputs.outlier_scores_final[row] = outlier_score;
692
+ model_outputs.outlier_clusters_final[row] = cl;
693
+ model_outputs.outlier_trees_final[row] = curr_tree;
694
+ model_outputs.outlier_depth_final[row] = cluster_depth;
695
+ }
696
+
697
+
698
+ }
699
+
700
+ }
701
+
702
+ }
703
+
704
+ return tree_has_outliers;
705
+
706
+ }