outliertree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,706 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research.
10
+ *
11
+ *
12
+ * Copyright 2019 David Cortes.
13
+ *
14
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
15
+ * such as R or Python.
16
+ *
17
+ * This file is part of OutlierTree.
18
+ *
19
+ * OutlierTree is free software: you can redistribute it and/or modify
20
+ * it under the terms of the GNU General Public License as published by
21
+ * the Free Software Foundation, either version 3 of the License, or
22
+ * (at your option) any later version.
23
+ *
24
+ * OutlierTree is distributed in the hope that it will be useful,
25
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
26
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27
+ * GNU General Public License for more details.
28
+ *
29
+ * You should have received a copy of the GNU General Public License
30
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
31
+ ********************************************************************************************************************/
32
+ #include "outlier_tree.hpp"
33
+
34
+
35
+ /* Check if new data would be outliers according to previously-fit model
36
+ *
37
+ * Note that the new data must follow the exact same column order, and must also be passsed as arrays
38
+ * order by columns (like Fortran arrays, not like C arrays). For data with < 10,000 rows, it's usually
39
+ * faster to run it single-threaded. The outputs are pased in arrays within the 'ModelOutputs' struct,
40
+ * just like when fitting the model. Outpus for rows from previous calls to this function or to the model-fitting
41
+ * function will be overwriten.
42
+ *
43
+ * Parameters:
44
+ * - numeric_data[n * m1] (in)
45
+ * Array with numerical columns in the data. Must be ordered by columns like Fortran arrays.
46
+ * Missing values should be encoded as NaN. Infinite values in most sections are treated as NaN too.
47
+ * If there are no numerical columns, pass NULL.
48
+ * - categorical_data[n * m2] (in)
49
+ * Array with categorical columns in the data. Must be ordered by columns like Fortran arrays.
50
+ * Negative numbers will be interpreted as missing values. Numeration must start at zero and be
51
+ * contiguous (i.e. if there's category 2, must also have category 1).
52
+ * If there are no categorical columns, pass NULL.
53
+ * - ordinal_data[n * m3] (in)
54
+ * Array with ordinal categorical columns in the data. Must be ordered by columns like Fortran arrays.
55
+ * Same rules as for categorical data. Note that the order will only be taken into consideration when
56
+ * producing splits by these columns, but outliers are still detected in the same way as for categoricals.
57
+ * If there are no ordinal columns, pass NULL.
58
+ * - nrows (in)
59
+ * Number of rows (n) in the arrays passed above.
60
+ * - nthreads (in)
61
+ * Number of parallel threads to use.
62
+ * - model_outputs (in, out)
63
+ * Struct containing the data from the fitted model necessary to make new predictions,
64
+ * and buffer vectors where to store the details of the potential outliers found.
65
+ *
66
+ * Returns:
67
+ * Whether there were any outliers identified in the data passed here. Their details will be inside the
68
+ * 'ModelOutputs' struct.
69
+ */
70
+ bool find_new_outliers(double *restrict numeric_data,
71
+ int *restrict categorical_data,
72
+ int *restrict ordinal_data,
73
+ size_t nrows, int nthreads, ModelOutputs &model_outputs)
74
+ {
75
+ size_t tot_cols = model_outputs.ncols_numeric + model_outputs.ncols_categ + model_outputs.ncols_ord;
76
+ double num_val_this;
77
+ int cat_val_this;
78
+ bool col_is_num;
79
+
80
+ bool found_outliers = false;
81
+ if (nrows < (size_t)nthreads)
82
+ nthreads = (int) nrows;
83
+ #if defined(_OPENMP)
84
+ std::vector<char> outliers_thread(nthreads, false);
85
+ #endif
86
+
87
+ /* reset the output data structures */
88
+ allocate_row_outputs(model_outputs, nrows, model_outputs.max_depth);
89
+
90
+ /* put data into a struct and pass it by reference */
91
+ PredictionData prediction_data = {numeric_data, categorical_data, ordinal_data, nrows};
92
+
93
+ /* Note: if parallelizing by columns instead of by rows, need to switch on the `#pragma omp critical`
94
+ in the block that assigns the cluster to an observation */
95
+
96
+ /* see if any value is an outlier */
97
+ // #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(model_outputs, outliers_thread, nrows, tot_cols, prediction_data) private(col_is_num, num_val_this, cat_val_this)
98
+ for (size_t_for col = 0; col < tot_cols; col++) {
99
+
100
+ if (model_outputs.all_trees[col].size() == 0 || model_outputs.all_clusters[col].size() == 0) continue;
101
+ col_is_num = col < model_outputs.ncols_numeric;
102
+
103
+ /* Note: earlier versions of OpenMP (like v2 released in 2000 and still used by MSVC in 2019) don't support max reduction, hence this code */
104
+ #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(model_outputs, outliers_thread, nrows, prediction_data) \
105
+ firstprivate(col_is_num, col) private(num_val_this, cat_val_this)
106
+ for (size_t_for row = 0; row < nrows; row++) {
107
+
108
+ /* first make a pre-check that the value could be flagged as outlier in some cluster */
109
+ if (col < model_outputs.ncols_numeric) {
110
+
111
+ num_val_this = prediction_data.numeric_data[row + col * nrows];
112
+ if (is_na_or_inf(num_val_this)) continue;
113
+ if ((num_val_this < model_outputs.max_outlier_any_cl[col]) && (num_val_this > model_outputs.min_outlier_any_cl[col])) continue;
114
+
115
+ } else if (col < (model_outputs.ncols_numeric + model_outputs.ncols_categ)) {
116
+
117
+ cat_val_this = prediction_data.categorical_data[row + (col - model_outputs.ncols_numeric) * nrows];
118
+ if (cat_val_this < 0) continue;
119
+ if (cat_val_this >= model_outputs.ncat[col - model_outputs.ncols_numeric]) continue;
120
+ if (!model_outputs.cat_outlier_any_cl[col - model_outputs.ncols_numeric][cat_val_this]) continue;
121
+
122
+ } else {
123
+
124
+ cat_val_this = prediction_data.ordinal_data[row + (col - model_outputs.ncols_numeric - model_outputs.ncols_categ) * nrows];
125
+ if (cat_val_this < 0) continue;
126
+ if (cat_val_this >= model_outputs.ncat_ord[col - model_outputs.ncols_numeric - model_outputs.ncols_categ]) continue;
127
+ if (!model_outputs.cat_outlier_any_cl[col - model_outputs.ncols_numeric][cat_val_this]) continue;
128
+
129
+ }
130
+
131
+ #ifdef _OPENMP
132
+ outliers_thread[omp_get_thread_num()] = follow_tree(model_outputs, prediction_data, 0, 0, row, col, col_is_num, num_val_this, cat_val_this)?
133
+ true : outliers_thread[omp_get_thread_num()];
134
+ #else
135
+ found_outliers = std::max(found_outliers, follow_tree(model_outputs, prediction_data, 0, 0, row, col, col_is_num, num_val_this, cat_val_this));
136
+ #endif
137
+
138
+ }
139
+ }
140
+
141
+ #if defined(_OPENMP)
142
+ for (size_t tid = 0; tid < outliers_thread.size(); tid++) {
143
+ if (outliers_thread[tid] != 0) found_outliers = true;
144
+ }
145
+ #endif
146
+
147
+ if (found_outliers)
148
+ calc_min_decimals_to_print(model_outputs, prediction_data.numeric_data, nthreads);
149
+
150
+ return found_outliers;
151
+ }
152
+
153
+ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, size_t curr_tree, size_t curr_depth,
154
+ size_t_for row, size_t_for col, bool col_is_num, double num_val_this, int cat_val_this)
155
+ {
156
+ bool found_outliers = check_is_outlier_in_tree(model_outputs.all_trees[col][curr_tree].clusters,
157
+ curr_depth, curr_tree, model_outputs, prediction_data,
158
+ row, col, col_is_num, num_val_this, cat_val_this);
159
+
160
+ /* if there's outliers at this level and it's in a non-NA branch, there's no point in continuing
161
+ further down the tree as deeper clusters are not preferred */
162
+ if (
163
+ found_outliers &&
164
+ !model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
165
+ ) return true;
166
+
167
+ /* check if the tree is a dead-end */
168
+ if (
169
+ model_outputs.all_trees[col][curr_tree].tree_NA == 0 &&
170
+ model_outputs.all_trees[col][curr_tree].tree_left == 0 &&
171
+ model_outputs.all_trees[col][curr_tree].tree_right == 0 &&
172
+ (
173
+ model_outputs.all_trees[col][curr_tree].binary_branches.size() == 0 ||
174
+ *std::max_element(
175
+ model_outputs.all_trees[col][curr_tree].binary_branches.begin(),
176
+ model_outputs.all_trees[col][curr_tree].binary_branches.end()
177
+ ) == 0
178
+ ) &&
179
+ (
180
+ model_outputs.all_trees[col][curr_tree].all_branches.size() == 0 ||
181
+ *std::max_element(
182
+ model_outputs.all_trees[col][curr_tree].all_branches.begin(),
183
+ model_outputs.all_trees[col][curr_tree].all_branches.end()
184
+ ) == 0
185
+ )
186
+ ) return false;
187
+
188
+ /* try to follow trees according to the value of the columns they look at */
189
+ double num_val_other;
190
+ int cat_val_other;
191
+
192
+ /* if using 'follow_all', follow on all possible branches */
193
+ if (model_outputs.all_trees[col][curr_tree].all_branches.size() > 0) {
194
+
195
+ for (size_t br : model_outputs.all_trees[col][curr_tree].all_branches) {
196
+ if (br > 0) {
197
+ switch(model_outputs.all_trees[col][br].column_type) {
198
+
199
+ case Numeric:
200
+ {
201
+ num_val_other = prediction_data.numeric_data[row + model_outputs.all_trees[col][br].col_num * prediction_data.nrows];
202
+ switch (model_outputs.all_trees[col][br].split_this_branch) {
203
+ case IsNa:
204
+ {
205
+ if (isnan(num_val_other))
206
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
207
+ row, col, col_is_num, num_val_this, cat_val_this)?
208
+ true : found_outliers;
209
+ break;
210
+ }
211
+
212
+ case LessOrEqual:
213
+ {
214
+ if (!isnan(num_val_other) && num_val_other <= model_outputs.all_trees[col][br].split_point)
215
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
216
+ row, col, col_is_num, num_val_this, cat_val_this)?
217
+ true : found_outliers;
218
+ break;
219
+ }
220
+
221
+ case Greater:
222
+ {
223
+ if (!isnan(num_val_other) && num_val_other > model_outputs.all_trees[col][br].split_point)
224
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
225
+ row, col, col_is_num, num_val_this, cat_val_this)?
226
+ true : found_outliers;
227
+ break;
228
+ }
229
+ }
230
+ break;
231
+ }
232
+
233
+ case Categorical:
234
+ {
235
+ cat_val_other = prediction_data.categorical_data[row + model_outputs.all_trees[col][br].col_num * prediction_data.nrows];
236
+ if (cat_val_other >= model_outputs.ncat[model_outputs.all_trees[col][br].col_num]) continue;
237
+ switch (model_outputs.all_trees[col][br].split_this_branch) {
238
+ case IsNa:
239
+ {
240
+ if (cat_val_other < 0)
241
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
242
+ row, col, col_is_num, num_val_this, cat_val_this)?
243
+ true : found_outliers;
244
+ break;
245
+ }
246
+
247
+ case InSubset:
248
+ {
249
+ if (cat_val_other >= 0 && model_outputs.all_trees[col][br].split_subset[cat_val_other] == 1)
250
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
251
+ row, col, col_is_num, num_val_this, cat_val_this)?
252
+ true : found_outliers;
253
+ break;
254
+ }
255
+
256
+ case NotInSubset:
257
+ {
258
+ if (cat_val_other >= 0 && model_outputs.all_trees[col][br].split_subset[cat_val_other] == 0)
259
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
260
+ row, col, col_is_num, num_val_this, cat_val_this)?
261
+ true : found_outliers;
262
+ break;
263
+ }
264
+
265
+ case Equal:
266
+ {
267
+ if (cat_val_other == model_outputs.all_trees[col][br].split_lev)
268
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
269
+ row, col, col_is_num, num_val_this, cat_val_this)?
270
+ true : found_outliers;
271
+ break;
272
+ }
273
+
274
+ case NotEqual:
275
+ {
276
+ if (cat_val_other >= 0 && cat_val_other != model_outputs.all_trees[col][br].split_lev)
277
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
278
+ row, col, col_is_num, num_val_this, cat_val_this)?
279
+ true : found_outliers;
280
+ break;
281
+ }
282
+ }
283
+ break;
284
+ }
285
+
286
+ case Ordinal:
287
+ {
288
+ cat_val_other = prediction_data.ordinal_data[row + model_outputs.all_trees[col][br].col_num * prediction_data.nrows];
289
+ if (cat_val_other >= model_outputs.ncat_ord[model_outputs.all_trees[col][br].col_num]) continue;
290
+ switch (model_outputs.all_trees[col][br].split_this_branch) {
291
+ case IsNa:
292
+ {
293
+ if (cat_val_other < 0)
294
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
295
+ row, col, col_is_num, num_val_this, cat_val_this)?
296
+ true : found_outliers;
297
+ break;
298
+ }
299
+
300
+ case LessOrEqual:
301
+ {
302
+ if (cat_val_other >= 0 && cat_val_other <= model_outputs.all_trees[col][br].split_lev)
303
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
304
+ row, col, col_is_num, num_val_this, cat_val_this)?
305
+ true : found_outliers;
306
+ break;
307
+ }
308
+
309
+ case Greater:
310
+ {
311
+ if (cat_val_other > model_outputs.all_trees[col][br].split_lev)
312
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
313
+ row, col, col_is_num, num_val_this, cat_val_this)?
314
+ true : found_outliers;
315
+ break;
316
+ }
317
+
318
+ case Equal:
319
+ {
320
+ if (cat_val_other == model_outputs.all_trees[col][br].split_lev)
321
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
322
+ row, col, col_is_num, num_val_this, cat_val_this)?
323
+ true : found_outliers;
324
+ break;
325
+ }
326
+
327
+ case NotEqual:
328
+ {
329
+ if (cat_val_other >= 0 && cat_val_other != model_outputs.all_trees[col][br].split_lev)
330
+ found_outliers = follow_tree(model_outputs, prediction_data, br, curr_depth + 1,
331
+ row, col, col_is_num, num_val_this, cat_val_this)?
332
+ true : found_outliers;
333
+ break;
334
+ }
335
+ }
336
+ break;
337
+ }
338
+
339
+ }
340
+ }
341
+ }
342
+ return found_outliers;
343
+ }
344
+
345
+ /* regular case (not using 'follow_all') - follow the corresponding branch */
346
+ switch(model_outputs.all_trees[col][curr_tree].column_type) {
347
+
348
+ case NoType:
349
+ {
350
+ if (model_outputs.all_trees[col][curr_tree].binary_branches.size() > 0) {
351
+ for (size_t tree_follow : model_outputs.all_trees[col][curr_tree].binary_branches) {
352
+ if (tree_follow > 0)
353
+ found_outliers = follow_tree(model_outputs, prediction_data, tree_follow, curr_depth,
354
+ row, col, col_is_num, num_val_this, cat_val_this)?
355
+ true : found_outliers;
356
+ }
357
+ return found_outliers;
358
+ }
359
+ break;
360
+ }
361
+
362
+ case Numeric:
363
+ {
364
+ num_val_other = prediction_data.numeric_data[row + model_outputs.all_trees[col][curr_tree].col_num * prediction_data.nrows];
365
+ if (isnan(num_val_other)) {
366
+
367
+ if (model_outputs.all_trees[col][curr_tree].tree_NA > 0)
368
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_NA, curr_depth + 1,
369
+ row, col, col_is_num, num_val_this, cat_val_this);
370
+
371
+ } else if (num_val_other <= model_outputs.all_trees[col][curr_tree].split_point) {
372
+
373
+ if (model_outputs.all_trees[col][curr_tree].tree_left > 0)
374
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_left, curr_depth + 1,
375
+ row, col, col_is_num, num_val_this, cat_val_this);
376
+
377
+ } else {
378
+
379
+ if (model_outputs.all_trees[col][curr_tree].tree_right > 0)
380
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_right, curr_depth + 1,
381
+ row, col, col_is_num, num_val_this, cat_val_this);
382
+
383
+ }
384
+ break;
385
+ }
386
+
387
+ case Categorical:
388
+ {
389
+ cat_val_other = prediction_data.categorical_data[row + model_outputs.all_trees[col][curr_tree].col_num * prediction_data.nrows];
390
+ if (cat_val_other >= model_outputs.ncat[model_outputs.all_trees[col][curr_tree].col_num]) return false;
391
+ if (cat_val_other < 0) {
392
+ if (model_outputs.all_trees[col][curr_tree].tree_NA > 0)
393
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_NA, curr_depth + 1,
394
+ row, col, col_is_num, num_val_this, cat_val_this);
395
+ else return false;
396
+ }
397
+
398
+
399
+ if (model_outputs.all_trees[col][curr_tree].binary_branches.size() > 0) {
400
+
401
+ if (curr_tree == 0 && model_outputs.all_trees[col][curr_tree].column_type == NoType) {
402
+ /* binarized branches in the main tree */
403
+ for (size_t tree_follow : model_outputs.all_trees[col][curr_tree].binary_branches) {
404
+
405
+ if (tree_follow > 0)
406
+ found_outliers = follow_tree(model_outputs, prediction_data, tree_follow, curr_depth,
407
+ row, col, col_is_num, num_val_this, cat_val_this)?
408
+ true : found_outliers;
409
+ }
410
+ return found_outliers;
411
+
412
+ } else {
413
+
414
+ /* single-category branch in a categorical-by-categorical split */
415
+ if (model_outputs.all_trees[col][curr_tree].binary_branches[cat_val_other] > 0) {
416
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].binary_branches[cat_val_other], curr_depth + 1,
417
+ row, col, col_is_num, num_val_this, cat_val_this);
418
+ }
419
+ }
420
+
421
+ }
422
+
423
+ else if (model_outputs.all_trees[col][curr_tree].split_lev != INT_MAX) {
424
+
425
+ if (model_outputs.all_trees[col][curr_tree].split_lev == cat_val_other) {
426
+
427
+ if (model_outputs.all_trees[col][curr_tree].tree_left > 0)
428
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_left, curr_depth + 1,
429
+ row, col, col_is_num, num_val_this, cat_val_this);
430
+
431
+ } else {
432
+
433
+ if (model_outputs.all_trees[col][curr_tree].tree_right > 0)
434
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_right, curr_depth + 1,
435
+ row, col, col_is_num, num_val_this, cat_val_this);
436
+
437
+ }
438
+
439
+ }
440
+
441
+ else {
442
+
443
+ if (model_outputs.all_trees[col][curr_tree].split_subset[cat_val_other] == 1) {
444
+
445
+ if (model_outputs.all_trees[col][curr_tree].tree_left > 0)
446
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_left, curr_depth + 1,
447
+ row, col, col_is_num, num_val_this, cat_val_this);
448
+
449
+ } else if (model_outputs.all_trees[col][curr_tree].split_subset[cat_val_other] == 0) {
450
+
451
+ if (model_outputs.all_trees[col][curr_tree].tree_right > 0)
452
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_right, curr_depth + 1,
453
+ row, col, col_is_num, num_val_this, cat_val_this);
454
+
455
+ }
456
+
457
+ }
458
+ break;
459
+ }
460
+
461
+ case Ordinal:
462
+ {
463
+ cat_val_other = prediction_data.ordinal_data[row + model_outputs.all_trees[col][curr_tree].col_num * prediction_data.nrows];
464
+ if (cat_val_other >= model_outputs.ncat_ord[model_outputs.all_trees[col][curr_tree].col_num]) return false;
465
+ if (cat_val_other < 0) {
466
+
467
+ if (model_outputs.all_trees[col][curr_tree].tree_NA > 0)
468
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_NA, curr_depth + 1,
469
+ row, col, col_is_num, num_val_this, cat_val_this);
470
+
471
+ } else if (cat_val_other <= model_outputs.all_trees[col][curr_tree].split_lev) {
472
+
473
+ if (model_outputs.all_trees[col][curr_tree].tree_left > 0)
474
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_left, curr_depth + 1,
475
+ row, col, col_is_num, num_val_this, cat_val_this);
476
+
477
+ } else {
478
+
479
+ if (model_outputs.all_trees[col][curr_tree].tree_right > 0)
480
+ return follow_tree(model_outputs, prediction_data, model_outputs.all_trees[col][curr_tree].tree_right, curr_depth + 1,
481
+ row, col, col_is_num, num_val_this, cat_val_this);
482
+ }
483
+ break;
484
+ }
485
+
486
+ }
487
+
488
+ return false;
489
+ }
490
+
491
+ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr_depth, size_t curr_tree,
492
+ ModelOutputs &model_outputs, PredictionData &prediction_data, size_t_for row, size_t_for col,
493
+ bool col_is_num, double num_val_this, int cat_val_this)
494
+ {
495
+
496
+
497
+ bool tree_has_outliers = false;
498
+ bool flag_this_cluster;
499
+ double outlier_score;
500
+ size_t cluster_size;
501
+ size_t cluster_depth;
502
+ double num_val_other;
503
+ int cat_val_other;
504
+
505
+ if (clusters_in_tree.size() > 0) {
506
+
507
+ /* see if it would be an outlier under any of the clusters from this tree */
508
+ for (const size_t cl : clusters_in_tree) {
509
+
510
+ if (col_is_num) {
511
+ if (
512
+ num_val_this > model_outputs.all_clusters[col][cl].lower_lim &&
513
+ num_val_this < model_outputs.all_clusters[col][cl].upper_lim
514
+ ) continue;
515
+ } else {
516
+ if (model_outputs.all_clusters[col][cl].subset_common[cat_val_this] == 0)
517
+ continue;
518
+ }
519
+
520
+ /* if so, then check if it actually belongs into the cluster */
521
+ flag_this_cluster = false;
522
+ switch(model_outputs.all_clusters[col][cl].column_type) {
523
+
524
+ case NoType:
525
+ {
526
+ flag_this_cluster = true;
527
+ break;
528
+ }
529
+
530
+ case Numeric:
531
+ {
532
+ num_val_other = prediction_data.numeric_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
533
+ switch(model_outputs.all_clusters[col][cl].split_type) {
534
+ case IsNa:
535
+ {
536
+ if (isnan(num_val_other)) flag_this_cluster = true;
537
+ break;
538
+ }
539
+
540
+ case LessOrEqual:
541
+ {
542
+ if (!isnan(num_val_other) && num_val_other <= model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
543
+ break;
544
+ }
545
+
546
+ case Greater:
547
+ {
548
+ if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
549
+ break;
550
+ }
551
+ }
552
+ break;
553
+ }
554
+
555
+ case Categorical:
556
+ {
557
+ cat_val_other = prediction_data.categorical_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
558
+ if (cat_val_other >= model_outputs.ncat[model_outputs.all_clusters[col][cl].col_num]) continue;
559
+ switch(model_outputs.all_clusters[col][cl].split_type) {
560
+ case IsNa:
561
+ {
562
+ if (cat_val_other < 0) flag_this_cluster = true;
563
+ break;
564
+ }
565
+
566
+ case InSubset:
567
+ {
568
+ if (cat_val_other >=0 && model_outputs.all_clusters[col][cl].split_subset[cat_val_other] == 1) flag_this_cluster = true;
569
+ break;
570
+ }
571
+
572
+ case NotInSubset:
573
+ {
574
+ if (cat_val_other >=0 && model_outputs.all_clusters[col][cl].split_subset[cat_val_other] == 0) flag_this_cluster = true;
575
+ break;
576
+ }
577
+
578
+ case Equal:
579
+ {
580
+ if (cat_val_other == model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
581
+ break;
582
+ }
583
+
584
+ case NotEqual:
585
+ {
586
+ if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
587
+ break;
588
+ }
589
+
590
+ /* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
591
+ }
592
+ break;
593
+ }
594
+
595
+ case Ordinal:
596
+ {
597
+ cat_val_other = prediction_data.ordinal_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
598
+ if (cat_val_other >= model_outputs.ncat_ord[model_outputs.all_clusters[col][cl].col_num]) continue;
599
+ switch(model_outputs.all_clusters[col][cl].split_type) {
600
+ case IsNa:
601
+ {
602
+ if (cat_val_other < 0) flag_this_cluster = true;
603
+ break;
604
+ }
605
+
606
+ case LessOrEqual:
607
+ {
608
+ if (cat_val_other >=0 && cat_val_other <= model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
609
+ break;
610
+ }
611
+
612
+ case Greater:
613
+ {
614
+ if (cat_val_other >=0 && cat_val_other > model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
615
+ break;
616
+ }
617
+
618
+ case Equal:
619
+ {
620
+ if (cat_val_other == model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
621
+ break;
622
+ }
623
+
624
+ case NotEqual:
625
+ {
626
+ if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
627
+ break;
628
+ }
629
+ }
630
+ break;
631
+ }
632
+ }
633
+ if (flag_this_cluster) {
634
+
635
+ tree_has_outliers = true;
636
+ cluster_size = model_outputs.all_clusters[col][cl].cluster_size;
637
+ cluster_depth = curr_depth + ((model_outputs.all_clusters[col][cl].column_type == NoType)? 0 : 1);
638
+ if (col_is_num) {
639
+ outlier_score = chebyshyov_bound(z_score(
640
+ (model_outputs.col_transf[col] == NoTransf)? num_val_this :
641
+ (model_outputs.col_transf[col] == Log)?
642
+ log(num_val_this - model_outputs.transf_offset[col]) :
643
+ exp( (num_val_this - model_outputs.transf_offset[col]) / model_outputs.sd_div[col] ),
644
+ model_outputs.all_clusters[col][cl].cluster_mean,
645
+ model_outputs.all_clusters[col][cl].cluster_sd
646
+ )
647
+ );
648
+ } else {
649
+ outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
650
+ }
651
+
652
+ /* if this is the best cluster so far, remember it */
653
+ /* Note: if parallelizing by columns, must turn this into a critical section as the previously-assigned column can change in the meantime */
654
+ // #pragma omp critical
655
+ if (
656
+ model_outputs.outlier_scores_final[row] >= 1.0 ||
657
+ (
658
+ cluster_depth < model_outputs.outlier_depth_final[row] &&
659
+ (
660
+ model_outputs.all_clusters[col][cl].has_NA_branch
661
+ ==
662
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
663
+ )
664
+ ) ||
665
+ (
666
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
667
+ !model_outputs.all_clusters[col][cl].has_NA_branch
668
+ ) ||
669
+ (
670
+ cluster_depth == model_outputs.outlier_depth_final[row] &&
671
+ (
672
+ model_outputs.all_clusters[col][cl].has_NA_branch
673
+ ==
674
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
675
+ ) &&
676
+ cluster_size > model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
677
+ ) ||
678
+ (
679
+ cluster_depth == model_outputs.outlier_depth_final[row] &&
680
+ cluster_size == model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size &&
681
+ (
682
+ model_outputs.all_clusters[col][cl].has_NA_branch
683
+ ==
684
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
685
+ ) &&
686
+ outlier_score < model_outputs.outlier_scores_final[row]
687
+ )
688
+ )
689
+ {
690
+ model_outputs.outlier_columns_final[row] = col;
691
+ model_outputs.outlier_scores_final[row] = outlier_score;
692
+ model_outputs.outlier_clusters_final[row] = cl;
693
+ model_outputs.outlier_trees_final[row] = curr_tree;
694
+ model_outputs.outlier_depth_final[row] = cluster_depth;
695
+ }
696
+
697
+
698
+ }
699
+
700
+ }
701
+
702
+ }
703
+
704
+ return tree_has_outliers;
705
+
706
+ }