outliertree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +674 -0
- data/NOTICE.txt +14 -0
- data/README.md +107 -0
- data/ext/outliertree/ext.cpp +260 -0
- data/ext/outliertree/extconf.rb +21 -0
- data/lib/outliertree.rb +17 -0
- data/lib/outliertree/dataset.rb +35 -0
- data/lib/outliertree/model.rb +128 -0
- data/lib/outliertree/result.rb +190 -0
- data/lib/outliertree/version.rb +3 -0
- data/vendor/outliertree/LICENSE +674 -0
- data/vendor/outliertree/README.md +155 -0
- data/vendor/outliertree/src/Makevars +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +123 -0
- data/vendor/outliertree/src/Rwrapper.cpp +1225 -0
- data/vendor/outliertree/src/cat_outlier.cpp +328 -0
- data/vendor/outliertree/src/clusters.cpp +972 -0
- data/vendor/outliertree/src/fit_model.cpp +1932 -0
- data/vendor/outliertree/src/misc.cpp +685 -0
- data/vendor/outliertree/src/outlier_tree.hpp +758 -0
- data/vendor/outliertree/src/predict.cpp +706 -0
- data/vendor/outliertree/src/split.cpp +1098 -0
- metadata +150 -0
@@ -0,0 +1,706 @@
|
|
1
|
+
/********************************************************************************************************************
|
2
|
+
* Explainable outlier detection
|
3
|
+
*
|
4
|
+
* Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
|
5
|
+
* each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
|
6
|
+
* are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
|
7
|
+
* to "predict" (will not generate a score for each observation).
|
8
|
+
* Splits are based on gain, while outlierness is based on confidence intervals.
|
9
|
+
* Similar in spirit to the GritBot software developed by RuleQuest research.
|
10
|
+
*
|
11
|
+
*
|
12
|
+
* Copyright 2019 David Cortes.
|
13
|
+
*
|
14
|
+
* Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
|
15
|
+
* such as R or Python.
|
16
|
+
*
|
17
|
+
* This file is part of OutlierTree.
|
18
|
+
*
|
19
|
+
* OutlierTree is free software: you can redistribute it and/or modify
|
20
|
+
* it under the terms of the GNU General Public License as published by
|
21
|
+
* the Free Software Foundation, either version 3 of the License, or
|
22
|
+
* (at your option) any later version.
|
23
|
+
*
|
24
|
+
* OutlierTree is distributed in the hope that it will be useful,
|
25
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
26
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
27
|
+
* GNU General Public License for more details.
|
28
|
+
*
|
29
|
+
* You should have received a copy of the GNU General Public License
|
30
|
+
* along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
|
31
|
+
********************************************************************************************************************/
|
32
|
+
#include "outlier_tree.hpp"
|
33
|
+
|
34
|
+
|
35
|
+
/* Check if new data would be outliers according to previously-fit model
|
36
|
+
*
|
37
|
+
* Note that the new data must follow the exact same column order, and must also be passsed as arrays
|
38
|
+
* order by columns (like Fortran arrays, not like C arrays). For data with < 10,000 rows, it's usually
|
39
|
+
* faster to run it single-threaded. The outputs are pased in arrays within the 'ModelOutputs' struct,
|
40
|
+
* just like when fitting the model. Outpus for rows from previous calls to this function or to the model-fitting
|
41
|
+
* function will be overwriten.
|
42
|
+
*
|
43
|
+
* Parameters:
|
44
|
+
* - numeric_data[n * m1] (in)
|
45
|
+
* Array with numerical columns in the data. Must be ordered by columns like Fortran arrays.
|
46
|
+
* Missing values should be encoded as NaN. Infinite values in most sections are treated as NaN too.
|
47
|
+
* If there are no numerical columns, pass NULL.
|
48
|
+
* - categorical_data[n * m2] (in)
|
49
|
+
* Array with categorical columns in the data. Must be ordered by columns like Fortran arrays.
|
50
|
+
* Negative numbers will be interpreted as missing values. Numeration must start at zero and be
|
51
|
+
* contiguous (i.e. if there's category 2, must also have category 1).
|
52
|
+
* If there are no categorical columns, pass NULL.
|
53
|
+
* - ordinal_data[n * m3] (in)
|
54
|
+
* Array with ordinal categorical columns in the data. Must be ordered by columns like Fortran arrays.
|
55
|
+
* Same rules as for categorical data. Note that the order will only be taken into consideration when
|
56
|
+
* producing splits by these columns, but outliers are still detected in the same way as for categoricals.
|
57
|
+
* If there are no ordinal columns, pass NULL.
|
58
|
+
* - nrows (in)
|
59
|
+
* Number of rows (n) in the arrays passed above.
|
60
|
+
* - nthreads (in)
|
61
|
+
* Number of parallel threads to use.
|
62
|
+
* - model_outputs (in, out)
|
63
|
+
* Struct containing the data from the fitted model necessary to make new predictions,
|
64
|
+
* and buffer vectors where to store the details of the potential outliers found.
|
65
|
+
*
|
66
|
+
* Returns:
|
67
|
+
* Whether there were any outliers identified in the data passed here. Their details will be inside the
|
68
|
+
* 'ModelOutputs' struct.
|
69
|
+
*/
|
70
|
+
bool find_new_outliers(double *restrict numeric_data,
|
71
|
+
int *restrict categorical_data,
|
72
|
+
int *restrict ordinal_data,
|
73
|
+
size_t nrows, int nthreads, ModelOutputs &model_outputs)
|
74
|
+
{
|
75
|
+
size_t tot_cols = model_outputs.ncols_numeric + model_outputs.ncols_categ + model_outputs.ncols_ord;
|
76
|
+
double num_val_this;
|
77
|
+
int cat_val_this;
|
78
|
+
bool col_is_num;
|
79
|
+
|
80
|
+
bool found_outliers = false;
|
81
|
+
if (nrows < (size_t)nthreads)
|
82
|
+
nthreads = (int) nrows;
|
83
|
+
#if defined(_OPENMP)
|
84
|
+
std::vector<char> outliers_thread(nthreads, false);
|
85
|
+
#endif
|
86
|
+
|
87
|
+
/* reset the output data structures */
|
88
|
+
allocate_row_outputs(model_outputs, nrows, model_outputs.max_depth);
|
89
|
+
|
90
|
+
/* put data into a struct and pass it by reference */
|
91
|
+
PredictionData prediction_data = {numeric_data, categorical_data, ordinal_data, nrows};
|
92
|
+
|
93
|
+
/* Note: if parallelizing by columns instead of by rows, need to switch on the `#pragma omp critical`
|
94
|
+
in the block that assigns the cluster to an observation */
|
95
|
+
|
96
|
+
/* see if any value is an outlier */
|
97
|
+
// #pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(model_outputs, outliers_thread, nrows, tot_cols, prediction_data) private(col_is_num, num_val_this, cat_val_this)
|
98
|
+
for (size_t_for col = 0; col < tot_cols; col++) {
|
99
|
+
|
100
|
+
if (model_outputs.all_trees[col].size() == 0 || model_outputs.all_clusters[col].size() == 0) continue;
|
101
|
+
col_is_num = col < model_outputs.ncols_numeric;
|
102
|
+
|
103
|
+
/* Note: earlier versions of OpenMP (like v2 released in 2000 and still used by MSVC in 2019) don't support max reduction, hence this code */
|
104
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(model_outputs, outliers_thread, nrows, prediction_data) \
|
105
|
+
firstprivate(col_is_num, col) private(num_val_this, cat_val_this)
|
106
|
+
for (size_t_for row = 0; row < nrows; row++) {
|
107
|
+
|
108
|
+
/* first make a pre-check that the value could be flagged as outlier in some cluster */
|
109
|
+
if (col < model_outputs.ncols_numeric) {
|
110
|
+
|
111
|
+
num_val_this = prediction_data.numeric_data[row + col * nrows];
|
112
|
+
if (is_na_or_inf(num_val_this)) continue;
|
113
|
+
if ((num_val_this < model_outputs.max_outlier_any_cl[col]) && (num_val_this > model_outputs.min_outlier_any_cl[col])) continue;
|
114
|
+
|
115
|
+
} else if (col < (model_outputs.ncols_numeric + model_outputs.ncols_categ)) {
|
116
|
+
|
117
|
+
cat_val_this = prediction_data.categorical_data[row + (col - model_outputs.ncols_numeric) * nrows];
|
118
|
+
if (cat_val_this < 0) continue;
|
119
|
+
if (cat_val_this >= model_outputs.ncat[col - model_outputs.ncols_numeric]) continue;
|
120
|
+
if (!model_outputs.cat_outlier_any_cl[col - model_outputs.ncols_numeric][cat_val_this]) continue;
|
121
|
+
|
122
|
+
} else {
|
123
|
+
|
124
|
+
cat_val_this = prediction_data.ordinal_data[row + (col - model_outputs.ncols_numeric - model_outputs.ncols_categ) * nrows];
|
125
|
+
if (cat_val_this < 0) continue;
|
126
|
+
if (cat_val_this >= model_outputs.ncat_ord[col - model_outputs.ncols_numeric - model_outputs.ncols_categ]) continue;
|
127
|
+
if (!model_outputs.cat_outlier_any_cl[col - model_outputs.ncols_numeric][cat_val_this]) continue;
|
128
|
+
|
129
|
+
}
|
130
|
+
|
131
|
+
#ifdef _OPENMP
|
132
|
+
outliers_thread[omp_get_thread_num()] = follow_tree(model_outputs, prediction_data, 0, 0, row, col, col_is_num, num_val_this, cat_val_this)?
|
133
|
+
true : outliers_thread[omp_get_thread_num()];
|
134
|
+
#else
|
135
|
+
found_outliers = std::max(found_outliers, follow_tree(model_outputs, prediction_data, 0, 0, row, col, col_is_num, num_val_this, cat_val_this));
|
136
|
+
#endif
|
137
|
+
|
138
|
+
}
|
139
|
+
}
|
140
|
+
|
141
|
+
#if defined(_OPENMP)
|
142
|
+
for (size_t tid = 0; tid < outliers_thread.size(); tid++) {
|
143
|
+
if (outliers_thread[tid] != 0) found_outliers = true;
|
144
|
+
}
|
145
|
+
#endif
|
146
|
+
|
147
|
+
if (found_outliers)
|
148
|
+
calc_min_decimals_to_print(model_outputs, prediction_data.numeric_data, nthreads);
|
149
|
+
|
150
|
+
return found_outliers;
|
151
|
+
}
|
152
|
+
|
153
|
+
/*    Walk one row down the trees stored for column 'col', starting at tree 'curr_tree'
*
*     At every visited tree, the clusters attached to it are checked through 'check_is_outlier_in_tree'
*     (which also records the preferred cluster for the row inside 'model_outputs'). Afterwards, the
*     sub-tree(s) that the row falls into, according to the values it has in the splitting columns,
*     are visited recursively.
*
*     Parameters:
*     - model_outputs (in, out)
*         Fitted model plus the per-row output arrays.
*     - prediction_data (in)
*         Column-major arrays with the data being checked.
*     - curr_tree (in)
*         Index of the tree to visit within 'model_outputs.all_trees[col]' (zero is where the caller starts).
*     - curr_depth (in)
*         Depth associated to 'curr_tree'. It is increased by one when descending through a regular split
*         and kept as-is when following the 'binary_branches' of a tree whose column type is 'NoType'.
*     - row, col (in)
*         Row being examined and column whose value is being tested for outlierness.
*     - col_is_num, num_val_this, cat_val_this (in)
*         Whether 'col' is numeric, and the value of the row in that column (only the entry matching
*         the column type is used).
*
*     Returns:
*         Whether the row was flagged in this tree or in any of the trees visited below it. A flag raised
*         at this tree is never discarded: it is also reported when there is no sub-tree to follow or when
*         the sub-trees flag nothing.
*/
bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, size_t curr_tree, size_t curr_depth,
                 size_t_for row, size_t_for col, bool col_is_num, double num_val_this, int cat_val_this)
{
    bool found_outliers = check_is_outlier_in_tree(model_outputs.all_trees[col][curr_tree].clusters,
                                                   curr_depth, curr_tree, model_outputs, prediction_data,
                                                   row, col, col_is_num, num_val_this, cat_val_this);

    /* if there's outliers at this level and it's in a non-NA branch, there's no point in continuing
       further down the tree as deeper clusters are not preferred */
    if (
        found_outliers &&
        !model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
    ) return true;

    const auto &tree = model_outputs.all_trees[col][curr_tree];

    /* check if the tree is a dead-end (a branch index of zero means "no sub-tree") */
    const bool has_binary_branch =
        !tree.binary_branches.empty() &&
        *std::max_element(tree.binary_branches.begin(), tree.binary_branches.end()) != 0;
    const bool has_all_branch =
        !tree.all_branches.empty() &&
        *std::max_element(tree.all_branches.begin(), tree.all_branches.end()) != 0;
    if (
        tree.tree_NA == 0 && tree.tree_left == 0 && tree.tree_right == 0 &&
        !has_binary_branch && !has_all_branch
    ) return found_outliers;

    /* recursion helper: everything except the tree index and the depth stays the same */
    auto descend = [&](size_t next_tree, size_t next_depth) -> bool {
        return follow_tree(model_outputs, prediction_data, next_tree, next_depth,
                           row, col, col_is_num, num_val_this, cat_val_this);
    };

    /* follows a single child one level deeper if it exists, keeping whatever was flagged at this level */
    auto descend_child = [&](size_t child) -> bool {
        if (child > 0 && descend(child, curr_depth + 1))
            return true;
        return found_outliers;
    };

    /* if using 'follow_all', follow on all possible branches */
    if (!tree.all_branches.empty()) {

        for (size_t br : tree.all_branches) {
            if (br == 0) continue;

            const auto &branch = model_outputs.all_trees[col][br];
            bool take_branch = false; /* whether the row meets the condition that leads into 'br' */

            switch (branch.column_type) {

                case Numeric:
                {
                    const double num_val_other = prediction_data.numeric_data[row + branch.col_num * prediction_data.nrows];
                    const bool other_is_na = isnan(num_val_other) != 0;
                    switch (branch.split_this_branch) {
                        case IsNa:
                            take_branch = other_is_na;
                            break;
                        case LessOrEqual:
                            take_branch = !other_is_na && num_val_other <= branch.split_point;
                            break;
                        case Greater:
                            take_branch = !other_is_na && num_val_other > branch.split_point;
                            break;
                        default:
                            break;
                    }
                    break;
                }

                case Categorical:
                {
                    const int cat_val_other = prediction_data.categorical_data[row + branch.col_num * prediction_data.nrows];
                    /* categories beyond the ones known to the model cannot be matched to any branch */
                    if (cat_val_other >= model_outputs.ncat[branch.col_num]) break;
                    switch (branch.split_this_branch) {
                        case IsNa:
                            take_branch = cat_val_other < 0;
                            break;
                        case InSubset:
                            take_branch = cat_val_other >= 0 && branch.split_subset[cat_val_other] == 1;
                            break;
                        case NotInSubset:
                            take_branch = cat_val_other >= 0 && branch.split_subset[cat_val_other] == 0;
                            break;
                        case Equal:
                            take_branch = cat_val_other == branch.split_lev;
                            break;
                        case NotEqual:
                            take_branch = cat_val_other >= 0 && cat_val_other != branch.split_lev;
                            break;
                        default:
                            break;
                    }
                    break;
                }

                case Ordinal:
                {
                    const int cat_val_other = prediction_data.ordinal_data[row + branch.col_num * prediction_data.nrows];
                    if (cat_val_other >= model_outputs.ncat_ord[branch.col_num]) break;
                    switch (branch.split_this_branch) {
                        case IsNa:
                            take_branch = cat_val_other < 0;
                            break;
                        case LessOrEqual:
                            take_branch = cat_val_other >= 0 && cat_val_other <= branch.split_lev;
                            break;
                        case Greater:
                            take_branch = cat_val_other > branch.split_lev;
                            break;
                        case Equal:
                            take_branch = cat_val_other == branch.split_lev;
                            break;
                        case NotEqual:
                            take_branch = cat_val_other >= 0 && cat_val_other != branch.split_lev;
                            break;
                        default:
                            break;
                    }
                    break;
                }

                default:
                    break;
            }

            if (take_branch && descend(br, curr_depth + 1))
                found_outliers = true;
        }
        return found_outliers;
    }

    /* regular case (not using 'follow_all') - follow the corresponding branch */
    switch (tree.column_type) {

        case NoType:
        {
            if (tree.binary_branches.empty()) break;
            /* these sub-trees are visited at the same depth as the current tree */
            for (size_t tree_follow : tree.binary_branches) {
                if (tree_follow > 0 && descend(tree_follow, curr_depth))
                    found_outliers = true;
            }
            return found_outliers;
        }

        case Numeric:
        {
            const double num_val_other = prediction_data.numeric_data[row + tree.col_num * prediction_data.nrows];
            if (isnan(num_val_other))
                return descend_child(tree.tree_NA);
            if (num_val_other <= tree.split_point)
                return descend_child(tree.tree_left);
            return descend_child(tree.tree_right);
        }

        case Categorical:
        {
            const int cat_val_other = prediction_data.categorical_data[row + tree.col_num * prediction_data.nrows];
            /* category not known to the model: nothing to follow */
            if (cat_val_other >= model_outputs.ncat[tree.col_num]) return found_outliers;
            if (cat_val_other < 0)
                return descend_child(tree.tree_NA);

            /* single-category branch in a categorical-by-categorical split */
            if (!tree.binary_branches.empty())
                return descend_child(tree.binary_branches[cat_val_other]);

            /* split of the type "equal to one category" vs. the rest
               ('split_lev' equal to INT_MAX is what selects the subset split below) */
            if (tree.split_lev != INT_MAX)
                return descend_child((tree.split_lev == cat_val_other)? tree.tree_left : tree.tree_right);

            /* split by subset of categories: 1 goes left, 0 goes right, anything else is not followed */
            if (tree.split_subset[cat_val_other] == 1)
                return descend_child(tree.tree_left);
            if (tree.split_subset[cat_val_other] == 0)
                return descend_child(tree.tree_right);
            break;
        }

        case Ordinal:
        {
            const int cat_val_other = prediction_data.ordinal_data[row + tree.col_num * prediction_data.nrows];
            if (cat_val_other >= model_outputs.ncat_ord[tree.col_num]) return found_outliers;
            if (cat_val_other < 0)
                return descend_child(tree.tree_NA);
            if (cat_val_other <= tree.split_lev)
                return descend_child(tree.tree_left);
            return descend_child(tree.tree_right);
        }

        default:
            break;
    }

    /* nothing further to follow: report what was found at this level */
    return found_outliers;
}
|
490
|
+
|
491
|
+
bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr_depth, size_t curr_tree,
|
492
|
+
ModelOutputs &model_outputs, PredictionData &prediction_data, size_t_for row, size_t_for col,
|
493
|
+
bool col_is_num, double num_val_this, int cat_val_this)
|
494
|
+
{
|
495
|
+
|
496
|
+
|
497
|
+
bool tree_has_outliers = false;
|
498
|
+
bool flag_this_cluster;
|
499
|
+
double outlier_score;
|
500
|
+
size_t cluster_size;
|
501
|
+
size_t cluster_depth;
|
502
|
+
double num_val_other;
|
503
|
+
int cat_val_other;
|
504
|
+
|
505
|
+
if (clusters_in_tree.size() > 0) {
|
506
|
+
|
507
|
+
/* see if it would be an outlier under any of the clusters from this tree */
|
508
|
+
for (const size_t cl : clusters_in_tree) {
|
509
|
+
|
510
|
+
if (col_is_num) {
|
511
|
+
if (
|
512
|
+
num_val_this > model_outputs.all_clusters[col][cl].lower_lim &&
|
513
|
+
num_val_this < model_outputs.all_clusters[col][cl].upper_lim
|
514
|
+
) continue;
|
515
|
+
} else {
|
516
|
+
if (model_outputs.all_clusters[col][cl].subset_common[cat_val_this] == 0)
|
517
|
+
continue;
|
518
|
+
}
|
519
|
+
|
520
|
+
/* if so, then check if it actually belongs into the cluster */
|
521
|
+
flag_this_cluster = false;
|
522
|
+
switch(model_outputs.all_clusters[col][cl].column_type) {
|
523
|
+
|
524
|
+
case NoType:
|
525
|
+
{
|
526
|
+
flag_this_cluster = true;
|
527
|
+
break;
|
528
|
+
}
|
529
|
+
|
530
|
+
case Numeric:
|
531
|
+
{
|
532
|
+
num_val_other = prediction_data.numeric_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
|
533
|
+
switch(model_outputs.all_clusters[col][cl].split_type) {
|
534
|
+
case IsNa:
|
535
|
+
{
|
536
|
+
if (isnan(num_val_other)) flag_this_cluster = true;
|
537
|
+
break;
|
538
|
+
}
|
539
|
+
|
540
|
+
case LessOrEqual:
|
541
|
+
{
|
542
|
+
if (!isnan(num_val_other) && num_val_other <= model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
|
543
|
+
break;
|
544
|
+
}
|
545
|
+
|
546
|
+
case Greater:
|
547
|
+
{
|
548
|
+
if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
|
549
|
+
break;
|
550
|
+
}
|
551
|
+
}
|
552
|
+
break;
|
553
|
+
}
|
554
|
+
|
555
|
+
case Categorical:
|
556
|
+
{
|
557
|
+
cat_val_other = prediction_data.categorical_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
|
558
|
+
if (cat_val_other >= model_outputs.ncat[model_outputs.all_clusters[col][cl].col_num]) continue;
|
559
|
+
switch(model_outputs.all_clusters[col][cl].split_type) {
|
560
|
+
case IsNa:
|
561
|
+
{
|
562
|
+
if (cat_val_other < 0) flag_this_cluster = true;
|
563
|
+
break;
|
564
|
+
}
|
565
|
+
|
566
|
+
case InSubset:
|
567
|
+
{
|
568
|
+
if (cat_val_other >=0 && model_outputs.all_clusters[col][cl].split_subset[cat_val_other] == 1) flag_this_cluster = true;
|
569
|
+
break;
|
570
|
+
}
|
571
|
+
|
572
|
+
case NotInSubset:
|
573
|
+
{
|
574
|
+
if (cat_val_other >=0 && model_outputs.all_clusters[col][cl].split_subset[cat_val_other] == 0) flag_this_cluster = true;
|
575
|
+
break;
|
576
|
+
}
|
577
|
+
|
578
|
+
case Equal:
|
579
|
+
{
|
580
|
+
if (cat_val_other == model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
|
581
|
+
break;
|
582
|
+
}
|
583
|
+
|
584
|
+
case NotEqual:
|
585
|
+
{
|
586
|
+
if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
|
587
|
+
break;
|
588
|
+
}
|
589
|
+
|
590
|
+
/* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
|
591
|
+
}
|
592
|
+
break;
|
593
|
+
}
|
594
|
+
|
595
|
+
case Ordinal:
|
596
|
+
{
|
597
|
+
cat_val_other = prediction_data.ordinal_data[row + model_outputs.all_clusters[col][cl].col_num * prediction_data.nrows];
|
598
|
+
if (cat_val_other >= model_outputs.ncat_ord[model_outputs.all_clusters[col][cl].col_num]) continue;
|
599
|
+
switch(model_outputs.all_clusters[col][cl].split_type) {
|
600
|
+
case IsNa:
|
601
|
+
{
|
602
|
+
if (cat_val_other < 0) flag_this_cluster = true;
|
603
|
+
break;
|
604
|
+
}
|
605
|
+
|
606
|
+
case LessOrEqual:
|
607
|
+
{
|
608
|
+
if (cat_val_other >=0 && cat_val_other <= model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
|
609
|
+
break;
|
610
|
+
}
|
611
|
+
|
612
|
+
case Greater:
|
613
|
+
{
|
614
|
+
if (cat_val_other >=0 && cat_val_other > model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
|
615
|
+
break;
|
616
|
+
}
|
617
|
+
|
618
|
+
case Equal:
|
619
|
+
{
|
620
|
+
if (cat_val_other == model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
|
621
|
+
break;
|
622
|
+
}
|
623
|
+
|
624
|
+
case NotEqual:
|
625
|
+
{
|
626
|
+
if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
|
627
|
+
break;
|
628
|
+
}
|
629
|
+
}
|
630
|
+
break;
|
631
|
+
}
|
632
|
+
}
|
633
|
+
if (flag_this_cluster) {
|
634
|
+
|
635
|
+
tree_has_outliers = true;
|
636
|
+
cluster_size = model_outputs.all_clusters[col][cl].cluster_size;
|
637
|
+
cluster_depth = curr_depth + ((model_outputs.all_clusters[col][cl].column_type == NoType)? 0 : 1);
|
638
|
+
if (col_is_num) {
|
639
|
+
outlier_score = chebyshyov_bound(z_score(
|
640
|
+
(model_outputs.col_transf[col] == NoTransf)? num_val_this :
|
641
|
+
(model_outputs.col_transf[col] == Log)?
|
642
|
+
log(num_val_this - model_outputs.transf_offset[col]) :
|
643
|
+
exp( (num_val_this - model_outputs.transf_offset[col]) / model_outputs.sd_div[col] ),
|
644
|
+
model_outputs.all_clusters[col][cl].cluster_mean,
|
645
|
+
model_outputs.all_clusters[col][cl].cluster_sd
|
646
|
+
)
|
647
|
+
);
|
648
|
+
} else {
|
649
|
+
outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
|
650
|
+
}
|
651
|
+
|
652
|
+
/* if this is the best cluster so far, remember it */
|
653
|
+
/* Note: if parallelizing by columns, must turn this into a critical section as the previously-assigned column can change in the meantime */
|
654
|
+
// #pragma omp critical
|
655
|
+
if (
|
656
|
+
model_outputs.outlier_scores_final[row] >= 1.0 ||
|
657
|
+
(
|
658
|
+
cluster_depth < model_outputs.outlier_depth_final[row] &&
|
659
|
+
(
|
660
|
+
model_outputs.all_clusters[col][cl].has_NA_branch
|
661
|
+
==
|
662
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
663
|
+
)
|
664
|
+
) ||
|
665
|
+
(
|
666
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
|
667
|
+
!model_outputs.all_clusters[col][cl].has_NA_branch
|
668
|
+
) ||
|
669
|
+
(
|
670
|
+
cluster_depth == model_outputs.outlier_depth_final[row] &&
|
671
|
+
(
|
672
|
+
model_outputs.all_clusters[col][cl].has_NA_branch
|
673
|
+
==
|
674
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
675
|
+
) &&
|
676
|
+
cluster_size > model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
677
|
+
) ||
|
678
|
+
(
|
679
|
+
cluster_depth == model_outputs.outlier_depth_final[row] &&
|
680
|
+
cluster_size == model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size &&
|
681
|
+
(
|
682
|
+
model_outputs.all_clusters[col][cl].has_NA_branch
|
683
|
+
==
|
684
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
685
|
+
) &&
|
686
|
+
outlier_score < model_outputs.outlier_scores_final[row]
|
687
|
+
)
|
688
|
+
)
|
689
|
+
{
|
690
|
+
model_outputs.outlier_columns_final[row] = col;
|
691
|
+
model_outputs.outlier_scores_final[row] = outlier_score;
|
692
|
+
model_outputs.outlier_clusters_final[row] = cl;
|
693
|
+
model_outputs.outlier_trees_final[row] = curr_tree;
|
694
|
+
model_outputs.outlier_depth_final[row] = cluster_depth;
|
695
|
+
}
|
696
|
+
|
697
|
+
|
698
|
+
}
|
699
|
+
|
700
|
+
}
|
701
|
+
|
702
|
+
}
|
703
|
+
|
704
|
+
return tree_has_outliers;
|
705
|
+
|
706
|
+
}
|