outliertree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +674 -0
- data/NOTICE.txt +14 -0
- data/README.md +107 -0
- data/ext/outliertree/ext.cpp +260 -0
- data/ext/outliertree/extconf.rb +21 -0
- data/lib/outliertree.rb +17 -0
- data/lib/outliertree/dataset.rb +35 -0
- data/lib/outliertree/model.rb +128 -0
- data/lib/outliertree/result.rb +190 -0
- data/lib/outliertree/version.rb +3 -0
- data/vendor/outliertree/LICENSE +674 -0
- data/vendor/outliertree/README.md +155 -0
- data/vendor/outliertree/src/Makevars +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +123 -0
- data/vendor/outliertree/src/Rwrapper.cpp +1225 -0
- data/vendor/outliertree/src/cat_outlier.cpp +328 -0
- data/vendor/outliertree/src/clusters.cpp +972 -0
- data/vendor/outliertree/src/fit_model.cpp +1932 -0
- data/vendor/outliertree/src/misc.cpp +685 -0
- data/vendor/outliertree/src/outlier_tree.hpp +758 -0
- data/vendor/outliertree/src/predict.cpp +706 -0
- data/vendor/outliertree/src/split.cpp +1098 -0
- metadata +150 -0
@@ -0,0 +1,685 @@
|
|
1
|
+
/********************************************************************************************************************
|
2
|
+
* Explainable outlier detection
|
3
|
+
*
|
4
|
+
* Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
|
5
|
+
* each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
|
6
|
+
* are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
|
7
|
+
* to "predict" (will not generate a score for each observation).
|
8
|
+
* Splits are based on gain, while outlierness is based on confidence intervals.
|
9
|
+
* Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
|
10
|
+
* Cortes, David. "Explainable outlier detection through decision tree conditioning."
|
11
|
+
* arXiv preprint arXiv:2001.00636 (2020).
|
12
|
+
*
|
13
|
+
*
|
14
|
+
* Copyright 2020 David Cortes.
|
15
|
+
*
|
16
|
+
* Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
|
17
|
+
* such as R or Python.
|
18
|
+
*
|
19
|
+
* This file is part of OutlierTree.
|
20
|
+
*
|
21
|
+
* OutlierTree is free software: you can redistribute it and/or modify
|
22
|
+
* it under the terms of the GNU General Public License as published by
|
23
|
+
* the Free Software Foundation, either version 3 of the License, or
|
24
|
+
* (at your option) any later version.
|
25
|
+
*
|
26
|
+
* OutlierTree is distributed in the hope that it will be useful,
|
27
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
28
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
29
|
+
* GNU General Public License for more details.
|
30
|
+
*
|
31
|
+
* You should have received a copy of the GNU General Public License
|
32
|
+
* along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
|
33
|
+
********************************************************************************************************************/
|
34
|
+
#include "outlier_tree.hpp"
|
35
|
+
|
36
|
+
|
37
|
+
/*
 * Compute, for a flat array that holds per-category counters of every column laid out
 * consecutively, the offset at which each column's counters begin. Each column gets one
 * extra slot at the end, where its missing-value count is stored. Columns with fewer
 * than two categories are flagged to be skipped. Returns the largest number of
 * categories seen across columns (compared against the incoming max_categ).
 */
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ)
{
    for (size_t col = 0; col < ncols; col++)
    {
        if (ncat[col] > max_categ) max_categ = ncat[col];
        /* reserve ncat[col] slots plus one for NAs */
        start_ix_cat_counts[col + 1] = start_ix_cat_counts[col] + ncat[col] + 1;
        /* a column with a single category can never produce a split */
        if (ncat[col] < 2) skip_col[col] = true;
    }

    return max_categ;
}
|
52
|
+
|
53
|
+
/* Save the counts of each category for each column in the array determined above */
|
54
|
+
void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
55
|
+
int categorical_data[], size_t ncols, size_t nrows,
|
56
|
+
bool has_NA[], bool skip_col[], int nthreads)
|
57
|
+
{
|
58
|
+
size_t col_st_offset;
|
59
|
+
size_t col_stop;
|
60
|
+
|
61
|
+
#pragma omp parallel for schedule(static, 1) num_threads(nthreads) private(col_st_offset, col_stop)
|
62
|
+
for (size_t_for col = 0; col < ncols; col++) {
|
63
|
+
|
64
|
+
if (skip_col[col]) continue;
|
65
|
+
|
66
|
+
col_st_offset = start_ix_cat_counts[col];
|
67
|
+
col_stop = (col + 1) * nrows;
|
68
|
+
for (size_t row = (col * nrows); row < col_stop; row++) {
|
69
|
+
|
70
|
+
if (categorical_data[row] >= 0) {
|
71
|
+
cat_counts[ categorical_data[row] + col_st_offset ]++;
|
72
|
+
} else {
|
73
|
+
cat_counts[ ncat[col] + col_st_offset ]++;
|
74
|
+
has_NA[col] = true;
|
75
|
+
}
|
76
|
+
|
77
|
+
}
|
78
|
+
}
|
79
|
+
}
|
80
|
+
|
81
|
+
/* Check if some column has a large majority that would make any split fail to meet minimum sizes */
|
82
|
+
void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
83
|
+
size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads)
|
84
|
+
{
|
85
|
+
size_t largest_cnt;
|
86
|
+
#pragma omp parallel for num_threads(nthreads) private(largest_cnt) shared(ncols, nrows, ncat, cat_counts, start_ix_cat_counts, min_conditioned_size, skip_col)
|
87
|
+
for (size_t_for col = 0; col < ncols; col++) {
|
88
|
+
|
89
|
+
largest_cnt = 0;
|
90
|
+
for (int cat = 0; cat <= ncat[col]; cat++) {
|
91
|
+
largest_cnt = std::max(largest_cnt, cat_counts[ cat + start_ix_cat_counts[col] ]);
|
92
|
+
}
|
93
|
+
if (largest_cnt > (nrows - min_conditioned_size)) skip_col[col] = true;
|
94
|
+
if (largest_cnt <= 1) skip_col[col] = true;
|
95
|
+
|
96
|
+
}
|
97
|
+
}
|
98
|
+
|
99
|
+
/* Calculate the maxmimum proportions in a subset below which a category *can* be considered as outlier (must meet other conditions too) */
|
100
|
+
void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
|
101
|
+
size_t start_ix_cat_counts[], size_t cat_counts[],
|
102
|
+
size_t ncols, size_t nrows, double z_norm, double z_tail)
|
103
|
+
{
|
104
|
+
/* TODO: come up with some way of flagging unseen categories as outliers */
|
105
|
+
long double mean;
|
106
|
+
long double sd;
|
107
|
+
long double nrows_dbl = (long double) nrows;
|
108
|
+
for (size_t col = 0; col < ncols; col++) {
|
109
|
+
|
110
|
+
for (size_t cat = start_ix_cat_counts[col]; cat < (start_ix_cat_counts[col + 1] - 1); cat++) {
|
111
|
+
|
112
|
+
if (cat_counts[cat] > 0) {
|
113
|
+
mean = (long double) cat_counts[cat] / nrows_dbl;
|
114
|
+
sd = sqrtl( mean * (1.0 - mean) / nrows_dbl );
|
115
|
+
prop_small[cat] = fminl(mean - z_norm * sd, mean * 0.5);
|
116
|
+
prop[cat] = mean;
|
117
|
+
} else {
|
118
|
+
prop_small[cat] = 0;
|
119
|
+
prop[cat] = 0;
|
120
|
+
}
|
121
|
+
|
122
|
+
}
|
123
|
+
|
124
|
+
}
|
125
|
+
}
|
126
|
+
|
127
|
+
|
128
|
+
/* Check if a numerical column has no variance (i.e. will not be splittable).
|
129
|
+
Along the way, also record the number of decimals to display for this column. */
|
130
|
+
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
|
131
|
+
bool skip_col[], int min_decimals[], int nthreads)
|
132
|
+
{
|
133
|
+
long double running_mean;
|
134
|
+
long double mean_prev;
|
135
|
+
long double running_ssq;
|
136
|
+
size_t cnt;
|
137
|
+
size_t col_stop;
|
138
|
+
double xval;
|
139
|
+
double min_val;
|
140
|
+
double max_val;
|
141
|
+
int min_decimals_col;
|
142
|
+
|
143
|
+
#pragma omp parallel for schedule(static) num_threads(nthreads) \
|
144
|
+
shared(nrows, ncols, numeric_data, has_NA, skip_col, min_decimals) \
|
145
|
+
private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
|
146
|
+
for (size_t_for col = 0; col < ncols; col++) {
|
147
|
+
running_mean = 0;
|
148
|
+
mean_prev = 0;
|
149
|
+
running_ssq = 0;
|
150
|
+
min_val = HUGE_VAL;
|
151
|
+
max_val = -HUGE_VAL;
|
152
|
+
cnt = 0;
|
153
|
+
col_stop = (col + 1) * nrows;
|
154
|
+
for (size_t row = col * nrows; row < col_stop; row++) {
|
155
|
+
xval = numeric_data[row];
|
156
|
+
if (!is_na_or_inf(xval)) {
|
157
|
+
running_mean += (xval - running_mean) / (long double)(++cnt);
|
158
|
+
running_ssq += (xval - running_mean) * (xval - mean_prev);
|
159
|
+
mean_prev = running_mean;
|
160
|
+
min_val = fmin(min_val, xval);
|
161
|
+
max_val = fmax(max_val, xval);
|
162
|
+
} else {
|
163
|
+
has_NA[col] = true;
|
164
|
+
}
|
165
|
+
}
|
166
|
+
if ( (running_ssq / (long double)(cnt - 1)) < 1e-6 ) skip_col[col] = true;
|
167
|
+
if (cnt > 1) {
|
168
|
+
min_decimals_col = 0;
|
169
|
+
min_decimals_col = std::max(min_decimals_col, decimals_diff(running_mean, min_val));
|
170
|
+
min_decimals_col = std::max(min_decimals_col, decimals_diff(running_mean, max_val));
|
171
|
+
min_decimals_col = std::max(min_decimals_col, decimals_diff(0., sqrtl((running_ssq / (long double)(cnt - 1)))));
|
172
|
+
min_decimals[col] = min_decimals_col;
|
173
|
+
}
|
174
|
+
}
|
175
|
+
}
|
176
|
+
|
177
|
+
/* Calculate mean and standard deviation from the central half of the data, and adjust SD heuristically by x2.5.
   When ix_arr is non-NULL, values are read through it as x[ix_arr[row]]; otherwise directly as x[row]. */
void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
{
    long double m = 0;
    long double m_prev = 0;
    long double ssq = 0;
    size_t first = st + size_quarter;
    size_t last  = end - size_quarter;

    /* Welford's single-pass mean / sum-of-squares over the middle two quarters */
    for (size_t row = first; row <= last; row++) {
        double val = (ix_arr != NULL)? x[ix_arr[row]] : x[row];
        m   += (val - m) / (long double)(row - first + 1);
        ssq += (val - m) * (val - m_prev);
        m_prev = m;
    }

    *mean_central = (double) m;
    /* heuristic inflation by 2.5x; note the divisor is the central count minus one */
    *sd_central = 2.5 * sqrtl(ssq / (long double)(end - st - 2 * size_quarter));
}
|
203
|
+
|
204
|
+
|
205
|
+
/* Check whether a numerical column has long tails, and whether a transformation is appropriate
 *
 * Will check if there are too many observations with large Z values at either side. If found, will
 * see if applying a transformation (exponentiation for left tail, logarithm for right tail) would
 * solve the problem. If not, will report approximate values for where the tails end/start. If it
 * has tails at both sides, will not process the column.
 *
 * Parameters:
 * - ix_arr[n] (in)
 *       Indices by which the 'x' variable would be sorted in ascending order. (Must be already sorted!!!)
 * - st (in)
 *       Position at which ix_arr starts (inclusive).
 * - end (in)
 *       Position at which ix_arr ends (inclusive).
 * - x[n] (in)
 *       Column with the original values to check for tails.
 * - z_norm (in)
 *       Model parameter. Default is 2.67.
 * - max_perc_outliers (in)
 *       Model parameter. Default is 0.01.
 * - buffer_x[n] (temp)
 *       Array where to store the transformed values of 'x' (will ignore ix_arr).
 * - mean (in)
 *       Mean to use for transforming to Z scores before exponentiating.
 * - sd (in)
 *       Standard deviation to use for transforming to Z scores before exponentiating.
 * - left_tail (out)
 *       Approximate value at which the tail is considered to end (if found and not solvable by transforming).
 * - right_tail (out)
 *       Approximate value at which the tail is considered to start (if found and not solvable by transforming).
 * - exp_transf (out)
 *       Whether to apply an exponential transformation (on the Z values!!!) to solve the problem of having a long left tail.
 * - log_transf (out)
 *       Whether to apply a log transform to solve the problem of having a long right tail.
 */
void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
                     double z_norm, double max_perc_outliers,
                     double *restrict buffer_x, double mean, double sd,
                     double *restrict left_tail, double *restrict right_tail,
                     bool *exp_transf, bool *log_transf)
{
    size_t size_quarter = (end - st + 1) / 4;
    size_t tail_ix;
    /* NOTE(review): 'median' is computed as an offset from zero (2 * size_quarter),
       which only matches the true median position when st == 0 — confirm callers
       always pass st == 0 for this function. */
    size_t median = 2 * size_quarter;
    double z_tail = 2 * z_norm;
    double const_add_log;
    /* -inf / +inf mean "no tail found at this side" */
    *left_tail = -HUGE_VAL;
    *right_tail = HUGE_VAL;
    /* number of observations allowed to lie beyond z_tail before declaring a tail */
    size_t max_norm_tail = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
    double mean_central, sd_central;
    calc_central_mean_and_sd(ix_arr, st, end, x, size_quarter, &mean_central, &sd_central);
    *exp_transf = false;
    *log_transf = false;
    /* the value 1 is used below as a temporary "tail detected" sentinel flag,
       later overwritten with either HUGE_VAL or the actual tail boundary */
    if ( z_score(x[ix_arr[st + max_norm_tail]], mean_central, sd_central) < (-z_tail) ) *left_tail = 1;
    if ( z_score(x[ix_arr[end - max_norm_tail]], mean_central, sd_central) > z_tail ) *right_tail = 1;

    /* check for left tail (too many low values) */
    if (*left_tail == 1) {

        /* check if exponentiation would help: transform the central half of the data
           (buffer_x is indexed directly, not through ix_arr) and re-measure */
        for (size_t row = (st + size_quarter); row <= (end - size_quarter); row++)
            buffer_x[row] = exp(z_score(x[ix_arr[row]], mean, sd));
        calc_central_mean_and_sd(NULL, st, end, buffer_x, size_quarter, &mean_central, &sd_central);
        /* transform the single observation at the tolerance boundary and test it */
        buffer_x[st + max_norm_tail] = exp(z_score(x[ix_arr[st + max_norm_tail]], mean, sd));
        if (z_score(buffer_x[st + max_norm_tail], mean_central, sd_central) >= -z_tail)
        {
            *left_tail = HUGE_VAL;
            *exp_transf = true;
        }

        /* if exponentiation doesn't help, determine where does the tail lie on the untransformed data */
        else {

            *exp_transf = false;
            /* scan upward from the minimum until values stop looking extreme */
            for (tail_ix = st; tail_ix <= median; tail_ix++) {
                if (z_score(x[ix_arr[tail_ix]], mean_central, sd_central) > (-z_tail)) break;
            }
            *left_tail = x[ix_arr[tail_ix]];

        }

    }

    /* check for right tail (too many high values) */
    if (*right_tail == 1 ) {

        /* shift so the minimum becomes slightly positive before taking logs */
        if (x[ix_arr[st]] == 0) {
            const_add_log = +1;
        } else {
            const_add_log = - x[ix_arr[st]] + 1e-3;
        }

        /* check if a log transform would help */
        for (size_t row = (st + size_quarter); row <= (end - size_quarter); row++)
            buffer_x[row] = log(x[ix_arr[row]] + const_add_log);
        calc_central_mean_and_sd(NULL, st, end, buffer_x, size_quarter, &mean_central, &sd_central);
        buffer_x[end - max_norm_tail] = log(x[ix_arr[end - max_norm_tail]] + const_add_log);
        if (z_score(buffer_x[end - max_norm_tail], mean_central, sd_central) <= z_tail)
        {
            *right_tail = HUGE_VAL;
            *log_transf = true;
        }

        /* if log transform doesn't help, determine where does the tail lie on the untransformed data */
        else {
            /* scan downward from the maximum until values stop looking extreme */
            for (tail_ix = end; tail_ix >= median; tail_ix--) {
                if (z_score(x[ix_arr[tail_ix]], mean_central, sd_central) < z_tail) break;
            }
            *right_tail = x[ix_arr[tail_ix]];
        }

    }

}
|
319
|
+
|
320
|
+
/* Partition the indices array so that rows already flagged as outliers
   (outlier score below 1) come first; returns where non-outliers begin. */
size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t st, size_t end)
{
    size_t pos = st;
    size_t swp;

    for (size_t i = st; i <= end; i++) {
        /* a score < 1.0 marks an identified outlier */
        if (outlier_scores[ix_arr[i]] < 1.0) {
            swp = ix_arr[pos];
            ix_arr[pos] = ix_arr[i];
            ix_arr[i] = swp;
            pos++;
        }
    }
    return pos;
}
|
337
|
+
|
338
|
+
/* Move missing values of a numeric variable to the front of the indices array and return the position at which non-missing ones start */
|
339
|
+
size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA)
|
340
|
+
{
|
341
|
+
size_t st_non_na = st;
|
342
|
+
size_t temp;
|
343
|
+
|
344
|
+
if (inf_as_NA) {
|
345
|
+
for (size_t i = st; i <= end; i++) {
|
346
|
+
if (is_na_or_inf(x[ix_arr[i]])) {
|
347
|
+
temp = ix_arr[st_non_na];
|
348
|
+
ix_arr[st_non_na] = ix_arr[i];
|
349
|
+
ix_arr[i] = temp;
|
350
|
+
st_non_na++;
|
351
|
+
}
|
352
|
+
}
|
353
|
+
} else {
|
354
|
+
for (size_t i = st; i <= end; i++) {
|
355
|
+
if (isnan(x[ix_arr[i]])) {
|
356
|
+
temp = ix_arr[st_non_na];
|
357
|
+
ix_arr[st_non_na] = ix_arr[i];
|
358
|
+
ix_arr[i] = temp;
|
359
|
+
st_non_na++;
|
360
|
+
}
|
361
|
+
}
|
362
|
+
}
|
363
|
+
|
364
|
+
return st_non_na;
|
365
|
+
}
|
366
|
+
|
367
|
+
/* Move missing values of a categorical variable to the front of the indices array and return the position at which non-missing ones start */
size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end)
{
    size_t pos = st;
    size_t swp;

    for (size_t i = st; i <= end; i++) {
        /* categorical NAs are represented as negative integers */
        if (x[ix_arr[i]] < 0) {
            swp = ix_arr[pos];
            ix_arr[pos] = ix_arr[i];
            ix_arr[i] = swp;
            pos++;
        }
    }
    return pos;
}
|
383
|
+
|
384
|
+
/* for numerical */
|
385
|
+
void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right)
|
386
|
+
{
|
387
|
+
size_t temp;
|
388
|
+
|
389
|
+
if (has_NA) {
|
390
|
+
*split_NA = move_NAs_to_front(ix_arr, x, st, end, false);
|
391
|
+
st = *split_NA;
|
392
|
+
} else { *split_NA = st; }
|
393
|
+
for (size_t row = st; row <= end; row++) {
|
394
|
+
|
395
|
+
/* move to the left if the category is there */
|
396
|
+
if (x[ix_arr[row]] <= split_point) {
|
397
|
+
temp = ix_arr[st];
|
398
|
+
ix_arr[st] = ix_arr[row];
|
399
|
+
ix_arr[row] = temp;
|
400
|
+
st++;
|
401
|
+
}
|
402
|
+
}
|
403
|
+
|
404
|
+
*st_right = st;
|
405
|
+
}
|
406
|
+
|
407
|
+
/* for categorical */
|
408
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
|
409
|
+
{
|
410
|
+
size_t temp;
|
411
|
+
|
412
|
+
if (has_NA) {
|
413
|
+
*split_NA = move_NAs_to_front(ix_arr, x, st, end);
|
414
|
+
st = *split_NA;
|
415
|
+
} else { *split_NA = st; }
|
416
|
+
for (size_t row = st; row <= end; row++) {
|
417
|
+
|
418
|
+
/* move to the left if the category is there */
|
419
|
+
if (subset_categ[ x[ix_arr[row]] ] != 0) {
|
420
|
+
temp = ix_arr[st];
|
421
|
+
ix_arr[st] = ix_arr[row];
|
422
|
+
ix_arr[row] = temp;
|
423
|
+
st++;
|
424
|
+
}
|
425
|
+
}
|
426
|
+
|
427
|
+
*st_right = st;
|
428
|
+
}
|
429
|
+
|
430
|
+
/* for ordinal */
|
431
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right)
|
432
|
+
{
|
433
|
+
size_t temp;
|
434
|
+
|
435
|
+
if (has_NA) {
|
436
|
+
*split_NA = move_NAs_to_front(ix_arr, x, st, end);
|
437
|
+
st = *split_NA;
|
438
|
+
} else { *split_NA = st; }
|
439
|
+
for (size_t row = st; row <= end; row++) {
|
440
|
+
|
441
|
+
/* move to the left if the category is there */
|
442
|
+
if (x[ix_arr[row]] <= split_lev) {
|
443
|
+
temp = ix_arr[st];
|
444
|
+
ix_arr[st] = ix_arr[row];
|
445
|
+
ix_arr[row] = temp;
|
446
|
+
st++;
|
447
|
+
}
|
448
|
+
}
|
449
|
+
|
450
|
+
*st_right = st;
|
451
|
+
}
|
452
|
+
|
453
|
+
/* thread-local memory where intermediate outputs and buffers are stored */
|
454
|
+
bool check_workspace_is_allocated(Workspace &workspace)
|
455
|
+
{
|
456
|
+
return workspace.ix_arr.size() > 0;
|
457
|
+
}
|
458
|
+
|
459
|
+
/* Size all thread-local buffers for nrows rows and up to max_categ categories */
void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ)
{
    /* per-row buffers */
    workspace.buffer_transf_y.resize(nrows);
    workspace.buffer_bin_y.resize(nrows);
    workspace.buffer_sd.resize(nrows);
    workspace.outlier_scores.resize(nrows);
    workspace.outlier_clusters.resize(nrows);
    workspace.outlier_trees.resize(nrows);
    workspace.outlier_depth.resize(nrows);

    /* row indices start out as the identity permutation */
    workspace.ix_arr.resize(nrows);
    for (size_t i = 0; i < nrows; i++) workspace.ix_arr[i] = i;

    /* per-category buffers (+1 slot for missing values where needed) */
    workspace.buffer_cat_sum.resize(max_categ + 1);
    workspace.buffer_cat_sum_sq.resize(max_categ + 1);
    workspace.buffer_cat_cnt.resize( (max_categ + 1) * 3);
    workspace.buffer_crosstab.resize(square(max_categ + 1));
    workspace.buffer_cat_sorted.resize(max_categ);
    workspace.buffer_subset_categ.resize(max_categ);
    workspace.buffer_subset_categ_best.resize(max_categ);
    workspace.buffer_subset_outlier.resize(max_categ);
}
|
482
|
+
|
483
|
+
/*
|
484
|
+
* This was a quick way of coding up the option 'follow_all' - it basically backs up the modifyable data that
|
485
|
+
* is looked at during a recursion, in a rather un-optimal manner. It can be optimized further by not copying
|
486
|
+
* everything, as it doesn't really need to always copy all variables (same for the restore function below).
|
487
|
+
* For example, at a given point, only one of buffer_subset_categ/this_split_point is used.
|
488
|
+
*/
|
489
|
+
void backup_recursion_state(Workspace &workspace, RecursionState &state_backup)
|
490
|
+
{
|
491
|
+
state_backup.gain_restore = workspace.this_gain;
|
492
|
+
state_backup.gain_best_restore = workspace.best_gain;
|
493
|
+
state_backup.split_point_restore = workspace.this_split_point;
|
494
|
+
state_backup.split_lev_restore = workspace.this_split_lev;
|
495
|
+
state_backup.split_subset_restore = workspace.buffer_subset_categ;
|
496
|
+
state_backup.ix1_restore = workspace.st;
|
497
|
+
state_backup.ix2_restore = workspace.this_split_NA;
|
498
|
+
state_backup.ix3_restore = workspace.this_split_ix;
|
499
|
+
state_backup.ix4_restore = workspace.end;
|
500
|
+
state_backup.col_best_restore = workspace.col_best;
|
501
|
+
state_backup.col_type_best_rememer = workspace.column_type_best;
|
502
|
+
state_backup.split_point_best_restore = workspace.split_point_best;
|
503
|
+
state_backup.split_lev_best_restore = workspace.split_lev_best;
|
504
|
+
state_backup.split_subset_best_restore = workspace.buffer_subset_categ_best;
|
505
|
+
state_backup.base_info_restore = workspace.base_info;
|
506
|
+
state_backup.base_info_orig_restore = workspace.base_info_orig;
|
507
|
+
state_backup.sd_y_restore = workspace.sd_y;
|
508
|
+
state_backup.has_outliers_restore = workspace.has_outliers;
|
509
|
+
state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
|
510
|
+
state_backup.temp_ptr_x = workspace.temp_ptr_x;
|
511
|
+
}
|
512
|
+
|
513
|
+
/* Inverse of backup_recursion_state: put every backed-up field back into the workspace */
void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
{
    /* index bounds of the subset */
    workspace.st = state_backup.ix1_restore;
    workspace.this_split_NA = state_backup.ix2_restore;
    workspace.this_split_ix = state_backup.ix3_restore;
    workspace.end = state_backup.ix4_restore;

    /* candidate split under evaluation */
    workspace.this_gain = state_backup.gain_restore;
    workspace.this_split_point = state_backup.split_point_restore;
    workspace.this_split_lev = state_backup.split_lev_restore;
    workspace.buffer_subset_categ = state_backup.split_subset_restore;

    /* best split found so far */
    workspace.best_gain = state_backup.gain_best_restore;
    workspace.col_best = state_backup.col_best_restore;
    workspace.column_type_best = state_backup.col_type_best_rememer;
    workspace.split_point_best = state_backup.split_point_best_restore;
    workspace.split_lev_best = state_backup.split_lev_best_restore;
    workspace.buffer_subset_categ_best = state_backup.split_subset_best_restore;

    /* statistics of the target column at this level */
    workspace.base_info = state_backup.base_info_restore;
    workspace.base_info_orig = state_backup.base_info_orig_restore;
    workspace.sd_y = state_backup.sd_y_restore;

    /* outlier flags and scratch pointer */
    workspace.has_outliers = state_backup.has_outliers_restore;
    workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
    workspace.temp_ptr_x = state_backup.temp_ptr_x;
}
|
536
|
+
|
537
|
+
/* Next split on the trees is only decided after they are already initialized */
|
538
|
+
void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col)
|
539
|
+
{
|
540
|
+
tree.column_type = Numeric;
|
541
|
+
tree.split_point = split_point;
|
542
|
+
tree.col_num = col;
|
543
|
+
}
|
544
|
+
|
545
|
+
/* Mark the tree's split as categorical, copying the category-membership mask */
void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
{
    tree.col_num = col;
    tree.column_type = Categorical;
    /* copy one membership flag per category and trim any excess capacity */
    tree.split_subset.assign(split_subset, split_subset + ncat);
    tree.split_subset.shrink_to_fit();
}
|
552
|
+
|
553
|
+
/* Mark the tree's split as categorical for a binary column:
   category 0 goes to one branch, category 1 to the other */
void set_tree_as_categorical(ClusterTree &tree, size_t col)
{
    tree.col_num = col;
    tree.column_type = Categorical;
    tree.split_subset.resize(2);
    tree.split_subset[0] = 1;
    tree.split_subset[1] = 0;
    tree.split_subset.shrink_to_fit();
}
|
562
|
+
|
563
|
+
/* Mark the tree's split as categorical with one binary branch per category,
   all branches initialized to zero */
void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat)
{
    tree.col_num = col;
    tree.column_type = Categorical;
    tree.binary_branches.resize(ncat, 0);
    tree.binary_branches.shrink_to_fit();
    tree.split_subset.shrink_to_fit();
}
|
571
|
+
|
572
|
+
/* Mark the tree's split as ordinal: "level <= split_lev" on column 'col' */
void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col)
{
    tree.col_num = col;
    tree.split_lev = split_lev;
    tree.column_type = Ordinal;
}
|
578
|
+
|
579
|
+
|
580
|
+
|
581
|
+
/* After presenting outliers, it's not necessary to retain their details about column/cluster/tree/etc. */
|
582
|
+
void forget_row_outputs(ModelOutputs &model_outputs)
|
583
|
+
{
|
584
|
+
model_outputs.outlier_scores_final.clear();
|
585
|
+
model_outputs.outlier_clusters_final.clear();
|
586
|
+
model_outputs.outlier_columns_final.clear();
|
587
|
+
model_outputs.outlier_trees_final.clear();
|
588
|
+
model_outputs.outlier_depth_final.clear();
|
589
|
+
model_outputs.outlier_decimals_distr.clear();
|
590
|
+
|
591
|
+
model_outputs.outlier_scores_final.shrink_to_fit();
|
592
|
+
model_outputs.outlier_clusters_final.shrink_to_fit();
|
593
|
+
model_outputs.outlier_columns_final.shrink_to_fit();
|
594
|
+
model_outputs.outlier_trees_final.shrink_to_fit();
|
595
|
+
model_outputs.outlier_depth_final.shrink_to_fit();
|
596
|
+
model_outputs.outlier_decimals_distr.shrink_to_fit();
|
597
|
+
}
|
598
|
+
|
599
|
+
/* (Re-)allocate the per-row output arrays, resetting them to their sentinel
   defaults: score 1.0 (= not an outlier), cluster 0, depth beyond max_depth */
void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
{
    /* drop any previous contents first */
    forget_row_outputs(model_outputs);

    model_outputs.outlier_scores_final.resize(nrows, 1.0);
    model_outputs.outlier_scores_final.shrink_to_fit();

    model_outputs.outlier_clusters_final.resize(nrows, 0);
    model_outputs.outlier_clusters_final.shrink_to_fit();

    model_outputs.outlier_columns_final.resize(nrows);
    model_outputs.outlier_columns_final.shrink_to_fit();

    model_outputs.outlier_trees_final.resize(nrows);
    model_outputs.outlier_trees_final.shrink_to_fit();

    model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
    model_outputs.outlier_depth_final.shrink_to_fit();

    model_outputs.outlier_decimals_distr.resize(nrows, 0);
    model_outputs.outlier_decimals_distr.shrink_to_fit();
}
|
616
|
+
|
617
|
+
void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
|
618
|
+
{
|
619
|
+
std::vector<std::unordered_set<double>> seen_values(ncols);
|
620
|
+
|
621
|
+
#pragma omp parallel for schedule(dynamic) num_threads(nthreads) shared(arr_num, nrows, ncols, too_few_values, seen_values)
|
622
|
+
for (size_t_for col = 0; col < ncols; col++) {
|
623
|
+
for (size_t row = 0; row < nrows; row++) {
|
624
|
+
if (!isnan(arr_num[row + col * nrows]))
|
625
|
+
seen_values[col].insert(arr_num[row + col * nrows]);
|
626
|
+
if (seen_values[col].size() > 2) break;
|
627
|
+
}
|
628
|
+
if (seen_values[col].size() <= 2)too_few_values[col] = true;
|
629
|
+
}
|
630
|
+
}
|
631
|
+
|
632
|
+
/* For each row flagged as an outlier on a numeric column, compute how many
   decimal places must be printed to distinguish the row's value from the
   cluster statistics it is being compared against */
void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads)
{
    if (numeric_data == NULL) return;

    double val_this;
    double val_comp;
    int min_decimals;
    size_t col_this;
    Cluster *cluster_this;
    size_t nrows = model_outputs.outlier_columns_final.size();

    #pragma omp parallel for schedule(dynamic) num_threads(nthreads) \
            shared(model_outputs, nrows, numeric_data) \
            private(val_this, val_comp, min_decimals, col_this, cluster_this)
    for (size_t_for row = 0; row < nrows; row++) {

        /* only rows flagged as outliers (score < 1) on a numeric column matter */
        if (model_outputs.outlier_scores_final[row] >= 1.0 ||
            model_outputs.outlier_columns_final[row] >= model_outputs.ncols_numeric
        )
            continue;

        col_this = model_outputs.outlier_columns_final[row];
        cluster_this = &model_outputs.all_clusters[col_this][model_outputs.outlier_clusters_final[row]];
        val_this = numeric_data[row + nrows * col_this];

        /* enough decimals to tell the value apart from the cluster mean ... */
        min_decimals = std::max(0, decimals_diff(val_this, cluster_this->display_mean));

        /* ... and from whichever display limit it exceeded */
        val_comp = (val_this >= cluster_this->upper_lim)?
                        cluster_this->display_lim_high : cluster_this->display_lim_low;
        min_decimals = std::max(min_decimals, decimals_diff(val_this, val_comp));

        model_outputs.outlier_decimals_distr[row] = min_decimals;
    }
}
|
667
|
+
|
668
|
+
int decimals_diff(double val1, double val2)
|
669
|
+
{
|
670
|
+
double res = ceil(-log10(fabs(val1 - val2)));
|
671
|
+
if (is_na_or_inf(res)) res = 0.;
|
672
|
+
return (int) res;
|
673
|
+
}
|
674
|
+
|
675
|
+
|
676
|
+
/* Reason behind this function: Cython (as of v0.29) will not auto-deallocate
   structs which are part of a cdef'd class, which produces a memory leak
   but can be force-destructed. Unfortunately, Cython itself doesn't even
   allow calling destructors for structs, so it has to be done externally.
   This function should otherwise have no reason to exist.
*/
void dealloc_ModelOutputs(ModelOutputs &model_outputs)
{
    /* Explicitly run the destructor. The caller must not use (or destruct
       again) the object afterwards: its lifetime has formally ended. */
    model_outputs.~ModelOutputs();
}
|