outliertree 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +674 -0
- data/NOTICE.txt +14 -0
- data/README.md +107 -0
- data/ext/outliertree/ext.cpp +260 -0
- data/ext/outliertree/extconf.rb +21 -0
- data/lib/outliertree.rb +17 -0
- data/lib/outliertree/dataset.rb +35 -0
- data/lib/outliertree/model.rb +128 -0
- data/lib/outliertree/result.rb +190 -0
- data/lib/outliertree/version.rb +3 -0
- data/vendor/outliertree/LICENSE +674 -0
- data/vendor/outliertree/README.md +155 -0
- data/vendor/outliertree/src/Makevars +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +123 -0
- data/vendor/outliertree/src/Rwrapper.cpp +1225 -0
- data/vendor/outliertree/src/cat_outlier.cpp +328 -0
- data/vendor/outliertree/src/clusters.cpp +972 -0
- data/vendor/outliertree/src/fit_model.cpp +1932 -0
- data/vendor/outliertree/src/misc.cpp +685 -0
- data/vendor/outliertree/src/outlier_tree.hpp +758 -0
- data/vendor/outliertree/src/predict.cpp +706 -0
- data/vendor/outliertree/src/split.cpp +1098 -0
- metadata +150 -0
@@ -0,0 +1,328 @@
|
|
1
|
+
/********************************************************************************************************************
|
2
|
+
* Explainable outlier detection
|
3
|
+
*
|
4
|
+
* Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
|
5
|
+
* each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
|
6
|
+
* are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
|
7
|
+
* to "predict" (will not generate a score for each observation).
|
8
|
+
* Splits are based on gain, while outlierness is based on confidence intervals.
|
9
|
+
* Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
|
10
|
+
* Cortes, David. "Explainable outlier detection through decision tree conditioning."
|
11
|
+
* arXiv preprint arXiv:2001.00636 (2020).
|
12
|
+
*
|
13
|
+
*
|
14
|
+
* Copyright 2020 David Cortes.
|
15
|
+
*
|
16
|
+
* Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
|
17
|
+
* such as R or Python.
|
18
|
+
*
|
19
|
+
* This file is part of OutlierTree.
|
20
|
+
*
|
21
|
+
* OutlierTree is free software: you can redistribute it and/or modify
|
22
|
+
* it under the terms of the GNU General Public License as published by
|
23
|
+
* the Free Software Foundation, either version 3 of the License, or
|
24
|
+
* (at your option) any later version.
|
25
|
+
*
|
26
|
+
* OutlierTree is distributed in the hope that it will be useful,
|
27
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
28
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
29
|
+
* GNU General Public License for more details.
|
30
|
+
*
|
31
|
+
* You should have received a copy of the GNU General Public License
|
32
|
+
* along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
|
33
|
+
********************************************************************************************************************/
|
34
|
+
#include "outlier_tree.hpp"
|
35
|
+
|
36
|
+
|
37
|
+
/* Check whether to consider any category as outlier, based on current counts and prior probabilities
|
38
|
+
*
|
39
|
+
* Function is to be applied to some subset of the data obtained by splitting by one or more columns.
|
40
|
+
* For outliers before any split there is a separate function. Note that since it required current
|
41
|
+
* probability to be lower than prior probability in order to consider as outlier, it cannot be
|
42
|
+
* used with the full data (only with subsets).
|
43
|
+
*
|
44
|
+
* Parameters:
|
45
|
+
* - categ_counts[ncateg] (in)
|
46
|
+
* Counts of each category in the subset (including non-present categories).
|
47
|
+
* - ncateg (in)
|
48
|
+
* Number of categories for this column (including non-present categories).
|
49
|
+
* - tot (in)
|
50
|
+
* Number of rows in the subset.
|
51
|
+
* - max_perc_outliers (in)
|
52
|
+
* Model parameter. Default value is 0.01.
|
53
|
+
* - perc_threshold[ncateg] (in)
|
54
|
+
* Threshold for the proportion/probability of each category below which it can be considered
|
55
|
+
* to be an outlier in a subset of the data. Note that in addition it will build a confidence
|
56
|
+
* interval here which might make it even smaller.
|
57
|
+
* - buffer_ix[ncateg] (temp)
|
58
|
+
* Buffer where to store indices of categories sorted by proportion.
|
59
|
+
* - buffer_perc[ncateg] (temp)
|
60
|
+
* Buffer where to store proportions of counts.
|
61
|
+
* - z_norm (in)
|
62
|
+
* Model parameter. Default value is 2.67.
|
63
|
+
* - is_outlier[ncateg] (out)
|
64
|
+
* Array where to define whether any category is an outlier. Values will be as follows:
|
65
|
+
* (-1) -> Category had zero count, but would be an outlier if it appeared among this group
|
66
|
+
* 0 -> Category is not an outlier
|
67
|
+
* (+1) -> Category is an outlier
|
68
|
+
* - found_outliers (out)
|
69
|
+
* Whether there were any outliers identified among the counts.
|
70
|
+
* - new_is_outlier (out)
|
71
|
+
* Whether any of the categories with zero count would be flagged as outlier if they appeared in this group.
|
72
|
+
* - next_most_comm (out)
|
73
|
+
* Proportion of the least common category that is not flagged as outlier.
|
74
|
+
*/
|
75
|
+
void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
76
|
+
long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
|
77
|
+
double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier,
|
78
|
+
double *next_most_comm)
|
79
|
+
{
|
80
|
+
//TODO: must also establish bounds for new, unseen categories
|
81
|
+
|
82
|
+
/* initialize parameters as needed */
|
83
|
+
*found_outliers = false;
|
84
|
+
*new_is_outlier = false;
|
85
|
+
size_t st_non_zero = 0;
|
86
|
+
size_t end_tail = 0;
|
87
|
+
size_t max_outliers = (size_t) calculate_max_cat_outliers((long double)tot, max_perc_outliers, z_norm);
|
88
|
+
long double tot_dbl = (long double) tot;
|
89
|
+
long double pct_unseen = (long double)1 / (long double)(tot + 1);
|
90
|
+
size_t size_tail = 0;
|
91
|
+
|
92
|
+
/* reset the temporary arrays and fill them */
|
93
|
+
memset(is_outlier, 0, ncateg * sizeof(char));
|
94
|
+
for (size_t cat = 0; cat < ncateg; cat++) {
|
95
|
+
buffer_ix[cat] = cat;
|
96
|
+
buffer_perc[cat] = (categ_counts[cat] > 0)? ((long double)categ_counts[cat] / tot_dbl) : 0;
|
97
|
+
}
|
98
|
+
|
99
|
+
/* sort the categories by counts */
|
100
|
+
std::sort(buffer_ix, buffer_ix + ncateg,
|
101
|
+
[&categ_counts](const size_t a, const size_t b){return categ_counts[a] < categ_counts[b];});
|
102
|
+
|
103
|
+
/* find the first non-zero */
|
104
|
+
for (size_t cat = 0; cat < ncateg; cat++) {
|
105
|
+
if (categ_counts[ buffer_ix[cat] ] > 0) {
|
106
|
+
st_non_zero = cat;
|
107
|
+
break;
|
108
|
+
}
|
109
|
+
}
|
110
|
+
|
111
|
+
/* check that least common is not common enough to be normal */
|
112
|
+
if (categ_counts[ buffer_ix[st_non_zero] ] > max_outliers) return;
|
113
|
+
|
114
|
+
/* find tail among non-zero proportions
|
115
|
+
* a tail is considered to be so if:
|
116
|
+
* - the difference is above z_norm sd's of either proportion
|
117
|
+
* - the difference is greater than some fraction of the larger
|
118
|
+
* - the actual proportion here is lower than a CI of the prior proportion
|
119
|
+
* - the actual proportion here is half or less of the prior proportion
|
120
|
+
*/
|
121
|
+
for (size_t cat = st_non_zero; cat < ncateg - 1; cat++) {
|
122
|
+
if (
|
123
|
+
(
|
124
|
+
(buffer_perc[buffer_ix[cat + 1]] - buffer_perc[buffer_ix[cat]])
|
125
|
+
>
|
126
|
+
z_norm * sqrtl(
|
127
|
+
fmaxl(
|
128
|
+
buffer_perc[buffer_ix[cat + 1]] * ((long double)1 - buffer_perc[buffer_ix[cat + 1]]),
|
129
|
+
buffer_perc[buffer_ix[cat]] * ((long double)1 - buffer_perc[buffer_ix[cat]])
|
130
|
+
)
|
131
|
+
/ tot_dbl
|
132
|
+
)
|
133
|
+
)
|
134
|
+
&&
|
135
|
+
(
|
136
|
+
buffer_perc[buffer_ix[cat + 1]] * 0.5 > buffer_perc[buffer_ix[cat]]
|
137
|
+
)
|
138
|
+
)
|
139
|
+
{
|
140
|
+
end_tail = cat;
|
141
|
+
*next_most_comm = buffer_perc[buffer_ix[cat + 1]];
|
142
|
+
break;
|
143
|
+
}
|
144
|
+
}
|
145
|
+
|
146
|
+
/* if the tail is too long, don't identify any as outlier, but see if unseen categories (with prior > 0) would create a new tail */
|
147
|
+
for (size_t cat = st_non_zero; cat <= end_tail; cat++) size_tail += categ_counts[ buffer_ix[cat] ];
|
148
|
+
|
149
|
+
if (size_tail >= max_outliers) {
|
150
|
+
|
151
|
+
if (
|
152
|
+
st_non_zero == 0 ||
|
153
|
+
// ((long double)buffer_ix[buffer_ix[st_non_zero]] / (tot_dbl + 1)) * 0.5 <= pct_unseen ||
|
154
|
+
( ((long double)buffer_ix[buffer_ix[st_non_zero]] * 0.5) / (tot_dbl + 1)) <= pct_unseen ||
|
155
|
+
((long double)(buffer_ix[buffer_ix[st_non_zero]] - 1) / (tot_dbl + 1))
|
156
|
+
- (long double)z_norm * sqrtl(buffer_perc[buffer_ix[st_non_zero]] * ((long double)1 - buffer_perc[buffer_ix[st_non_zero]]) / tot_dbl)
|
157
|
+
>= pct_unseen
|
158
|
+
) return;
|
159
|
+
|
160
|
+
for (size_t cat = 0; cat < st_non_zero; cat++) {
|
161
|
+
if (perc_threshold[buffer_ix[cat]] > pct_unseen) {
|
162
|
+
*new_is_outlier = true;
|
163
|
+
is_outlier[buffer_ix[cat]] = -1;
|
164
|
+
}
|
165
|
+
}
|
166
|
+
*next_most_comm = buffer_perc[buffer_ix[st_non_zero]];
|
167
|
+
return;
|
168
|
+
|
169
|
+
}
|
170
|
+
|
171
|
+
/* now determine if any category in the tail is an outlier */
|
172
|
+
for (size_t cat = st_non_zero; cat <= end_tail; cat++) {
|
173
|
+
|
174
|
+
/* must have a proportion below CI and below half of prior */
|
175
|
+
if (buffer_perc[buffer_ix[cat]] < perc_threshold[buffer_ix[cat]]) {
|
176
|
+
is_outlier[buffer_ix[cat]] = 1;
|
177
|
+
*found_outliers = true;
|
178
|
+
}
|
179
|
+
}
|
180
|
+
|
181
|
+
/* check if any new categories would be outliers */
|
182
|
+
if (st_non_zero > 0) {
|
183
|
+
for (size_t cat = 0; cat < st_non_zero; cat++) {
|
184
|
+
if (perc_threshold[buffer_ix[cat]] > pct_unseen) {
|
185
|
+
*new_is_outlier = true;
|
186
|
+
is_outlier[buffer_ix[cat]] = -1;
|
187
|
+
}
|
188
|
+
}
|
189
|
+
}
|
190
|
+
if (*new_is_outlier && !(*found_outliers)) {
|
191
|
+
*next_most_comm = buffer_perc[buffer_ix[st_non_zero]];
|
192
|
+
}
|
193
|
+
|
194
|
+
}
|
195
|
+
|
196
|
+
/* Check whether to consider any category as outlier, based on majority category and prior probabilties
|
197
|
+
*
|
198
|
+
* Function is to be applied to some subset of the data obtained by splitting by one or more columns.
|
199
|
+
* For outliers before any split there is a separate function. This is an alternative to the "tail"
|
200
|
+
* approach above which is more in line with GritBot.
|
201
|
+
*
|
202
|
+
* Parameters:
|
203
|
+
* - categ_counts[ncateg] (in)
|
204
|
+
* Counts of each category in the subset (including non-present categories).
|
205
|
+
* - ncateg (in)
|
206
|
+
* Number of categories for this column (including non-present categories).
|
207
|
+
* - tot (in)
|
208
|
+
* Number of rows in the subset.
|
209
|
+
* - max_perc_outliers (in)
|
210
|
+
* Model parameter. Default value is 0.01.
|
211
|
+
* - prior_prob[ncateg] (in)
|
212
|
+
* Proportions that each category had in the full data.
|
213
|
+
* - z_outlier (in)
|
214
|
+
* Model parameter. Default value is 8.0
|
215
|
+
* - is_outlier[ncateg] (out)
|
216
|
+
* Array where to define whether any category is an outlier. Values will be as follows:
|
217
|
+
* (-1) -> Category had zero count, but would be an outlier if it appeared among this group
|
218
|
+
* 0 -> Category is not an outlier
|
219
|
+
* (+1) -> Category is an outlier
|
220
|
+
* - found_outliers (out)
|
221
|
+
* Whether there were any outliers identified among the counts.
|
222
|
+
* - new_is_outlier (out)
|
223
|
+
* Whether any of the categories with zero count would be flagged as outlier if they appeared in this group.
|
224
|
+
* - categ_maj (out)
|
225
|
+
* Category to which the majority of the observations belong.
|
226
|
+
*/
|
227
|
+
void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
228
|
+
long double prior_prob[], double z_outlier, char is_outlier[],
|
229
|
+
bool *found_outliers, bool *new_is_outlier, int *categ_maj)
|
230
|
+
{
|
231
|
+
/* initialize parameters as needed */
|
232
|
+
*found_outliers = false;
|
233
|
+
*new_is_outlier = false;
|
234
|
+
memset(is_outlier, 0, ncateg * sizeof(char));
|
235
|
+
size_t max_outliers = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
|
236
|
+
long double tot_dbl = (long double) (tot + 1);
|
237
|
+
size_t n_non_maj;
|
238
|
+
long double thr_prop = (double)1 / square(z_outlier);
|
239
|
+
|
240
|
+
/* check if any can be considered as outlier */
|
241
|
+
size_t *ptr_maj = std::max_element(categ_counts, categ_counts + ncateg);
|
242
|
+
*categ_maj = (int)(ptr_maj - categ_counts);
|
243
|
+
n_non_maj = tot - *ptr_maj;
|
244
|
+
if (n_non_maj > max_outliers)
|
245
|
+
return;
|
246
|
+
|
247
|
+
/* determine proportions and check for outlierness */
|
248
|
+
long double n_non_maj_dbl = (long double) n_non_maj;
|
249
|
+
for (size_t cat = 0; cat < ncateg; cat++) {
|
250
|
+
|
251
|
+
if ((int)cat == *categ_maj) continue;
|
252
|
+
|
253
|
+
if ( (n_non_maj_dbl / (tot_dbl * prior_prob[cat])) < thr_prop ) {
|
254
|
+
if (categ_counts[cat]) {
|
255
|
+
is_outlier[cat] = 1;
|
256
|
+
*found_outliers = true;
|
257
|
+
} else {
|
258
|
+
is_outlier[cat] = -1;
|
259
|
+
*new_is_outlier = true;
|
260
|
+
}
|
261
|
+
}
|
262
|
+
}
|
263
|
+
|
264
|
+
/* TODO: implement formula for flagging unsen categories (not in the sample, nor the full data) as outliers */
|
265
|
+
}
|
266
|
+
|
267
|
+
|
268
|
+
/* Check whether to consider any category as outlier before splitting, based on prior counts
*
* Follows very rough criteria: there can be at most 1-3 outliers depending on size of dataset,
* and the next most common category must have a count of at least 250.
*
* Parameters:
* - categ_counts[ncateg] (in)
*       Frequencies of each category in the full data.
* - ncateg (in)
*       Number of categories with non-zero count.
* - tot (in)
*       Number of rows.
* - is_outlier[ncateg] (out)
*       Array indicating whether any category is outlier (0 = non-outlier, 1 = outlier).
*       Note: entries may be set even when the function returns false (e.g. when a second
*       rare category disqualifies the column), so callers must check the return value.
* - next_most_comm (out)
*       Proportion of the least common non-outlier category (written only when an outlier is found).
*
* Returns:
* - Whether any category was flagged as an outlier.
*/
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
                                     char is_outlier[], double *next_most_comm)
{
    /* if sample is too small, don't flag any as outliers */
    if (tot < 1000) return false;

    /* set a very low outlier threshold with a hard limit of 3 */
    size_t max_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);

    /* will only consider a category as outlier if the next most common is very common */
    size_t max_next_most_comm = 250;

    /* look if there's any category meeting the first condition and none meeting the second one */
    bool has_outlier_cat = false;
    memset(is_outlier, 0, sizeof(char) * ncateg);
    for (size_t cat = 0; cat < ncateg; cat++) {
        /* any category with an in-between count disqualifies the whole column */
        if (categ_counts[cat] > max_outliers && categ_counts[cat] < max_next_most_comm) {
            has_outlier_cat = false;
            break;
        }

        if (categ_counts[cat] > 0 && categ_counts[cat] <= max_outliers) {
            /* can only have 1 outlier category in the whole column */
            if (has_outlier_cat) { has_outlier_cat = false; break; }

            has_outlier_cat = true;
            is_outlier[cat] = 1;
        }

    }

    /* if outlier is found, find next most common frequency for printed statistics */
    if (has_outlier_cat) {
        /* FIX: sentinel for minimizing size_t counts must be SIZE_MAX, not INT_MAX --
           an int-sized sentinel would be wrong for counts above INT_MAX */
        size_t next_most_comm_cat = SIZE_MAX;
        for (size_t cat = 0; cat < ncateg; cat++) {
            if (categ_counts[cat] > 0 && !is_outlier[cat]) {
                next_most_comm_cat = std::min(next_most_comm_cat, categ_counts[cat]);
            }
        }
        *next_most_comm = (long double)next_most_comm_cat / (long double)tot;
    }

    return has_outlier_cat;
}
|
@@ -0,0 +1,972 @@
|
|
1
|
+
/********************************************************************************************************************
|
2
|
+
* Explainable outlier detection
|
3
|
+
*
|
4
|
+
* Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
|
5
|
+
* each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
|
6
|
+
* are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
|
7
|
+
* to "predict" (will not generate a score for each observation).
|
8
|
+
* Splits are based on gain, while outlierness is based on confidence intervals.
|
9
|
+
* Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
|
10
|
+
* Cortes, David. "Explainable outlier detection through decision tree conditioning."
|
11
|
+
* arXiv preprint arXiv:2001.00636 (2020).
|
12
|
+
*
|
13
|
+
*
|
14
|
+
* Copyright 2020 David Cortes.
|
15
|
+
*
|
16
|
+
* Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
|
17
|
+
* such as R or Python.
|
18
|
+
*
|
19
|
+
* This file is part of OutlierTree.
|
20
|
+
*
|
21
|
+
* OutlierTree is free software: you can redistribute it and/or modify
|
22
|
+
* it under the terms of the GNU General Public License as published by
|
23
|
+
* the Free Software Foundation, either version 3 of the License, or
|
24
|
+
* (at your option) any later version.
|
25
|
+
*
|
26
|
+
* OutlierTree is distributed in the hope that it will be useful,
|
27
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
28
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
29
|
+
* GNU General Public License for more details.
|
30
|
+
*
|
31
|
+
* You should have received a copy of the GNU General Public License
|
32
|
+
* along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
|
33
|
+
********************************************************************************************************************/
|
34
|
+
#include "outlier_tree.hpp"
|
35
|
+
|
36
|
+
|
37
|
+
|
38
|
+
/* Characterize a homogenous 1-dimensional cluster
|
39
|
+
*
|
40
|
+
* Calculates limits and display statistics on the distribution of one numerical variable,
|
41
|
+
* flagging potential outliers if found. Can be run on the full data or on subsets obtained from splitting
|
42
|
+
* by other variables.
|
43
|
+
*
|
44
|
+
* In order to flag an observation as outlier, it must:
|
45
|
+
* * Be in a very small/large percentile of the subset passed here.
|
46
|
+
* * Have a large absolute Z value (standardized and centered).
|
47
|
+
* Have a large gap in the Z value with respect to the next largest/smallest observation.
|
48
|
+
* * Not be in a long tail (unless the variable was transformed by exponentiating or taking logarithm).
|
49
|
+
*
|
50
|
+
* Parameters:
|
51
|
+
* - x[n] (in)
|
52
|
+
* Variable for which to define the cluster.
|
53
|
+
* - ix_arr[n] (in)
|
54
|
+
* Indices to take from the array above.
|
55
|
+
* - st (in)
|
56
|
+
* Position at which ix_arr starts (inclusive).
|
57
|
+
* - end (in)
|
58
|
+
* Position at which ix_arr ends (inclusive).
|
59
|
+
* - outlier_scores[n] (in, out)
|
60
|
+
* Outlier scores (based on chebyshyov's inequality) that are already assigned to the observations from this column
|
61
|
+
* from previous runs of this function in larger subsets (should be started to 1).
|
62
|
+
* - outlier_clusters[n] (in, out)
|
63
|
+
* Cluster number under which an observation is the most anomalous.
|
64
|
+
* - outlier_trees[n] (in, out)
|
65
|
+
* Tree under which the outlier cluster assigned lies.
|
66
|
+
* - outlier_depth[n] (in, out)
|
67
|
+
* Tree depth at which the outlier cluster assigned is found.
|
68
|
+
* - cluster (in, out)
|
69
|
+
* Outlier cluster object with statistics and limits.
|
70
|
+
* - clusters (in)
|
71
|
+
* Vector containing all cluster already generated.
|
72
|
+
* - cluster_num (in)
|
73
|
+
* Number to give to this cluster.
|
74
|
+
* - tree_num (in)
|
75
|
+
* Number of the tree under which this cluster is to be found.
|
76
|
+
* - tree_depth (in)
|
77
|
+
* Distance form the tree root at which this tree is to be found.
|
78
|
+
* - is_log_transf (in)
|
79
|
+
* Whether the column 'x' has undergone a logarithmic transformation.
|
80
|
+
* - log_minval (in)
|
81
|
+
* Value that was added to 'x' before taking its logarithm (if it was log-transformed).
|
82
|
+
* - is_exp_transf (in)
|
83
|
+
* Whether the column 'x' has undergone an exponential transformation on its standardized values.
|
84
|
+
* - orig_mean (in)
|
85
|
+
* Mean of the variable 'x' before being standardized (if it was exponentiated).
|
86
|
+
* - orig_sd (in)
|
87
|
+
* Standard deviation of the variable 'x' before being standardized (if it was exponentiated).
|
88
|
+
* - left_tail (in)
|
89
|
+
* Value of 'x' after which it is considered a long tail, in which outliers will not be searched for.
|
90
|
+
* - right_tail (in)
|
91
|
+
* Value of 'x' before which it is considered a long tail, in which outliers will not be searched for.
|
92
|
+
* - orig_x (in)
|
93
|
+
* Original values of 'x' if it was transformed (log or exp).
|
94
|
+
* - max_perc_outliers (in)
|
95
|
+
* Model parameter. Default is 0.01.
|
96
|
+
* - z_norm (in)
|
97
|
+
* Model parameter. Default is 2.67.
|
98
|
+
* - z_outlier (in)
|
99
|
+
* Model parameter. Default is 8.0. Must be greater than z_norm.
|
100
|
+
*
|
101
|
+
* Returns:
|
102
|
+
* - Whether there were any outliers detected.
|
103
|
+
*/
|
104
|
+
bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
                              double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                              size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
                              size_t cluster_num, size_t tree_num, size_t tree_depth,
                              bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
                              double left_tail, double right_tail, double *restrict orig_x,
                              double max_perc_outliers, double z_norm, double z_outlier)
{

    /* TODO: this function could try to determine if the distribution is multimodal, and if so,
       take only the most extreme means/sd for outlier comparisons */

    /* TODO: statistics like SD, mean; are already available from the splitting function which
       is called right before this, so these should *only* need to be recalculated if the column
       has undergone log or exp transform */

    /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
    bool has_low_values = false;
    bool has_high_values = false;
    long double running_mean = 0;
    long double mean_prev = 0;
    long double running_ssq = 0;
    double xval;
    double mean;
    double sd;
    size_t cnt;
    /* number of observations at each extreme that are treated as a potential outlier tail */
    size_t tail_size = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
    size_t st_non_tail = st + tail_size;
    size_t end_non_tail = end - tail_size;
    size_t st_normals = 0;
    size_t end_normals = 0;
    /* minimum z-score gap between consecutive sorted points to separate outliers from normal values */
    double min_gap = z_outlier - z_norm;

    /* TODO: here it's not necessary to sort the whole data, only top/bottom N */

    /* sort the data (ascending by value; ix_arr is permuted in place) */
    std::sort(ix_arr + st, ix_arr + end + 1, [&x](const size_t a, const size_t b){return x[a] < x[b];});

    /* calculate statistics with tails and previous outliers excluded
       (Welford-style running mean and sum of squares for numerical stability) */
    cnt = end_non_tail - st_non_tail + 1;
    for (size_t row = st_non_tail; row <= end_non_tail; row++) {
        xval = x[ ix_arr[row] ];
        running_mean += (xval - running_mean) / (long double)(row - st_non_tail + 1);
        running_ssq  += (xval - running_mean) * (xval - mean_prev);
        mean_prev     = running_mean;

    }
    mean = (double) running_mean;
    sd   = (double) sqrtl(running_ssq / (long double)(cnt - 1));

    /* adjust SD heuristically to account for reduced size, by (N + tail)/(N-tail) --- note that cnt = N-2*tail */
    sd *= (long double)(cnt + 3 * tail_size) / (long double)(cnt + tail_size);
    /* re-adjust if there's a one-sided tail and no transformation was applied */
    if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
        sd *= 0.5;
    }
    cluster.cluster_mean = mean;
    cluster.cluster_sd = sd;
    cnt = end - st + 1;

    /* see if the minimum and/or maximum values qualify for outliers
       (must exceed z_outlier in absolute z-score AND lie outside the declared long-tail regions) */
    if (-z_score(x[ix_arr[st]], mean, sd)  >= z_outlier && x[ix_arr[st]]  > left_tail)  has_low_values  = true;
    if ( z_score(x[ix_arr[end]], mean, sd) >= z_outlier && x[ix_arr[end]] < right_tail) has_high_values = true;

    /* look for a large gap in the z-scores */
    if (has_low_values) {
        for (size_t row = st; row < st + tail_size; row++) {

            if (( z_score(x[ix_arr[row + 1]], mean, sd) - z_score(x[ix_arr[row]], mean, sd) ) >= min_gap) {
                st_normals = row + 1;
                /* limits are expressed back in the original (untransformed) scale */
                if (is_exp_transf) {
                    cluster.lower_lim = log(x[ix_arr[row + 1]] - min_gap * sd) * orig_sd + orig_mean;
                } else if (is_log_transf) {
                    cluster.lower_lim = exp(x[ix_arr[row + 1]] - min_gap * sd) + log_minval;
                } else {
                    cluster.lower_lim = x[ix_arr[row + 1]] - min_gap * sd;
                }
                cluster.display_lim_low = orig_x[ix_arr[row + 1]];
                cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
                break;
            }
            /* stop scanning once values are no longer extreme enough to be outliers */
            if (z_score(x[ix_arr[row]], mean, sd) > -z_outlier) break;

        }
        if (st_normals == 0) {
            has_low_values = false;
        } else {
            for (size_t row = st; row < st_normals; row++) {

                /* assign outlier if it's a better cluster than previously assigned */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                        &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (cnt - 2 * tail_size)
                            )
                        )
                    )
                )
                {
                    outlier_scores[ix_arr[row]] = chebyshyov_bound(z_score(x[ix_arr[row]], mean, sd));
                    if (is_na_or_inf(outlier_scores[ix_arr[row]])) outlier_scores[ix_arr[row]] = 0;
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }

            }
        }
    }
    if (!has_low_values) {
        cluster.perc_above = 1.0;
        if (!is_log_transf && !is_exp_transf) {

            if (isinf(left_tail)) {
                cluster.lower_lim = x[ix_arr[st]] - min_gap * sd;
            } else {
                /* inside a declared long tail: no meaningful lower bound */
                cluster.lower_lim = -HUGE_VAL;
            }

        } else if (is_exp_transf) {
            cluster.lower_lim = log(x[ix_arr[st]] - min_gap * sd) * orig_sd + orig_mean;
        } else {
            cluster.lower_lim = exp(x[ix_arr[st]] - min_gap * sd) + log_minval;
        }

        cluster.display_lim_low = orig_x[ix_arr[st]];

    }

    /* mirror of the low-values scan, this time over the upper tail */
    if (has_high_values) {
        for (size_t row = end; row > (end - tail_size); row--) {

            if (( z_score(x[ix_arr[row]], mean, sd) - z_score(x[ix_arr[row - 1]], mean, sd) ) >= min_gap) {
                end_normals = row - 1;
                if (is_exp_transf) {
                    cluster.upper_lim = log(x[ix_arr[row - 1]] + min_gap * sd) * orig_sd + orig_mean;
                } else if (is_log_transf) {
                    cluster.upper_lim = exp(x[ix_arr[row - 1]] + min_gap * sd) + log_minval;
                } else {
                    cluster.upper_lim = x[ix_arr[row - 1]] + min_gap * sd;
                }
                cluster.display_lim_high = orig_x[ix_arr[row - 1]];
                cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
                break;
            }
            if (z_score(x[ix_arr[row]], mean, sd) < z_outlier) break;

        }
        if (end_normals == 0) {
            has_high_values = false;
        } else {
            for (size_t row = end; row > end_normals; row--) {

                /* assign outlier if it's a better cluster than previously assigned - Note that it might produce slight mismatches
                   against the predict function (the latter is more trustable) due to the size of the cluster not yet being known
                   at the moment of determining whether to overwrite previous in here */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                        &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (cnt - 2 * tail_size)
                            )
                        )
                    )
                )
                {
                    outlier_scores[ix_arr[row]] = chebyshyov_bound(z_score(x[ix_arr[row]], mean, sd));
                    if (is_na_or_inf(outlier_scores[ix_arr[row]])) outlier_scores[ix_arr[row]] = 0;
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }

            }
        }
    }
    if (!has_high_values) {
        cluster.perc_below = 1.0;
        if (!is_log_transf && !is_exp_transf) {

            if (isinf(right_tail)) {
                cluster.upper_lim = x[ix_arr[end]] + min_gap * sd;
            } else {
                cluster.upper_lim = HUGE_VAL;
            }
        } else if (is_exp_transf) {
            cluster.upper_lim = log(x[ix_arr[end]] + min_gap * sd) * orig_sd + orig_mean;
        } else {
            cluster.upper_lim = exp(x[ix_arr[end]] + min_gap * sd) + log_minval;
        }

        cluster.display_lim_high = orig_x[ix_arr[end]];
    }

    /* save displayed statistics for cluster (recomputed over the non-outlier range, on the original scale) */
    if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
        size_t st_disp = has_low_values? st_normals : st;
        size_t end_disp = has_high_values? end_normals : end;
        running_mean = 0;
        mean_prev = 0;
        running_ssq = 0;
        for (size_t row = st_disp; row <= end_disp; row++) {
            xval = orig_x[ix_arr[row]];
            running_mean += (xval - running_mean) / (long double)(row - st_disp + 1);
            running_ssq += (xval - running_mean) * (xval - mean_prev);
            mean_prev = running_mean;
        }
        cluster.cluster_size = end_disp - st_disp + 1;
        cluster.display_mean = (double) running_mean;
        cluster.display_sd = (double) sqrtl(running_ssq / (long double)(cluster.cluster_size - 1));
    } else {
        cluster.display_mean = cluster.cluster_mean;
        cluster.display_sd = cluster.cluster_sd;
        cluster.cluster_size = end - st + 1;
    }

    /* report whether outliers were found or not */
    return has_low_values || has_high_values;
}
|
336
|
+
|
337
|
+
|
338
|
+
/* Characterize a homogeneous categorical cluster from the *full* data
|
339
|
+
*
|
340
|
+
* Function is meant for the data as it comes, before splitting it, as once split, it will
|
341
|
+
* not be able to detect these outliers. As such, it takes fewer parameters, since it can only
|
342
|
+
* be the first tree and cluster in a column. It assumes the outliers have already been identified.
|
343
|
+
*
|
344
|
+
* Parameters:
|
345
|
+
* - x[n]
|
346
|
+
* Array indicating the category to which each observation belongs.
|
347
|
+
* - ix_arr[n] (in)
|
348
|
+
* Indices to take from the array above.
|
349
|
+
* - st (in)
|
350
|
+
* Position at which ix_arr starts (inclusive).
|
351
|
+
* - end (in)
|
352
|
+
* Position at which ix_arr ends (inclusive).
|
353
|
+
* - ncateg (in)
|
354
|
+
* Number of categories in this column.
|
355
|
+
* - outlier_scores[n] (in, out)
|
356
|
+
* Array where to assign outlier scores (based on proportion) to each observation belonging to an outlier category.
|
357
|
+
* - outlier_clusters[n] (in, out)
|
358
|
+
* Array where to assign cluster number to each observation belonging to an outlier category.
|
359
|
+
* - outlier_trees[n] (in, out)
|
360
|
+
* Array where to assign tree number to each observation belonging to an outlier category.
|
361
|
+
* - outlier_depth[n] (in, out)
|
362
|
+
* Array where to assign tree depth to each observation belonging to an outlier category.
|
363
|
+
* - cluster (in, out)
|
364
|
+
* Outlier cluster object with statistics and classifications.
|
365
|
+
* - categ_counts[ncateg] (in)
|
366
|
+
* Array with the frequencies of each category in the data.
|
367
|
+
* - is_outlier[ncateg] (in)
|
368
|
+
* Array indicating which categories are to be considered as outliers (must be already calculated).
|
369
|
+
* - perc_next_most_comm (in)
|
370
|
+
* Proportion of the least common non-outlier category (must be already calculated).
|
371
|
+
*/
|
372
|
+
void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
|
373
|
+
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
374
|
+
size_t *restrict outlier_depth, Cluster &cluster,
|
375
|
+
size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm)
|
376
|
+
{
|
377
|
+
size_t cnt_common = end - st + 1;
|
378
|
+
cluster.cluster_size = cnt_common;
|
379
|
+
double pct_outl;
|
380
|
+
cluster.subset_common.assign(is_outlier, is_outlier + ncateg);
|
381
|
+
cluster.score_categ.resize(ncateg, 0);
|
382
|
+
|
383
|
+
|
384
|
+
for (size_t row = st; row <= end; row++) {
|
385
|
+
if (is_outlier[x[ix_arr[row]]]) {
|
386
|
+
cnt_common--;
|
387
|
+
pct_outl = (long double)categ_counts[ x[ix_arr[row]] ] / (long double)cluster.cluster_size;
|
388
|
+
pct_outl = pct_outl + sqrt(pct_outl * (1 - pct_outl) / (long double)cluster.cluster_size);
|
389
|
+
cluster.score_categ[ x[ix_arr[row]] ] = pct_outl;
|
390
|
+
outlier_scores[ix_arr[row]] = pct_outl;
|
391
|
+
outlier_clusters[ix_arr[row]] = 0;
|
392
|
+
outlier_trees[ix_arr[row]] = 0;
|
393
|
+
outlier_depth[ix_arr[row]] = 0;
|
394
|
+
}
|
395
|
+
}
|
396
|
+
cluster.perc_in_subset = (long double)cnt_common / (long double)cluster.cluster_size;
|
397
|
+
cluster.perc_next_most_comm = perc_next_most_comm;
|
398
|
+
}
|
399
|
+
|
400
|
+
|
401
|
+
/* Characterize a homogeneous categorical cluster from a subset of the data, or report if it's not homogeneous
*
* Function is meant to be called with subsets of the data only. Will calculate the counts inside it.
* In order to consider a category as outlier, it must:
* * Have a proportion smaller than its prior probability and than a confidence interval of its prior.
* * Have a large gap with respect to the next most-common category.
* * Be in a cluster in which few or no observations belong to a category meeting such conditions.
* It's oftentimes not possible to create a cluster with category frequencies that would produce outliers,
* in which case it will report whether the cluster should be dropped.
*
* Parameters:
* - x[n]
*       Array indicating the category to which each observation belongs.
* - ix_arr[n] (in)
*       Indices to take from the array above.
* - st (in)
*       Position at which ix_arr starts (inclusive).
* - end (in)
*       Position at which ix_arr ends (inclusive).
* - ncateg (in)
*       Number of categories in this column.
* - by_maj (in)
*       Model parameter. Default is 'false'. Indicates whether to detect outliers according to the number of non-majority
*       observations compared to the expected number for each category.
* - outlier_scores[n] (in, out)
*       Outlier scores (based on observed category proportion) that are already assigned to the observations from this column
*       from previous runs of this function in larger subsets (should be started to 1).
* - outlier_clusters[n] (in, out)
*       Cluster number under which an observation is the most anomalous.
* - outlier_trees[n] (in, out)
*       Tree under which the outlier cluster assigned lies.
* - outlier_depth[n] (in, out)
*       Tree depth at which the outlier cluster assigned is found.
* - cluster (in, out)
*       Outlier cluster object with statistics and limits.
* - clusters (in)
*       Vector containing all cluster already generated.
* - cluster_num (in)
*       Number to give to this cluster.
* - tree_num (in)
*       Number of the tree under which this cluster is to be found.
* - tree_depth (in)
*       Distance from the tree root at which this tree is to be found.
* - max_perc_outliers (in)
*       Model parameter. Default is 0.01.
* - z_norm (in)
*       Model parameter. Default is 2.67.
* - z_outlier (in)
*       Model parameter. Default is 8.0.
* - perc_threshold[ncateg] (in)
*       Observed proportion below which a category can be considered as outlier.
* - prop_prior[ncateg] (in)
*       Prior probability of each category in the full data (only used when passing 'by_maj' = 'true').
* - buffer_categ_counts[ncateg] (temp)
*       Buffer where to save the observed frequencies of each category.
* - buffer_categ_pct[ncateg] (temp)
*       Buffer where to save the observed proportion of each category.
* - buffer_categ_ix[ncateg] (temp)
*       Buffer where to save the category numbers sorted by proportion.
* - buffer_outliers[ncateg] (temp)
*       Buffer where to save the results of which categories are flagged as outliers
*       before copying it to the cluster (will not copy if none is flagged).
* - drop_cluster (out)
*       Whether the cluster should be dropped (i.e. it was not possible to flag any present
*       or non-present category as outlier).
*
* Returns:
* - Whether it identified any outliers or not.
*/
bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
                          double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                          size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
                          size_t cluster_num, size_t tree_num, size_t tree_depth,
                          double max_perc_outliers, double z_norm, double z_outlier,
                          long double *restrict perc_threshold, long double *restrict prop_prior,
                          size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
                          size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
                          bool *restrict drop_cluster)
{
    bool found_outliers, new_is_outlier;
    size_t tot = end - st + 1;
    size_t sz_maj = tot;  /* count of observations NOT flagged as outliers (decremented below) */
    long double tot_dbl = (long double) tot;
    size_t tail_size = (size_t) calculate_max_outliers(tot_dbl, max_perc_outliers);
    cluster.perc_in_subset = 1;
    double pct_outl;

    /* calculate category counts */
    memset(buffer_categ_counts, 0, ncateg * sizeof(size_t));
    for (size_t row = st; row <= end; row++) {
        buffer_categ_counts[ x[ix_arr[row]] ]++;
    }

    /* flag categories as outliers if appropriate (two regimes: by proportion, or relative to majority) */
    if (!by_maj)
        find_outlier_categories(buffer_categ_counts, ncateg, tot, max_perc_outliers,
                                perc_threshold, buffer_categ_ix, buffer_categ_pct,
                                z_norm, buffer_outliers, &found_outliers,
                                &new_is_outlier, &cluster.perc_next_most_comm);
    else
        find_outlier_categories_by_maj(buffer_categ_counts, ncateg, tot, max_perc_outliers,
                                       prop_prior, z_outlier, buffer_outliers,
                                       &found_outliers, &new_is_outlier, &cluster.categ_maj);

    if (found_outliers) {
        for (size_t row = st; row <= end; row++) {
            if (buffer_outliers[ x[ix_arr[row]] ]) {

                /* follow usual rules for preferring this cluster over others:
                   unassigned score, or preferring non-NA branches, or (ties on NA-branch status)
                   shallower depth, or equal depth with the previous cluster being smaller */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                            &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (tot - tail_size)
                            )
                        )
                    )
                )
                {
                    if (!by_maj) {
                        /* score = observed proportion plus one binomial standard error */
                        pct_outl = (long double)buffer_categ_counts[ x[ix_arr[row]] ] / tot_dbl;
                        pct_outl = pct_outl + sqrt(pct_outl * (1 - pct_outl) / tot_dbl);
                        outlier_scores[ix_arr[row]] = pct_outl;
                    } else {
                        /* score = squared ratio of non-majority mass to this category's prior */
                        pct_outl = (long double)(tot - buffer_categ_counts[cluster.categ_maj]) / (tot_dbl * prop_prior[ x[ix_arr[row]] ]);
                        outlier_scores[ix_arr[row]] = square(pct_outl);
                    }
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }
                /* note: decremented even when a previous cluster is preferred for this row */
                sz_maj--;

            }
        }
        cluster.perc_in_subset = (long double)sz_maj / tot_dbl;
    }

    /* a category may be flaggable in principle (new_is_outlier) without any observation present */
    if (new_is_outlier && !found_outliers) {
        cluster.perc_in_subset = 1.0;
    }

    if (new_is_outlier || found_outliers) {
        *drop_cluster = false;
        cluster.cluster_size = sz_maj;
        cluster.subset_common.assign(buffer_outliers, buffer_outliers + ncateg);
        cluster.score_categ.resize(ncateg, 0);
        if (!by_maj) {

            for (size_t cat = 0; cat < ncateg; cat++) {
                if (cluster.subset_common[cat] > 0) {
                    /* category present and flagged: proportion plus one standard error */
                    pct_outl = (long double)buffer_categ_counts[cat] / tot_dbl;
                    cluster.score_categ[cat] = pct_outl + sqrt(pct_outl * (1 - pct_outl) / tot_dbl);
                } else if (cluster.subset_common[cat] < 0) {
                    /* category absent but flaggable: Laplace-style pseudo-count of 1 in (tot + 2) */
                    pct_outl = (long double)1 / (long double)(tot + 2);
                    cluster.score_categ[cat] = pct_outl + sqrt(pct_outl * (1 - pct_outl) / (long double)(tot + 2));
                }
            }

        } else {

            cluster.perc_in_subset = (long double) buffer_categ_counts[cluster.categ_maj] / tot_dbl;
            for (size_t cat = 0; cat < ncateg; cat++) {
                if (cat == cluster.categ_maj)
                    continue;
                if (cluster.subset_common[cat] != 0) {
                    /* smoothed (+1 / +2) version of the per-row score above */
                    cluster.score_categ[cat] = (long double)(tot - buffer_categ_counts[cluster.categ_maj] + 1)
                                                / ((long double)(tot + 2) * prop_prior[cat]);
                    cluster.score_categ[cat] = square(cluster.score_categ[cat]);
                }
            }

        }
    } else {
        *drop_cluster = true;
    }

    return found_outliers;
}
|
586
|
+
|
587
|
+
/* Convert in/not-in conditions to 'equals' or 'not equals' when they look for only 1 category
*
* Purely cosmetic simplification of cluster conditions so that printed rules read naturally:
* a subset condition that effectively selects (or excludes) a single category is rewritten as
* Equal / NotEqual with 'split_lev' holding the category, and the subset vector is emptied.
* Ordinal splits at either extreme level are rewritten the same way.
*/
void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
{

    int col_equal;          /* category that the condition reduces to; -1 = no simplification */
    size_t size_subset;      /* number of categories included in the subset */
    size_t size_subset_excl; /* number of categories marked as 'exclude' (negative entries) */
    for (size_t clust = 0; clust < clusters.size(); clust++) {
        if (clusters[clust].split_type == IsNa) continue;

        switch(clusters[clust].column_type) {

            case Categorical:
            {

                col_equal = -1;
                if (clusters[clust].split_subset.size() == 2) {

                    /* binary column: any in/not-in condition is equivalent to Equal on one of the
                       two categories.
                       NOTE(review): 'col_equal' is assigned the enum value of 'split_type' inside
                       the switch header; every reachable case overwrites it, so this matters only
                       if 'split_type' were something other than the three cases below — confirm
                       those are the only possible categorical split types here. */
                    switch(col_equal = clusters[clust].split_type) {
                        case InSubset:
                        {
                            col_equal = clusters[clust].split_subset[0]? 0 : 1;
                            break;
                        }

                        case NotInSubset:
                        {
                            /* "not in {c}" on a binary column means "equal to the other category" */
                            col_equal = clusters[clust].split_subset[0]? 1 : 0;
                            break;
                        }

                        case SingleCateg:
                        {
                            col_equal = clusters[clust].split_subset[0]? 0 : 1;
                            break;
                        }
                    }
                    clusters[clust].split_type = Equal;

                } else {

                    /* skip conditions that carry 'exclude' (negative) markers — those cannot be
                       reduced to a single Equal/NotEqual */
                    size_subset_excl = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
                                                       [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
                    if (size_subset_excl > 0) continue;
                    size_subset = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
                                                  [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
                    if (size_subset == 1) {

                        /* exactly one category included: find it */
                        do {col_equal++;} while (clusters[clust].split_subset[col_equal] <= 0);
                        if (clusters[clust].split_type == InSubset || clusters[clust].split_type == SingleCateg)
                            clusters[clust].split_type = Equal;
                        else
                            clusters[clust].split_type = NotEqual;

                    } else if (size_subset == (clusters[clust].split_subset.size() - 1)) {

                        /* all but one category included: find the excluded one and invert */
                        do {col_equal++;} while (clusters[clust].split_subset[col_equal] != 0);
                        if (clusters[clust].split_type == NotInSubset)
                            clusters[clust].split_type = Equal;
                        else
                            clusters[clust].split_type = NotEqual;

                    }

                }
                if (col_equal >= 0) {
                    /* condition was simplified: drop the subset and store the single category */
                    clusters[clust].split_subset.resize(0);
                    clusters[clust].split_lev = col_equal;
                }
                break;
            }


            case Ordinal:
            {

                /* "x <= 0" on an ordinal column is "x == 0" */
                if (clusters[clust].split_lev == 0) {

                    if (clusters[clust].split_type == LessOrEqual)
                        clusters[clust].split_type = Equal;
                    else
                        clusters[clust].split_type = NotEqual;

                }

                /* "x > ncat-2" is "x == ncat-1" (the highest level) */
                else if (clusters[clust].split_lev == (ncat_ord[clusters[clust].col_num] - 2)) {

                    clusters[clust].split_lev++;
                    if (clusters[clust].split_type == Greater)
                        clusters[clust].split_type = Equal;
                    else
                        clusters[clust].split_type = NotEqual;

                }
                break;
            }

        }

    }

}
|
689
|
+
|
690
|
+
/*
* Convert in/not-in conditions to 'equals' when they look for only 1 category
* Note: unlike in the case of clusters, trees do not store the split type, but rather
* always assume left is in/l.e. and right the opposite, so it's not possible to
* simplify ordinal splits to equals (as the tree will not distinguish between
* an ordinal split with equals and another with l.e./g.e.). Thus, this part needs
* to be done in the function that prints the outlier conditions.
*
* When a subset condition reduces to a single category, children may need to be swapped
* so that 'tree_left' always corresponds to the Equal branch and 'tree_right' to NotEqual.
* 'split_this_branch' is only meaningful when the parent uses 'all_branches' (follow-all mode).
*/
void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
{

    int col_equal;           /* category the condition reduces to; -1 = no simplification */
    size_t size_subset;      /* number of categories included in the subset */
    size_t size_subset_excl; /* number of 'exclude' (negative) markers in the subset */
    size_t temp_swap;        /* scratch for swapping left/right children */
    for (size_t tree = 0; tree < trees.size(); tree++) {

        /* skip disconnected leaf-like entries (no branches at all) */
        if (trees[tree].all_branches.size() == 0 && trees[tree].tree_left == 0 && trees[tree].tree_right == 0) continue;
        /* in follow-all mode, NA branches have nothing to simplify */
        if (trees[trees[tree].parent].all_branches.size() > 0 && trees[tree].split_this_branch == IsNa) continue;
        switch(trees[tree].column_type) {

            case Categorical:
            {
                /* conditions with 'exclude' markers cannot be reduced to Equal/NotEqual */
                size_subset_excl = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
                                                   [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
                if (size_subset_excl > 0) continue;

                col_equal = -1;
                if (trees[tree].split_subset.size() == 2) {

                    /* binary column: condition is always "equal to category 0" after possibly
                       swapping children so the left child is the category-0 branch */
                    col_equal = 0;
                    if (trees[tree].split_subset[0] == 0) {
                        temp_swap = trees[tree].tree_left;
                        trees[tree].tree_left = trees[tree].tree_right;
                        trees[tree].tree_right = temp_swap;
                    }
                    if (trees[tree].tree_left > 0)
                        trees[trees[tree].tree_left].parent_branch = Equal;
                    if (trees[tree].tree_right > 0)
                        trees[trees[tree].tree_right].parent_branch = NotEqual;

                    /* follow-all mode: also rewrite this node's own branch condition */
                    if (trees[trees[tree].parent].all_branches.size() > 0) {
                        switch(trees[tree].split_this_branch) {
                            case InSubset:
                            {
                                trees[tree].split_this_branch = Equal;
                                break;
                            }

                            case NotInSubset:
                            {
                                trees[tree].split_this_branch = NotEqual;
                                break;
                            }

                            case SingleCateg:
                            {
                                trees[tree].split_this_branch = Equal;
                                break;
                            }
                        }
                    }

                }

                else {

                    size_subset = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
                                                  [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
                    if (size_subset == 1) {

                        /* exactly one category included: locate it; left child stays the Equal side */
                        do {col_equal++;} while (trees[tree].split_subset[col_equal] <= 0);
                        if (trees[trees[tree].parent].all_branches.size() > 0) {
                            switch(trees[tree].split_this_branch) {
                                case InSubset:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }

                                case NotInSubset:
                                {
                                    trees[tree].split_this_branch = NotEqual;
                                    break;
                                }

                                case SingleCateg:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }
                            }
                        }


                    } else if (size_subset == (trees[tree].split_subset.size() - 1)) {

                        /* all but one category included: the excluded one becomes the Equal side,
                           so children must be swapped (left <-> right) */
                        do {col_equal++;} while (trees[tree].split_subset[col_equal] != 0);
                        temp_swap = trees[tree].tree_left;
                        trees[tree].tree_left = trees[tree].tree_right;
                        trees[tree].tree_right = temp_swap;
                        if (trees[trees[tree].parent].all_branches.size() > 0) {
                            /* branch meaning is inverted along with the children */
                            switch(trees[tree].split_this_branch) {
                                case InSubset:
                                {
                                    trees[tree].split_this_branch = NotEqual;
                                    break;
                                }

                                case NotInSubset:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }
                            }
                        }

                    }

                }

                if (col_equal >= 0) {
                    /* simplified: drop the subset, record the category, re-label children */
                    trees[tree].split_subset.resize(0);
                    trees[tree].split_lev = col_equal;
                    if (trees[tree].tree_left > 0)
                        trees[trees[tree].tree_left].parent_branch = Equal;
                    if (trees[tree].tree_right > 0)
                        trees[trees[tree].tree_right].parent_branch = NotEqual;

                }
                break;
            }


            case Ordinal:
            {
                /* ordinal simplification only applies in follow-all mode (see header comment) */
                if (trees[trees[tree].parent].all_branches.size() == 0) continue;

                /* "x <= 0" on an ordinal column is "x == 0" */
                if (trees[tree].split_lev == 0) {

                    if (trees[tree].split_this_branch == LessOrEqual)
                        trees[tree].split_this_branch = Equal;
                    else
                        trees[tree].split_this_branch = NotEqual;

                }

                /* "x > ncat-2" is "x == ncat-1" (the highest level) */
                else if (trees[tree].split_lev == (ncat_ord[trees[tree].col_num] - 2)) {

                    trees[tree].split_lev++;
                    if (trees[tree].split_this_branch == Greater)
                        trees[tree].split_this_branch = Equal;
                    else
                        trees[tree].split_this_branch = NotEqual;

                }
                break;
            }

        }

    }

}
|
854
|
+
|
855
|
+
#ifdef TEST_MODE_DEFINE
|
856
|
+
/*
|
857
|
+
* Goodie to help with testing and debugging (not used in the final code)
|
858
|
+
*
|
859
|
+
* This function tries to unconnect unnecessary trees so that, if a tree has no clusters and its children
|
860
|
+
* don't have any clusters either, such tree would not be reached at prediction time. It will drop trees from the vector
|
861
|
+
* if they happen to lie at the end of it, but otherwise will just leave them there so as not to have to recalculate
|
862
|
+
* the tree indexes and avoid having to update them everywhere they are referenced (such as in identified outliers).
|
863
|
+
*
|
864
|
+
* This is only for categorical and ordinal columns, as numerical columns will always produce produce clusters when
|
865
|
+
* they have children.
|
866
|
+
*
|
867
|
+
* This is supposed to be done with the conditions at the end of each recursive function, but this piece of
|
868
|
+
* code can provide help in identifying errors when the code is modified.
|
869
|
+
*/
|
870
|
+
void prune_unused_trees(std::vector<ClusterTree> &trees)
|
871
|
+
{
|
872
|
+
/* TODO: when using 'follow_all', function should delete instead of disconnect by setting to zero */
|
873
|
+
if (trees.size() == 0) return;
|
874
|
+
for (size_t t = trees.size() - 1; t >= 0; t--) {
|
875
|
+
|
876
|
+
if (trees[t].binary_branches.size() > 0) {
|
877
|
+
for (size_t br = 0; br < trees[t].binary_branches.size(); br++) {
|
878
|
+
if (trees[t].binary_branches[br] == 0) continue;
|
879
|
+
if (trees[t].binary_branches[br] >= trees.size()) trees[t].binary_branches[br] = 0;
|
880
|
+
if (check_tree_is_not_needed(trees[trees[t].binary_branches[br]])) trees[t].binary_branches[br] = 0;
|
881
|
+
}
|
882
|
+
}
|
883
|
+
|
884
|
+
if (trees[t].all_branches.size() > 0) {
|
885
|
+
for (size_t br = 0; br < trees[t].all_branches.size(); br++) {
|
886
|
+
if (trees[t].all_branches[br] == 0) continue;
|
887
|
+
if (trees[t].all_branches[br] >= trees.size()) trees[t].all_branches[br] = 0;
|
888
|
+
if (check_tree_is_not_needed(trees[trees[t].all_branches[br]])) trees[t].all_branches[br] = 0;
|
889
|
+
}
|
890
|
+
}
|
891
|
+
|
892
|
+
|
893
|
+
if (check_tree_is_not_needed(trees[t])) {
|
894
|
+
|
895
|
+
/* disconnect tree from parent */
|
896
|
+
switch(trees[t].parent_branch) {
|
897
|
+
case IsNa:
|
898
|
+
{
|
899
|
+
trees[trees[t].parent].tree_NA = 0;
|
900
|
+
break;
|
901
|
+
}
|
902
|
+
|
903
|
+
case LessOrEqual:
|
904
|
+
{
|
905
|
+
trees[trees[t].parent].tree_left = 0;
|
906
|
+
break;
|
907
|
+
}
|
908
|
+
|
909
|
+
case Greater:
|
910
|
+
{
|
911
|
+
trees[trees[t].parent].tree_right = 0;
|
912
|
+
break;
|
913
|
+
}
|
914
|
+
|
915
|
+
case InSubset:
|
916
|
+
{
|
917
|
+
trees[trees[t].parent].tree_left = 0;
|
918
|
+
break;
|
919
|
+
}
|
920
|
+
|
921
|
+
case NotInSubset:
|
922
|
+
{
|
923
|
+
trees[trees[t].parent].tree_right = 0;
|
924
|
+
break;
|
925
|
+
}
|
926
|
+
|
927
|
+
}
|
928
|
+
|
929
|
+
if (t == (trees.size() - 1)) trees.pop_back();
|
930
|
+
}
|
931
|
+
if (t == 0) break;
|
932
|
+
}
|
933
|
+
}
|
934
|
+
#endif
|
935
|
+
|
936
|
+
/* Check whether a tree has no clusters and no children with clusters either */
|
937
|
+
bool check_tree_is_not_needed(ClusterTree &tree)
|
938
|
+
{
|
939
|
+
return
|
940
|
+
tree.tree_NA == 0 && tree.tree_left == 0 && tree.tree_right == 0 &&
|
941
|
+
tree.clusters.size() == 0 &&
|
942
|
+
(tree.binary_branches.size() == 0 || *std::max_element(tree.binary_branches.begin(), tree.binary_branches.end()) == 0) &&
|
943
|
+
(tree.all_branches.size() == 0 || *std::max_element(tree.all_branches.begin(), tree.all_branches.end()) == 0)
|
944
|
+
;
|
945
|
+
}
|
946
|
+
|
947
|
+
/*
|
948
|
+
* These functions simply check what's the minimum/maximum value that could identify an observation
|
949
|
+
* as outlier in any cluster, or which categories could be possibly flagged as outliers in any cluster.
|
950
|
+
* This info is redundant, as outliers can be identified by following splits, but it can help speed up
|
951
|
+
* things at prediction time by not having to even bother checking a column if the value is within
|
952
|
+
* non-flaggable limits.
|
953
|
+
*/
|
954
|
+
void calculate_cluster_minimums(ModelOutputs &model_outputs, size_t col)
|
955
|
+
{
|
956
|
+
for (size_t cl = 0; cl < model_outputs.all_clusters[col].size(); cl++) {
|
957
|
+
model_outputs.min_outlier_any_cl[col] = fmax(model_outputs.min_outlier_any_cl[col], model_outputs.all_clusters[col][cl].lower_lim);
|
958
|
+
model_outputs.max_outlier_any_cl[col] = fmin(model_outputs.max_outlier_any_cl[col], model_outputs.all_clusters[col][cl].upper_lim);
|
959
|
+
}
|
960
|
+
|
961
|
+
}
|
962
|
+
|
963
|
+
/* Mark which categories of a categorical column could be flagged as outlier in at least one cluster.
   'col' indexes into all_clusters, 'col_rel' is the column's position among categorical columns. */
void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size_t col_rel)
{
    if (model_outputs.all_clusters[col].size() == 0) return;
    /* one flag per category, sized from the first cluster's subset vector */
    model_outputs.cat_outlier_any_cl[col_rel].resize(model_outputs.all_clusters[col][0].subset_common.size(), 0);
    for (const Cluster &cl : model_outputs.all_clusters[col]) {
        for (size_t cat = 0; cat < cl.subset_common.size(); cat++) {
            if (cl.subset_common[cat] != 0)
                model_outputs.cat_outlier_any_cl[col_rel][cat] = true;
        }
    }
}
|