outliertree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,328 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
10
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
11
+ * arXiv preprint arXiv:2001.00636 (2020).
12
+ *
13
+ *
14
+ * Copyright 2020 David Cortes.
15
+ *
16
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
+ * such as R or Python.
18
+ *
19
+ * This file is part of OutlierTree.
20
+ *
21
+ * OutlierTree is free software: you can redistribute it and/or modify
22
+ * it under the terms of the GNU General Public License as published by
23
+ * the Free Software Foundation, either version 3 of the License, or
24
+ * (at your option) any later version.
25
+ *
26
+ * OutlierTree is distributed in the hope that it will be useful,
27
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
+ * GNU General Public License for more details.
30
+ *
31
+ * You should have received a copy of the GNU General Public License
32
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
33
+ ********************************************************************************************************************/
34
+ #include "outlier_tree.hpp"
35
+
36
+
37
+ /* Check whether to consider any category as outlier, based on current counts and prior probabilities
38
+ *
39
+ * Function is to be applied to some subset of the data obtained by splitting by one or more columns.
40
+ * For outliers before any split there is a separate function. Note that since it requires the current
41
+ * probability to be lower than prior probability in order to consider as outlier, it cannot be
42
+ * used with the full data (only with subsets).
43
+ *
44
+ * Parameters:
45
+ * - categ_counts[ncateg] (in)
46
+ * Counts of each category in the subset (including non-present categories).
47
+ * - ncateg (in)
48
+ * Number of categories for this column (including non-present categories).
49
+ * - tot (in)
50
+ * Number of rows in the subset.
51
+ * - max_perc_outliers (in)
52
+ * Model parameter. Default value is 0.01.
53
+ * - perc_threshold[ncateg] (in)
54
+ * Threshold for the proportion/probability of each category below which it can be considered
55
+ * to be an outlier in a subset of the data. Note that in addition it will build a confidence
56
+ * interval here which might make it even smaller.
57
+ * - buffer_ix[ncateg] (temp)
58
+ * Buffer where to store indices of categories sorted by proportion.
59
+ * - buffer_perc[ncateg] (temp)
60
+ * Buffer where to store proportions of counts.
61
+ * - z_norm (in)
62
+ * Model parameter. Default value is 2.67.
63
+ * - is_outlier[ncateg] (out)
64
+ * Array where to define whether any category is an outlier. Values will be as follows:
65
+ * (-1) -> Category had zero count, but would be an outlier if it appeared among this group
66
+ * 0 -> Category is not an outlier
67
+ * (+1) -> Category is an outlier
68
+ * - found_outliers (out)
69
+ * Whether there were any outliers identified among the counts.
70
+ * - new_is_outlier (out)
71
+ * Whether any of the categories with zero count would be flagged as outlier if they appeared in this group.
72
+ * - next_most_comm (out)
73
+ * Proportion of the least common category that is not flagged as outlier.
74
+ */
75
+ void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
76
+ long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
77
+ double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier,
78
+ double *next_most_comm)
79
+ {
80
+ //TODO: must also establish bounds for new, unseen categories
81
+
82
+ /* initialize parameters as needed */
83
+ *found_outliers = false;
84
+ *new_is_outlier = false;
85
+ size_t st_non_zero = 0;
86
+ size_t end_tail = 0;
87
+ size_t max_outliers = (size_t) calculate_max_cat_outliers((long double)tot, max_perc_outliers, z_norm);
88
+ long double tot_dbl = (long double) tot;
89
+ long double pct_unseen = (long double)1 / (long double)(tot + 1);
90
+ size_t size_tail = 0;
91
+
92
+ /* reset the temporary arrays and fill them */
93
+ memset(is_outlier, 0, ncateg * sizeof(char));
94
+ for (size_t cat = 0; cat < ncateg; cat++) {
95
+ buffer_ix[cat] = cat;
96
+ buffer_perc[cat] = (categ_counts[cat] > 0)? ((long double)categ_counts[cat] / tot_dbl) : 0;
97
+ }
98
+
99
+ /* sort the categories by counts */
100
+ std::sort(buffer_ix, buffer_ix + ncateg,
101
+ [&categ_counts](const size_t a, const size_t b){return categ_counts[a] < categ_counts[b];});
102
+
103
+ /* find the first non-zero */
104
+ for (size_t cat = 0; cat < ncateg; cat++) {
105
+ if (categ_counts[ buffer_ix[cat] ] > 0) {
106
+ st_non_zero = cat;
107
+ break;
108
+ }
109
+ }
110
+
111
+ /* check that least common is not common enough to be normal */
112
+ if (categ_counts[ buffer_ix[st_non_zero] ] > max_outliers) return;
113
+
114
+ /* find tail among non-zero proportions
115
+ * a tail is considered to be so if:
116
+ * - the difference is above z_norm sd's of either proportion
117
+ * - the difference is greater than some fraction of the larger
118
+ * - the actual proportion here is lower than a CI of the prior proportion
119
+ * - the actual proportion here is half or less of the prior proportion
120
+ */
121
+ for (size_t cat = st_non_zero; cat < ncateg - 1; cat++) {
122
+ if (
123
+ (
124
+ (buffer_perc[buffer_ix[cat + 1]] - buffer_perc[buffer_ix[cat]])
125
+ >
126
+ z_norm * sqrtl(
127
+ fmaxl(
128
+ buffer_perc[buffer_ix[cat + 1]] * ((long double)1 - buffer_perc[buffer_ix[cat + 1]]),
129
+ buffer_perc[buffer_ix[cat]] * ((long double)1 - buffer_perc[buffer_ix[cat]])
130
+ )
131
+ / tot_dbl
132
+ )
133
+ )
134
+ &&
135
+ (
136
+ buffer_perc[buffer_ix[cat + 1]] * 0.5 > buffer_perc[buffer_ix[cat]]
137
+ )
138
+ )
139
+ {
140
+ end_tail = cat;
141
+ *next_most_comm = buffer_perc[buffer_ix[cat + 1]];
142
+ break;
143
+ }
144
+ }
145
+
146
+ /* if the tail is too long, don't identify any as outlier, but see if unseen categories (with prior > 0) would create a new tail */
147
+ for (size_t cat = st_non_zero; cat <= end_tail; cat++) size_tail += categ_counts[ buffer_ix[cat] ];
148
+
149
+ if (size_tail >= max_outliers) {
150
+
151
+ if (
152
+ st_non_zero == 0 ||
153
+ // ((long double)buffer_ix[buffer_ix[st_non_zero]] / (tot_dbl + 1)) * 0.5 <= pct_unseen ||
154
+ ( ((long double)buffer_ix[buffer_ix[st_non_zero]] * 0.5) / (tot_dbl + 1)) <= pct_unseen ||
155
+ ((long double)(buffer_ix[buffer_ix[st_non_zero]] - 1) / (tot_dbl + 1))
156
+ - (long double)z_norm * sqrtl(buffer_perc[buffer_ix[st_non_zero]] * ((long double)1 - buffer_perc[buffer_ix[st_non_zero]]) / tot_dbl)
157
+ >= pct_unseen
158
+ ) return;
159
+
160
+ for (size_t cat = 0; cat < st_non_zero; cat++) {
161
+ if (perc_threshold[buffer_ix[cat]] > pct_unseen) {
162
+ *new_is_outlier = true;
163
+ is_outlier[buffer_ix[cat]] = -1;
164
+ }
165
+ }
166
+ *next_most_comm = buffer_perc[buffer_ix[st_non_zero]];
167
+ return;
168
+
169
+ }
170
+
171
+ /* now determine if any category in the tail is an outlier */
172
+ for (size_t cat = st_non_zero; cat <= end_tail; cat++) {
173
+
174
+ /* must have a proportion below CI and below half of prior */
175
+ if (buffer_perc[buffer_ix[cat]] < perc_threshold[buffer_ix[cat]]) {
176
+ is_outlier[buffer_ix[cat]] = 1;
177
+ *found_outliers = true;
178
+ }
179
+ }
180
+
181
+ /* check if any new categories would be outliers */
182
+ if (st_non_zero > 0) {
183
+ for (size_t cat = 0; cat < st_non_zero; cat++) {
184
+ if (perc_threshold[buffer_ix[cat]] > pct_unseen) {
185
+ *new_is_outlier = true;
186
+ is_outlier[buffer_ix[cat]] = -1;
187
+ }
188
+ }
189
+ }
190
+ if (*new_is_outlier && !(*found_outliers)) {
191
+ *next_most_comm = buffer_perc[buffer_ix[st_non_zero]];
192
+ }
193
+
194
+ }
195
+
196
+ /* Check whether to consider any category as outlier, based on majority category and prior probabilities
197
+ *
198
+ * Function is to be applied to some subset of the data obtained by splitting by one or more columns.
199
+ * For outliers before any split there is a separate function. This is an alternative to the "tail"
200
+ * approach above which is more in line with GritBot.
201
+ *
202
+ * Parameters:
203
+ * - categ_counts[ncateg] (in)
204
+ * Counts of each category in the subset (including non-present categories).
205
+ * - ncateg (in)
206
+ * Number of categories for this column (including non-present categories).
207
+ * - tot (in)
208
+ * Number of rows in the subset.
209
+ * - max_perc_outliers (in)
210
+ * Model parameter. Default value is 0.01.
211
+ * - prior_prob[ncateg] (in)
212
+ * Proportions that each category had in the full data.
213
+ * - z_outlier (in)
214
+ * Model parameter. Default value is 8.0
215
+ * - is_outlier[ncateg] (out)
216
+ * Array where to define whether any category is an outlier. Values will be as follows:
217
+ * (-1) -> Category had zero count, but would be an outlier if it appeared among this group
218
+ * 0 -> Category is not an outlier
219
+ * (+1) -> Category is an outlier
220
+ * - found_outliers (out)
221
+ * Whether there were any outliers identified among the counts.
222
+ * - new_is_outlier (out)
223
+ * Whether any of the categories with zero count would be flagged as outlier if they appeared in this group.
224
+ * - categ_maj (out)
225
+ * Category to which the majority of the observations belong.
226
+ */
227
+ void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
228
+ long double prior_prob[], double z_outlier, char is_outlier[],
229
+ bool *found_outliers, bool *new_is_outlier, int *categ_maj)
230
+ {
231
+ /* initialize parameters as needed */
232
+ *found_outliers = false;
233
+ *new_is_outlier = false;
234
+ memset(is_outlier, 0, ncateg * sizeof(char));
235
+ size_t max_outliers = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
236
+ long double tot_dbl = (long double) (tot + 1);
237
+ size_t n_non_maj;
238
+ long double thr_prop = (double)1 / square(z_outlier);
239
+
240
+ /* check if any can be considered as outlier */
241
+ size_t *ptr_maj = std::max_element(categ_counts, categ_counts + ncateg);
242
+ *categ_maj = (int)(ptr_maj - categ_counts);
243
+ n_non_maj = tot - *ptr_maj;
244
+ if (n_non_maj > max_outliers)
245
+ return;
246
+
247
+ /* determine proportions and check for outlierness */
248
+ long double n_non_maj_dbl = (long double) n_non_maj;
249
+ for (size_t cat = 0; cat < ncateg; cat++) {
250
+
251
+ if ((int)cat == *categ_maj) continue;
252
+
253
+ if ( (n_non_maj_dbl / (tot_dbl * prior_prob[cat])) < thr_prop ) {
254
+ if (categ_counts[cat]) {
255
+ is_outlier[cat] = 1;
256
+ *found_outliers = true;
257
+ } else {
258
+ is_outlier[cat] = -1;
259
+ *new_is_outlier = true;
260
+ }
261
+ }
262
+ }
263
+
264
+ /* TODO: implement formula for flagging unsen categories (not in the sample, nor the full data) as outliers */
265
+ }
266
+
267
+
268
+ /* Check whether to consider any category as outlier before splitting, based on prior counts
269
+ *
270
+ * Follows very rough criteria: there can be at most 1-3 outliers depending on size of dataset,
271
+ * and the next most common category must have a count of at least 250.
272
+ *
273
+ * Parameters:
274
+ * - categ_counts[ncateg] (in)
275
+ * Frequencies of each category in the full data.
276
+ * - ncateg (in)
277
+ * Number of categories with non-zero count.
278
+ * - tot (in)
279
+ * Number of rows.
280
+ * - is_outlier[ncateg] (out)
281
+ * Array indicating whether any category is outlier (0 = non-outlier, 1 = outlier).
282
+ * - next_most_comm (out)
283
+ * Proportion of the least common non-outlier category.
284
+ */
285
/* Returns whether a single rare category was flagged as outlier in 'is_outlier',
   and if so writes the proportion of the least common non-outlier category
   into 'next_most_comm'. See the block comment above for parameter details. */
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
                                     char is_outlier[], double *next_most_comm)
{
    /* if sample is too small, don't flag any as outliers */
    if (tot < 1000) return false;

    /* set a very low outlier threshold with a hard limit of 3 */
    size_t max_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);

    /* will only consider a category as outlier if the next most common is very common */
    size_t min_freq_next_most_comm = 250;

    /* look if there's any category meeting the first condition and none meeting the second one */
    bool has_outlier_cat = false;
    memset(is_outlier, 0, sizeof(char) * ncateg);
    for (size_t cat = 0; cat < ncateg; cat++) {

        /* a mid-frequency category (not rare enough to be an outlier, yet not
           common enough to support one) disqualifies the whole column */
        if (categ_counts[cat] > max_outliers && categ_counts[cat] < min_freq_next_most_comm) {
            has_outlier_cat = false;
            break;
        }

        if (categ_counts[cat] > 0 && categ_counts[cat] <= max_outliers) {
            /* can only have 1 outlier category in the whole column */
            if (has_outlier_cat) { has_outlier_cat = false; break; }

            has_outlier_cat = true;
            is_outlier[cat] = 1;
        }

    }

    /* FIX: if the column was disqualified after a category had already been flagged
       (second rare category, or a mid-frequency category found later in the scan),
       the stale flag must not be left behind in the output array */
    if (!has_outlier_cat) {
        memset(is_outlier, 0, sizeof(char) * ncateg);
        return false;
    }

    /* outlier found: record the next most common frequency for printed statistics.
       FIX: the minimum-scan seed was INT_MAX, which is the wrong limit for size_t
       counts; use SIZE_MAX so categories with counts above INT_MAX are handled */
    size_t next_most_comm_cat = SIZE_MAX;
    for (size_t cat = 0; cat < ncateg; cat++) {
        if (categ_counts[cat] > 0 && !is_outlier[cat]) {
            next_most_comm_cat = std::min(next_most_comm_cat, categ_counts[cat]);
        }
    }
    *next_most_comm = (long double)next_most_comm_cat / (long double)tot;

    return true;
}
@@ -0,0 +1,972 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
10
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
11
+ * arXiv preprint arXiv:2001.00636 (2020).
12
+ *
13
+ *
14
+ * Copyright 2020 David Cortes.
15
+ *
16
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
+ * such as R or Python.
18
+ *
19
+ * This file is part of OutlierTree.
20
+ *
21
+ * OutlierTree is free software: you can redistribute it and/or modify
22
+ * it under the terms of the GNU General Public License as published by
23
+ * the Free Software Foundation, either version 3 of the License, or
24
+ * (at your option) any later version.
25
+ *
26
+ * OutlierTree is distributed in the hope that it will be useful,
27
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
+ * GNU General Public License for more details.
30
+ *
31
+ * You should have received a copy of the GNU General Public License
32
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
33
+ ********************************************************************************************************************/
34
+ #include "outlier_tree.hpp"
35
+
36
+
37
+
38
+ /* Characterize a homogenous 1-dimensional cluster
39
+ *
40
+ * Calculates limits and display statistics on the distribution of one numerical variable,
41
+ * flagging potential outliers if found. Can be run on the full data or on subsets obtained from splitting
42
+ * by other variables.
43
+ *
44
+ * In order to flag an observation as outlier, it must:
45
+ * * Be in a very small/large percentile of the subset passed here.
46
+ * * Have a large absolute Z value (standardized and centered).
47
+ * * Have a large gap in the Z value with respect to the next largest/smallest observation.
48
+ * * Not be in a long tail (unless the variable was transformed by exponentiating or taking logarithm).
49
+ *
50
+ * Parameters:
51
+ * - x[n] (in)
52
+ * Variable for which to define the cluster.
53
+ * - ix_arr[n] (in)
54
+ * Indices to take from the array above.
55
+ * - st (in)
56
+ * Position at which ix_arr starts (inclusive).
57
+ * - end (in)
58
+ * Position at which ix_arr ends (inclusive).
59
+ * - outlier_scores[n] (in, out)
60
+ * Outlier scores (based on chebyshyov's inequality) that are already assigned to the observations from this column
61
+ * from previous runs of this function in larger subsets (should be started to 1).
62
+ * - outlier_clusters[n] (in, out)
63
+ * Cluster number under which an observation is the most anomalous.
64
+ * - outlier_trees[n] (in, out)
65
+ * Tree under which the outlier cluster assigned lies.
66
+ * - outlier_depth[n] (in, out)
67
+ * Tree depth at which the outlier cluster assigned is found.
68
+ * - cluster (in, out)
69
+ * Outlier cluster object with statistics and limits.
70
+ * - clusters (in)
71
+ * Vector containing all cluster already generated.
72
+ * - cluster_num (in)
73
+ * Number to give to this cluster.
74
+ * - tree_num (in)
75
+ * Number of the tree under which this cluster is to be found.
76
+ * - tree_depth (in)
77
+ * Distance from the tree root at which this tree is to be found.
78
+ * - is_log_transf (in)
79
+ * Whether the column 'x' has undergone a logarithmic transformation.
80
+ * - log_minval (in)
81
+ * Value that was added to 'x' before taking its logarithm (if it was log-transformed).
82
+ * - is_exp_transf (in)
83
+ * Whether the column 'x' has undergone an exponential transformation on its standardized values.
84
+ * - orig_mean (in)
85
+ * Mean of the variable 'x' before being standardized (if it was exponentiated).
86
+ * - orig_sd (in)
87
+ * Standard deviation of the variable 'x' before being standardized (if it was exponentiated).
88
+ * - left_tail (in)
89
+ * Value of 'x' after which it is considered a long tail, in which outliers will not be searched for.
90
+ * - right_tail (in)
91
+ * Value of 'x' before which it is considered a long tail, in which outliers will not be searched for.
92
+ * - orig_x (in)
93
+ * Original values of 'x' if it was transformed (log or exp).
94
+ * - max_perc_outliers (in)
95
+ * Model parameter. Default is 0.01.
96
+ * - z_norm (in)
97
+ * Model parameter. Default is 2.67.
98
+ * - z_outlier (in)
99
+ * Model parameter. Default is 8.0. Must be greater than z_norm.
100
+ *
101
+ * Returns:
102
+ * - Whether there were any outliers detected.
103
+ */
104
bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
                              double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                              size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
                              size_t cluster_num, size_t tree_num, size_t tree_depth,
                              bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
                              double left_tail, double right_tail, double *restrict orig_x,
                              double max_perc_outliers, double z_norm, double z_outlier)
{

    /* TODO: this function could try to determine if the distribution is multimodal, and if so,
       take only the most extreme means/sd for outlier comparisons */

    /* TODO: statistics like SD, mean; are already available from the splitting function which
       is called right before this, so these should *only* need to be recalculated them if the column
       has undergone log or exp transform */

    /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
    bool has_low_values = false;   /* set if the minimum qualifies as outlier */
    bool has_high_values = false;  /* set if the maximum qualifies as outlier */
    long double running_mean = 0;
    long double mean_prev = 0;
    long double running_ssq = 0;   /* running sum of squared deviations (Welford-style update below) */
    double xval;
    double mean;
    double sd;
    size_t cnt;
    /* number of observations at each extreme to exclude when computing mean/sd */
    size_t tail_size = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
    size_t st_non_tail = st + tail_size;
    size_t end_non_tail = end - tail_size;
    size_t st_normals = 0;   /* first index (in sorted order) NOT flagged as low outlier; 0 = none found */
    size_t end_normals = 0;  /* last index NOT flagged as high outlier; 0 = none found */
    /* minimum jump in z-score between consecutive sorted values for a gap to separate outliers */
    double min_gap = z_outlier - z_norm;

    /* TODO: here it's not necessary to sort the whole data, only top/bottom N */

    /* sort the data (indirectly, through the index array) */
    std::sort(ix_arr + st, ix_arr + end + 1, [&x](const size_t a, const size_t b){return x[a] < x[b];});

    /* calculate statistics with tails and previous outliers excluded,
       using a numerically-stable single-pass mean/ssq update */
    cnt = end_non_tail - st_non_tail + 1;
    for (size_t row = st_non_tail; row <= end_non_tail; row++) {
        xval = x[ ix_arr[row] ];
        running_mean += (xval - running_mean) / (long double)(row - st_non_tail + 1);
        running_ssq += (xval - running_mean) * (xval - mean_prev);
        mean_prev = running_mean;

    }
    mean = (double) running_mean;
    sd = (double) sqrtl(running_ssq / (long double)(cnt - 1));

    /* adjust SD heuristically to account for reduced size, by (N + tail)/(N-tail) --- note that cnt = N-2*tail */
    sd *= (long double)(cnt + 3 * tail_size) / (long double)(cnt + tail_size);
    /* re-adjust if there's a one-sided tail and no transformation was applied */
    if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
        sd *= 0.5;
    }
    cluster.cluster_mean = mean;
    cluster.cluster_sd = sd;
    cnt = end - st + 1;   /* from here on, cnt is the full subset size */

    /* see if the minimum and/or maximum values qualify for outliers
       (must be beyond z_outlier SDs and not inside a declared long tail) */
    if (-z_score(x[ix_arr[st]], mean, sd) >= z_outlier && x[ix_arr[st]] > left_tail) has_low_values = true;
    if ( z_score(x[ix_arr[end]], mean, sd) >= z_outlier && x[ix_arr[end]] < right_tail) has_high_values = true;

    /* look for a large gap in the z-scores on the low end */
    if (has_low_values) {
        for (size_t row = st; row < st + tail_size; row++) {

            if (( z_score(x[ix_arr[row + 1]], mean, sd) - z_score(x[ix_arr[row]], mean, sd) ) >= min_gap) {
                st_normals = row + 1;
                /* the limit is expressed back in the original (untransformed) scale */
                if (is_exp_transf) {
                    cluster.lower_lim = log(x[ix_arr[row + 1]] - min_gap * sd) * orig_sd + orig_mean;
                } else if (is_log_transf) {
                    cluster.lower_lim = exp(x[ix_arr[row + 1]] - min_gap * sd) + log_minval;
                } else {
                    cluster.lower_lim = x[ix_arr[row + 1]] - min_gap * sd;
                }
                cluster.display_lim_low = orig_x[ix_arr[row + 1]];
                cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
                break;
            }
            /* stop scanning once values are no longer extreme enough to be outliers */
            if (z_score(x[ix_arr[row]], mean, sd) > -z_outlier) break;

        }
        if (st_normals == 0) {
            /* no separating gap found within the allowed tail -> no low outliers */
            has_low_values = false;
        } else {
            for (size_t row = st; row < st_normals; row++) {

                /* assign outlier if it's a better cluster than previously assigned:
                   prefer unscored rows, then non-NA-branch clusters, then shallower
                   trees, then (at equal depth) larger clusters */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                        &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (cnt - 2 * tail_size)
                            )
                        )
                    )
                )
                {
                    outlier_scores[ix_arr[row]] = chebyshyov_bound(z_score(x[ix_arr[row]], mean, sd));
                    if (is_na_or_inf(outlier_scores[ix_arr[row]])) outlier_scores[ix_arr[row]] = 0;
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }

            }
        }
    }
    /* no low outliers: the lower limit covers everything down to the bound (or -inf in a tail) */
    if (!has_low_values) {
        cluster.perc_above = 1.0;
        if (!is_log_transf && !is_exp_transf) {

            if (isinf(left_tail)) {
                cluster.lower_lim = x[ix_arr[st]] - min_gap * sd;
            } else {
                cluster.lower_lim = -HUGE_VAL;
            }

        } else if (is_exp_transf) {
            cluster.lower_lim = log(x[ix_arr[st]] - min_gap * sd) * orig_sd + orig_mean;
        } else {
            cluster.lower_lim = exp(x[ix_arr[st]] - min_gap * sd) + log_minval;
        }

        cluster.display_lim_low = orig_x[ix_arr[st]];

    }

    /* mirror of the low-end logic, applied to the high end */
    if (has_high_values) {
        for (size_t row = end; row > (end - tail_size); row--) {

            if (( z_score(x[ix_arr[row]], mean, sd) - z_score(x[ix_arr[row - 1]], mean, sd) ) >= min_gap) {
                end_normals = row - 1;
                if (is_exp_transf) {
                    cluster.upper_lim = log(x[ix_arr[row - 1]] + min_gap * sd) * orig_sd + orig_mean;
                } else if (is_log_transf) {
                    cluster.upper_lim = exp(x[ix_arr[row - 1]] + min_gap * sd) + log_minval;
                } else {
                    cluster.upper_lim = x[ix_arr[row - 1]] + min_gap * sd;
                }
                cluster.display_lim_high = orig_x[ix_arr[row - 1]];
                cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
                break;
            }
            if (z_score(x[ix_arr[row]], mean, sd) < z_outlier) break;

        }
        if (end_normals == 0) {
            has_high_values = false;
        } else {
            for (size_t row = end; row > end_normals; row--) {

                /* assign outlier if it's a better cluster than previously assigned - Note that it might produce slight mismatches
                   against the predict function (the latter is more trustable) due to the size of the cluster not yet being known
                   at the moment of determining whether to overwrite previous in here */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                        &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (cnt - 2 * tail_size)
                            )
                        )
                    )
                )
                {
                    outlier_scores[ix_arr[row]] = chebyshyov_bound(z_score(x[ix_arr[row]], mean, sd));
                    if (is_na_or_inf(outlier_scores[ix_arr[row]])) outlier_scores[ix_arr[row]] = 0;
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }

            }
        }
    }
    if (!has_high_values) {
        cluster.perc_below = 1.0;
        if (!is_log_transf && !is_exp_transf) {

            if (isinf(right_tail)) {
                cluster.upper_lim = x[ix_arr[end]] + min_gap * sd;
            } else {
                cluster.upper_lim = HUGE_VAL;
            }
        } else if (is_exp_transf) {
            cluster.upper_lim = log(x[ix_arr[end]] + min_gap * sd) * orig_sd + orig_mean;
        } else {
            cluster.upper_lim = exp(x[ix_arr[end]] + min_gap * sd) + log_minval;
        }

        cluster.display_lim_high = orig_x[ix_arr[end]];
    }

    /* save displayed statistics for cluster: recompute mean/sd on the original
       scale over the non-outlier range when outliers or transforms are present */
    if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
        size_t st_disp = has_low_values? st_normals : st;
        size_t end_disp = has_high_values? end_normals : end;
        running_mean = 0;
        mean_prev = 0;
        running_ssq = 0;
        for (size_t row = st_disp; row <= end_disp; row++) {
            xval = orig_x[ix_arr[row]];
            running_mean += (xval - running_mean) / (long double)(row - st_disp + 1);
            running_ssq += (xval - running_mean) * (xval - mean_prev);
            mean_prev = running_mean;
        }
        cluster.cluster_size = end_disp - st_disp + 1;
        cluster.display_mean = (double) running_mean;
        cluster.display_sd = (double) sqrtl(running_ssq / (long double)(cluster.cluster_size - 1));
    } else {
        cluster.display_mean = cluster.cluster_mean;
        cluster.display_sd = cluster.cluster_sd;
        cluster.cluster_size = end - st + 1;
    }

    /* report whether outliers were found or not */
    return has_low_values || has_high_values;
}
336
+
337
+
338
+ /* Characterize a homogeneous categorical cluster from the *full* data
339
+ *
340
+ * Function is meant for the data as it comes, before splitting it, as once split, it will
341
+ * not be able to detect these outliers. As such, it takes fewer parameters, since it can only
342
+ * be the first tree and cluster in a column. It assumes the outliers have already been identified.
343
+ *
344
+ * Parameters:
345
+ * - x[n]
346
+ * Array indicating the category to which each observation belongs.
347
+ * - ix_arr[n] (in)
348
+ * Indices to take from the array above.
349
+ * - st (in)
350
+ * Position at which ix_arr starts (inclusive).
351
+ * - end (in)
352
+ * Position at which ix_arr ends (inclusive).
353
+ * - ncateg (in)
354
+ * Number of categories in this column.
355
+ * - outlier_scores[n] (in, out)
356
+ * Array where to assign outlier scores (based on proportion) to each observation belonging to an outlier category.
357
+ * - outlier_clusters[n] (in, out)
358
+ * Array where to assign cluster number to each observation belonging to an outlier category.
359
+ * - outlier_trees[n] (in, out)
360
+ * Array where to assign tree number to each observation belonging to an outlier category.
361
+ * - outlier_depth[n] (in, out)
362
+ * Array where to assign tree depth to each observation belonging to an outlier category.
363
+ * - cluster (in, out)
364
+ * Outlier cluster object with statistics and classifications.
365
+ * - categ_counts[ncateg] (in)
366
+ * Array with the frequencies of each category in the data.
367
+ * - is_outlier[ncateg] (in)
368
+ * Array indicating which categories are to be considered as outliers (must be already calculated).
369
+ * - perc_next_most_comm (in)
370
+ * Proportion of the least common non-outlier category (must be already calculated).
371
+ */
372
+ void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
373
+ double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
374
+ size_t *restrict outlier_depth, Cluster &cluster,
375
+ size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm)
376
+ {
377
+ size_t cnt_common = end - st + 1;
378
+ cluster.cluster_size = cnt_common;
379
+ double pct_outl;
380
+ cluster.subset_common.assign(is_outlier, is_outlier + ncateg);
381
+ cluster.score_categ.resize(ncateg, 0);
382
+
383
+
384
+ for (size_t row = st; row <= end; row++) {
385
+ if (is_outlier[x[ix_arr[row]]]) {
386
+ cnt_common--;
387
+ pct_outl = (long double)categ_counts[ x[ix_arr[row]] ] / (long double)cluster.cluster_size;
388
+ pct_outl = pct_outl + sqrt(pct_outl * (1 - pct_outl) / (long double)cluster.cluster_size);
389
+ cluster.score_categ[ x[ix_arr[row]] ] = pct_outl;
390
+ outlier_scores[ix_arr[row]] = pct_outl;
391
+ outlier_clusters[ix_arr[row]] = 0;
392
+ outlier_trees[ix_arr[row]] = 0;
393
+ outlier_depth[ix_arr[row]] = 0;
394
+ }
395
+ }
396
+ cluster.perc_in_subset = (long double)cnt_common / (long double)cluster.cluster_size;
397
+ cluster.perc_next_most_comm = perc_next_most_comm;
398
+ }
399
+
400
+
401
/* Characterize a homogeneous categorical cluster from a subset of the data, or report if it's not homogeneous
 *
 * Function is meant to be called with subsets of the data only. Will calculate the counts inside it.
 * In order to consider a category as outlier, it must:
 * * Have a proportion smaller than its prior probability and than a confidence interval of its prior.
 * * Have a large gap with respect to the next most-common category.
 * * Be in a cluster in which few or no observations belong to a category meeting such conditions.
 * It's oftentimes not possible to create a cluster with category frequencies that would produce outliers,
 * in which case it will report whether the cluster should be dropped.
 *
 * Parameters:
 * - x[n]
 *       Array indicating the category to which each observation belongs.
 * - ix_arr[n] (in)
 *       Indices to take from the array above.
 * - st (in)
 *       Position at which ix_arr starts (inclusive).
 * - end (in)
 *       Position at which ix_arr ends (inclusive).
 * - ncateg (in)
 *       Number of categories in this column.
 * - by_maj (in)
 *       Model parameter. Default is 'false'. Indicates whether to detect outliers according to the number of non-majority
 *       observations compared to the expected number for each category.
 * - outlier_scores[n] (in, out)
 *       Outlier scores (based on observed category proportion) that are already assigned to the observations from this column
 *       from previous runs of this function in larger subsets (should be started to 1).
 * - outlier_clusters[n] (in, out)
 *       Cluster number under which an observation is the most anomalous.
 * - outlier_trees[n] (in, out)
 *       Tree under which the outlier cluster assigned lies.
 * - outlier_depth[n] (in, out)
 *       Tree depth at which the outlier cluster assigned is found.
 * - cluster (in, out)
 *       Outlier cluster object with statistics and limits.
 * - clusters (in)
 *       Vector containing all clusters already generated.
 * - cluster_num (in)
 *       Number to give to this cluster.
 * - tree_num (in)
 *       Number of the tree under which this cluster is to be found.
 * - tree_depth (in)
 *       Distance from the tree root at which this tree is to be found.
 * - max_perc_outliers (in)
 *       Model parameter. Default is 0.01.
 * - z_norm (in)
 *       Model parameter. Default is 2.67.
 * - z_outlier (in)
 *       Model parameter. Default is 8.0.
 * - perc_threshold[ncateg] (in)
 *       Observed proportion below which a category can be considered as outlier.
 * - prop_prior[ncateg] (in)
 *       Prior probability of each category in the full data (only used when passing 'by_maj' = 'true').
 * - buffer_categ_counts[ncateg] (temp)
 *       Buffer where to save the observed frequencies of each category.
 * - buffer_categ_pct[ncateg] (temp)
 *       Buffer where to save the observed proportion of each category.
 * - buffer_categ_ix[ncateg] (temp)
 *       Buffer where to save the category numbers sorted by proportion.
 * - buffer_outliers[ncateg] (temp)
 *       Buffer where to save the results of which categories are flagged as outliers
 *       before copying it to the cluster (will not copy if none is flagged).
 * - drop_cluster (out)
 *       Whether the cluster should be dropped (i.e. it was not possible to flag any present
 *       or non-present category as outlier).
 *
 * Returns:
 * - Whether it identified any outliers or not.
 */
bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
                          double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                          size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
                          size_t cluster_num, size_t tree_num, size_t tree_depth,
                          double max_perc_outliers, double z_norm, double z_outlier,
                          long double *restrict perc_threshold, long double *restrict prop_prior,
                          size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
                          size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
                          bool *restrict drop_cluster)
{
    bool found_outliers, new_is_outlier;
    size_t tot = end - st + 1;
    size_t sz_maj = tot;
    long double tot_dbl = (long double) tot;
    /* maximum number of observations that may be flagged in this cluster */
    size_t tail_size = (size_t) calculate_max_outliers(tot_dbl, max_perc_outliers);
    cluster.perc_in_subset = 1;
    double pct_outl;

    /* calculate category counts */
    memset(buffer_categ_counts, 0, ncateg * sizeof(size_t));
    for (size_t row = st; row <= end; row++) {
        buffer_categ_counts[ x[ix_arr[row]] ]++;
    }

    /* flag categories as outliers if appropriate (the two criteria are mutually exclusive) */
    if (!by_maj)
        find_outlier_categories(buffer_categ_counts, ncateg, tot, max_perc_outliers,
                                perc_threshold, buffer_categ_ix, buffer_categ_pct,
                                z_norm, buffer_outliers, &found_outliers,
                                &new_is_outlier, &cluster.perc_next_most_comm);
    else
        find_outlier_categories_by_maj(buffer_categ_counts, ncateg, tot, max_perc_outliers,
                                       prop_prior, z_outlier, buffer_outliers,
                                       &found_outliers, &new_is_outlier, &cluster.categ_maj);

    if (found_outliers) {
        for (size_t row = st; row <= end; row++) {
            if (buffer_outliers[ x[ix_arr[row]] ]) {

                /* follow usual rules for preferring this cluster over others:
                   not yet flagged (score >= 1), or a non-NA-branch cluster over an NA-branch one,
                   or same NA-branch status but shallower tree, or same depth but smaller cluster */
                if (
                    outlier_scores[ix_arr[row]] >= 1.0 ||
                    (clusters[outlier_clusters[ix_arr[row]]].has_NA_branch && !cluster.has_NA_branch) ||
                    (
                        cluster.has_NA_branch == clusters[outlier_clusters[ix_arr[row]]].has_NA_branch
                            &&
                        (
                            tree_depth < outlier_depth[ix_arr[row]] ||
                            (
                                tree_depth == outlier_depth[ix_arr[row]] &&
                                clusters[outlier_clusters[ix_arr[row]]].cluster_size < (tot - tail_size)
                            )
                        )
                    )
                )
                {
                    if (!by_maj) {
                        /* score = observed proportion + 1 s.d. of the proportion estimate */
                        pct_outl = (long double)buffer_categ_counts[ x[ix_arr[row]] ] / tot_dbl;
                        pct_outl = pct_outl + sqrt(pct_outl * (1 - pct_outl) / tot_dbl);
                        outlier_scores[ix_arr[row]] = pct_outl;
                    } else {
                        /* score = squared ratio of non-majority observations to the category's expected count */
                        pct_outl = (long double)(tot - buffer_categ_counts[cluster.categ_maj]) / (tot_dbl * prop_prior[ x[ix_arr[row]] ]);
                        outlier_scores[ix_arr[row]] = square(pct_outl);
                    }
                    outlier_clusters[ix_arr[row]] = cluster_num;
                    outlier_trees[ix_arr[row]] = tree_num;
                    outlier_depth[ix_arr[row]] = tree_depth;
                }
                sz_maj--;

            }
        }
        cluster.perc_in_subset = (long double)sz_maj / tot_dbl;
    }

    /* no observation was flagged, but a non-present category could still be an outlier */
    if (new_is_outlier && !found_outliers) {
        cluster.perc_in_subset = 1.0;
    }

    if (new_is_outlier || found_outliers) {
        *drop_cluster = false;
        cluster.cluster_size = sz_maj;
        cluster.subset_common.assign(buffer_outliers, buffer_outliers + ncateg);
        cluster.score_categ.resize(ncateg, 0);
        if (!by_maj) {

            /* store per-category scores; negative subset entries are non-present categories,
               which get a Laplace-style 1/(n+2) proportion */
            for (size_t cat = 0; cat < ncateg; cat++) {
                if (cluster.subset_common[cat] > 0) {
                    pct_outl = (long double)buffer_categ_counts[cat] / tot_dbl;
                    cluster.score_categ[cat] = pct_outl + sqrt(pct_outl * (1 - pct_outl) / tot_dbl);
                } else if (cluster.subset_common[cat] < 0) {
                    pct_outl = (long double)1 / (long double)(tot + 2);
                    cluster.score_categ[cat] = pct_outl + sqrt(pct_outl * (1 - pct_outl) / (long double)(tot + 2));
                }
            }

        } else {

            /* under 'by_maj', the cluster is characterized by the majority category's proportion,
               and every flagged non-majority category gets the same smoothed squared-ratio score */
            cluster.perc_in_subset = (long double) buffer_categ_counts[cluster.categ_maj] / tot_dbl;
            for (size_t cat = 0; cat < ncateg; cat++) {
                if (cat == cluster.categ_maj)
                    continue;
                if (cluster.subset_common[cat] != 0) {
                    cluster.score_categ[cat] = (long double)(tot - buffer_categ_counts[cluster.categ_maj] + 1)
                                                / ((long double)(tot + 2) * prop_prior[cat]);
                    cluster.score_categ[cat] = square(cluster.score_categ[cat]);
                }
            }

        }
    } else {
        *drop_cluster = true;
    }

    return found_outliers;
}
586
+
587
+ /* Convert in/not-in conditions to 'equals' or 'not equals' when they look for only 1 category */
588
+ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
589
+ {
590
+
591
+ int col_equal;
592
+ size_t size_subset;
593
+ size_t size_subset_excl;
594
+ for (size_t clust = 0; clust < clusters.size(); clust++) {
595
+ if (clusters[clust].split_type == IsNa) continue;
596
+
597
+ switch(clusters[clust].column_type) {
598
+
599
+ case Categorical:
600
+ {
601
+
602
+ col_equal = -1;
603
+ if (clusters[clust].split_subset.size() == 2) {
604
+
605
+ switch(col_equal = clusters[clust].split_type) {
606
+ case InSubset:
607
+ {
608
+ col_equal = clusters[clust].split_subset[0]? 0 : 1;
609
+ break;
610
+ }
611
+
612
+ case NotInSubset:
613
+ {
614
+ col_equal = clusters[clust].split_subset[0]? 1 : 0;
615
+ break;
616
+ }
617
+
618
+ case SingleCateg:
619
+ {
620
+ col_equal = clusters[clust].split_subset[0]? 0 : 1;
621
+ break;
622
+ }
623
+ }
624
+ clusters[clust].split_type = Equal;
625
+
626
+ } else {
627
+
628
+ size_subset_excl = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
629
+ [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
630
+ if (size_subset_excl > 0) continue;
631
+ size_subset = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
632
+ [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
633
+ if (size_subset == 1) {
634
+
635
+ do {col_equal++;} while (clusters[clust].split_subset[col_equal] <= 0);
636
+ if (clusters[clust].split_type == InSubset || clusters[clust].split_type == SingleCateg)
637
+ clusters[clust].split_type = Equal;
638
+ else
639
+ clusters[clust].split_type = NotEqual;
640
+
641
+ } else if (size_subset == (clusters[clust].split_subset.size() - 1)) {
642
+
643
+ do {col_equal++;} while (clusters[clust].split_subset[col_equal] != 0);
644
+ if (clusters[clust].split_type == NotInSubset)
645
+ clusters[clust].split_type = Equal;
646
+ else
647
+ clusters[clust].split_type = NotEqual;
648
+
649
+ }
650
+
651
+ }
652
+ if (col_equal >= 0) {
653
+ clusters[clust].split_subset.resize(0);
654
+ clusters[clust].split_lev = col_equal;
655
+ }
656
+ break;
657
+ }
658
+
659
+
660
+ case Ordinal:
661
+ {
662
+
663
+ if (clusters[clust].split_lev == 0) {
664
+
665
+ if (clusters[clust].split_type == LessOrEqual)
666
+ clusters[clust].split_type = Equal;
667
+ else
668
+ clusters[clust].split_type = NotEqual;
669
+
670
+ }
671
+
672
+ else if (clusters[clust].split_lev == (ncat_ord[clusters[clust].col_num] - 2)) {
673
+
674
+ clusters[clust].split_lev++;
675
+ if (clusters[clust].split_type == Greater)
676
+ clusters[clust].split_type = Equal;
677
+ else
678
+ clusters[clust].split_type = NotEqual;
679
+
680
+ }
681
+ break;
682
+ }
683
+
684
+ }
685
+
686
+ }
687
+
688
+ }
689
+
690
/*
 * Convert in/not-in conditions to 'equals' when they look for only 1 category
 * Note: unlike in the case of clusters, trees do not store the split type, but rather
 * always assume left is in/l.e. and right the opposite, so it's not possible to
 * simplify ordinal splits to equals (as the tree will not distinguish between
 * an ordinal split with equals and another with l.e./g.e.). Thus, this part needs
 * to be done in the function that prints the outlier conditions.
 */
void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
{

    int col_equal;           /* category that the simplified condition compares against (-1 = no simplification) */
    size_t size_subset;      /* number of categories included in the split subset */
    size_t size_subset_excl; /* number of categories with negative entries (cannot simplify) */
    size_t temp_swap;        /* for swapping left/right children when the condition gets inverted */
    for (size_t tree = 0; tree < trees.size(); tree++) {

        /* skip trees that have neither 'all_branches' nor left/right children */
        if (trees[tree].all_branches.size() == 0 && trees[tree].tree_left == 0 && trees[tree].tree_right == 0) continue;
        /* when the parent follows all branches, NA-split branches are left untouched */
        if (trees[trees[tree].parent].all_branches.size() > 0 && trees[tree].split_this_branch == IsNa) continue;
        switch(trees[tree].column_type) {

            case Categorical:
            {
                /* any negative entry means the subset cannot be simplified */
                size_subset_excl = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
                                                   [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
                if (size_subset_excl > 0) continue;

                col_equal = -1;
                if (trees[tree].split_subset.size() == 2) {

                    /* with only two categories, make the left branch the 'equals category 0' branch */
                    col_equal = 0;
                    if (trees[tree].split_subset[0] == 0) {
                        temp_swap = trees[tree].tree_left;
                        trees[tree].tree_left = trees[tree].tree_right;
                        trees[tree].tree_right = temp_swap;
                    }
                    if (trees[tree].tree_left > 0)
                        trees[trees[tree].tree_left].parent_branch = Equal;
                    if (trees[tree].tree_right > 0)
                        trees[trees[tree].tree_right].parent_branch = NotEqual;

                    /* when the parent follows all branches, the branch condition is stored in this tree */
                    if (trees[trees[tree].parent].all_branches.size() > 0) {
                        switch(trees[tree].split_this_branch) {
                            case InSubset:
                            {
                                trees[tree].split_this_branch = Equal;
                                break;
                            }

                            case NotInSubset:
                            {
                                trees[tree].split_this_branch = NotEqual;
                                break;
                            }

                            case SingleCateg:
                            {
                                trees[tree].split_this_branch = Equal;
                                break;
                            }
                        }
                    }

                }

                else {

                    size_subset = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
                                                  [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
                    if (size_subset == 1) {

                        /* exactly one category in the subset -> find it; left branch remains the 'equals' branch */
                        do {col_equal++;} while (trees[tree].split_subset[col_equal] <= 0);
                        if (trees[trees[tree].parent].all_branches.size() > 0) {
                            switch(trees[tree].split_this_branch) {
                                case InSubset:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }

                                case NotInSubset:
                                {
                                    trees[tree].split_this_branch = NotEqual;
                                    break;
                                }

                                case SingleCateg:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }
                            }
                        }


                    } else if (size_subset == (trees[tree].split_subset.size() - 1)) {

                        /* all categories but one -> the condition inverts, so swap the children too */
                        do {col_equal++;} while (trees[tree].split_subset[col_equal] != 0);
                        temp_swap = trees[tree].tree_left;
                        trees[tree].tree_left = trees[tree].tree_right;
                        trees[tree].tree_right = temp_swap;
                        if (trees[trees[tree].parent].all_branches.size() > 0) {
                            switch(trees[tree].split_this_branch) {
                                case InSubset:
                                {
                                    trees[tree].split_this_branch = NotEqual;
                                    break;
                                }

                                case NotInSubset:
                                {
                                    trees[tree].split_this_branch = Equal;
                                    break;
                                }
                            }
                        }

                    }

                }

                if (col_equal >= 0) {
                    /* drop the subset, store the single category, and relabel the children's branch types */
                    trees[tree].split_subset.resize(0);
                    trees[tree].split_lev = col_equal;
                    if (trees[tree].tree_left > 0)
                        trees[trees[tree].tree_left].parent_branch = Equal;
                    if (trees[tree].tree_right > 0)
                        trees[trees[tree].tree_right].parent_branch = NotEqual;

                }
                break;
            }


            case Ordinal:
            {
                /* ordinal splits can only be simplified when stored as 'split_this_branch' (see note above) */
                if (trees[trees[tree].parent].all_branches.size() == 0) continue;

                if (trees[tree].split_lev == 0) {

                    /* 'less or equal' than the lowest level is the same as 'equals' it */
                    if (trees[tree].split_this_branch == LessOrEqual)
                        trees[tree].split_this_branch = Equal;
                    else
                        trees[tree].split_this_branch = NotEqual;

                }

                else if (trees[tree].split_lev == (ncat_ord[trees[tree].col_num] - 2)) {

                    /* 'greater' than the next-to-highest level is the same as 'equals' the highest */
                    trees[tree].split_lev++;
                    if (trees[tree].split_this_branch == Greater)
                        trees[tree].split_this_branch = Equal;
                    else
                        trees[tree].split_this_branch = NotEqual;

                }
                break;
            }

        }

    }

}
854
+
855
#ifdef TEST_MODE_DEFINE
/*
 * Goodie to help with testing and debugging (not used in the final code)
 *
 * This function tries to unconnect unnecessary trees so that, if a tree has no clusters and its children
 * don't have any clusters either, such tree would not be reached at prediction time. It will drop trees from the vector
 * if they happen to lie at the end of it, but otherwise will just leave them there so as not to have to recalculate
 * the tree indexes and avoid having to update them everywhere they are referenced (such as in identified outliers).
 *
 * This is only for categorical and ordinal columns, as numerical columns will always produce clusters when
 * they have children.
 *
 * This is supposed to be done with the conditions at the end of each recursive function, but this piece of
 * code can provide help in identifying errors when the code is modified.
 */
void prune_unused_trees(std::vector<ClusterTree> &trees)
{
    /* TODO: when using 'follow_all', function should delete instead of disconnect by setting to zero */
    if (trees.size() == 0) return;
    /* iterate from the last tree downwards; 't-- > 0' avoids the unsigned-underflow pitfall
       of writing 't >= 0' on a size_t counter (which is always true) */
    for (size_t t = trees.size(); t-- > 0;) {

        /* zero-out references to unneeded or invalid trees in the binary branches */
        if (trees[t].binary_branches.size() > 0) {
            for (size_t br = 0; br < trees[t].binary_branches.size(); br++) {
                if (trees[t].binary_branches[br] == 0) continue;
                if (trees[t].binary_branches[br] >= trees.size()) {
                    trees[t].binary_branches[br] = 0;
                    continue; /* index was out-of-range: nothing valid left to check */
                }
                if (check_tree_is_not_needed(trees[trees[t].binary_branches[br]])) trees[t].binary_branches[br] = 0;
            }
        }

        /* same for the 'follow_all' branches */
        if (trees[t].all_branches.size() > 0) {
            for (size_t br = 0; br < trees[t].all_branches.size(); br++) {
                if (trees[t].all_branches[br] == 0) continue;
                if (trees[t].all_branches[br] >= trees.size()) {
                    trees[t].all_branches[br] = 0;
                    continue; /* index was out-of-range: nothing valid left to check */
                }
                if (check_tree_is_not_needed(trees[trees[t].all_branches[br]])) trees[t].all_branches[br] = 0;
            }
        }


        if (check_tree_is_not_needed(trees[t])) {

            /* disconnect tree from parent */
            switch(trees[t].parent_branch) {
                case IsNa:
                {
                    trees[trees[t].parent].tree_NA = 0;
                    break;
                }

                case LessOrEqual:
                {
                    trees[trees[t].parent].tree_left = 0;
                    break;
                }

                case Greater:
                {
                    trees[trees[t].parent].tree_right = 0;
                    break;
                }

                case InSubset:
                {
                    trees[trees[t].parent].tree_left = 0;
                    break;
                }

                case NotInSubset:
                {
                    trees[trees[t].parent].tree_right = 0;
                    break;
                }

                default: {} /* other branch types do not store child links that need disconnecting */

            }

            /* trees at the tail of the vector can be removed outright without renumbering */
            if (t == (trees.size() - 1)) trees.pop_back();
        }
    }
}
#endif
935
+
936
+ /* Check whether a tree has no clusters and no children with clusters either */
937
+ bool check_tree_is_not_needed(ClusterTree &tree)
938
+ {
939
+ return
940
+ tree.tree_NA == 0 && tree.tree_left == 0 && tree.tree_right == 0 &&
941
+ tree.clusters.size() == 0 &&
942
+ (tree.binary_branches.size() == 0 || *std::max_element(tree.binary_branches.begin(), tree.binary_branches.end()) == 0) &&
943
+ (tree.all_branches.size() == 0 || *std::max_element(tree.all_branches.begin(), tree.all_branches.end()) == 0)
944
+ ;
945
+ }
946
+
947
+ /*
948
+ * These functions simply check what's the minimum/maximum value that could identify an observation
949
+ * as outlier in any cluster, or which categories could be possibly flagged as outliers in any cluster.
950
+ * This info is redundant, as outliers can be identified by following splits, but it can help speed up
951
+ * things at prediction time by not having to even bother checking a column if the value is within
952
+ * non-flaggable limits.
953
+ */
954
+ void calculate_cluster_minimums(ModelOutputs &model_outputs, size_t col)
955
+ {
956
+ for (size_t cl = 0; cl < model_outputs.all_clusters[col].size(); cl++) {
957
+ model_outputs.min_outlier_any_cl[col] = fmax(model_outputs.min_outlier_any_cl[col], model_outputs.all_clusters[col][cl].lower_lim);
958
+ model_outputs.max_outlier_any_cl[col] = fmin(model_outputs.max_outlier_any_cl[col], model_outputs.all_clusters[col][cl].upper_lim);
959
+ }
960
+
961
+ }
962
+
963
+ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size_t col_rel)
964
+ {
965
+ if (model_outputs.all_clusters[col].size() == 0) return;
966
+ model_outputs.cat_outlier_any_cl[col_rel].resize(model_outputs.all_clusters[col][0].subset_common.size(), 0);
967
+ for (size_t cl = 0; cl < model_outputs.all_clusters[col].size(); cl++) {
968
+ for (size_t cat = 0; cat < model_outputs.all_clusters[col][cl].subset_common.size(); cat++) {
969
+ if (model_outputs.all_clusters[col][cl].subset_common[cat] != 0) model_outputs.cat_outlier_any_cl[col_rel][cat] = true;
970
+ }
971
+ }
972
+ }