outliertree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,1932 @@
1
+ /********************************************************************************************************************
2
+ * Explainable outlier detection
3
+ *
4
+ * Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
5
+ * each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
6
+ * are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
7
+ * to "predict" (will not generate a score for each observation).
8
+ * Splits are based on gain, while outlierness is based on confidence intervals.
9
+ * Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
10
+ * Cortes, David. "Explainable outlier detection through decision tree conditioning."
11
+ * arXiv preprint arXiv:2001.00636 (2020).
12
+ *
13
+ *
14
+ * Copyright 2020 David Cortes.
15
+ *
16
+ * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
+ * such as R or Python.
18
+ *
19
+ * This file is part of OutlierTree.
20
+ *
21
+ * OutlierTree is free software: you can redistribute it and/or modify
22
+ * it under the terms of the GNU General Public License as published by
23
+ * the Free Software Foundation, either version 3 of the License, or
24
+ * (at your option) any later version.
25
+ *
26
+ * OutlierTree is distributed in the hope that it will be useful,
27
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
28
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
29
+ * GNU General Public License for more details.
30
+ *
31
+ * You should have received a copy of the GNU General Public License
32
+ * along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
33
+ ********************************************************************************************************************/
34
+ #include "outlier_tree.hpp"
35
+
36
+ /* Fit outliers model based on conditional distributions obtained through decision-tree splitting
37
+ *
38
+ * Note1: the function here will not perform any data validation - it must be done from outside already.
39
+ * Note2: the data types (double/int) were chosen due to R's internal representations of data structures,
40
+ * which only supports those types.
41
+ *
42
+ * Parameters:
43
+ * - model_outputs (out)
44
+ * Struct with the model outputs required for prediction time (trees and clusters) and information about identified outliers
45
+ * required to display their statistics. If there was any previous information from fitting the model to other data, it will
46
+ * be overwritten.
47
+ * - numeric_data[n * m1] (in)
48
+ * Array with numerical columns in the data. Must be ordered by columns like Fortran arrays.
49
+ * Missing values should be encoded as NaN. Infinite values in most sections are treated as NaN too.
50
+ * Binary or boolean columns must be passed as categorical.
51
+ * If there are no numerical columns, pass NULL.
52
+ * - ncols_numeric (in)
53
+ * Number of numeric columns in the array 'numeric_data'.
54
+ * - categorical_data[n * m2] (in)
55
+ * Array with categorical columns in the data. Must be ordered by columns like Fortran arrays.
56
+ * Negative numbers will be interpreted as missing values. Numeration must start at zero and be
57
+ * contiguous (i.e. if there's category 2, must also have category 1).
58
+ * If there are no categorical columns, pass NULL.
59
+ * - ncols_categ (in)
60
+ * Number of categorical columns in the array 'categorical_data'.
61
+ * - ncat[m2] (in)
62
+ * Number of categories in each categorical column. If there are no categorical columns, pass NULL.
63
+ * - ordinal_data[n * m3] (in)
64
+ * Array with ordinal categorical columns in the data. Must be ordered by columns like Fortran arrays.
65
+ * Same rules as for categorical data. Note that the order will only be taken into consideration when
66
+ * producing splits by these columns, but outliers are still detected in the same way as for categoricals.
67
+ * Binary or boolean columns must be passed as categorical (i.e. minimum categories in a column is 3).
68
+ * If there are no ordinal columns, pass NULL.
69
+ * - ncols_ord (in)
70
+ * Number of ordinal columns in the array 'ordinal_data'.
71
+ * - ncat_ord[m3] (in)
72
+ * Number of categories in each ordinal column. If there are no ordinal columns, pass NULL.
73
+ * - nrows (in)
74
+ * Number of rows in the arrays passed above.
75
+ * - cols_ignore[m1 + m2 + m3] (in)
76
+ * Boolean array indicating which columns should only be used as splitting criterion for other columns,
77
+ * while being ignored at the moment of finding outlier values in them. Pass NULL if outliers are to be
78
+ * searched for in all columns (this is the default).
79
+ * - nthreads (in)
80
+ * Number of parallel threads to use. Should not be higher than the number of columns.
81
+ * Note that the more threads used, the more memory will need to be allocated.
82
+ * - categ_as_bin (in)
83
+ * Whether to binarize categorical columns at each category to split them by another categorical column.
84
+ * If this is false and 'cat_bruteforce_subset' is also false, then when splitting a categorical or ordinal
85
+ * variable by another categorical, it will have one branch per category of the splitting column. Ignored
86
+ * when splitting by numerical and ordinal. Overrides 'cat_bruteforce_subset' when passing true.
87
+ * - ord_as_bin (in)
88
+ * Same as above, but binarization is by less/greater than a level in the order.
89
+ * - cat_bruteforce_subset (in)
90
+ * Whether to do a brute-force search over all possible binary splits of grouped subsets of categories when
91
+ * splitting a categorical or ordinal column by another categorical column. If this is false and 'categ_as_bin'
92
+ * is also false, then when splitting a categorical or ordinal variable by another categorical, it will have
93
+ * one branch per category of the splitting column. Ignored when splitting by numerical and ordinal.
94
+ * Will be ignored when passing 'categ_as_bin' = true.
95
+ * - categ_from_maj (in)
96
+ * Whether to flag outliers in categorical variables according to the number of observations not belonging to
97
+ * the majority class (formula will be (n-n_maj)/(n * p_prior) < 1/(z_outlier^2) for each category). If passing
98
+ * 'false', will instead look for outliers in categorical variables based on being a minority and having a gap
99
+ * with respect to other categories, even if there is no dominant majority.
100
+ * - max_depth (in)
101
+ * Max depth of decision trees that generate conditional distributions (subsets of the data) in which to look
102
+ * for outliers.
103
+ * - max_perc_outliers (in)
104
+ * Model parameter. Approximate maximum percentage of outlier observations in each cluster. Default value is 0.01.
105
+ * - min_size_numeric (in)
106
+ * Minimum size that numeric clusters and splits on numeric variables can have. Default value is 35.
107
+ * - min_size_categ (in)
108
+ * Same but for categoricals. Default value is 75.
109
+ * - min_gain (in)
110
+ * Minimum gain that a split must produce in order not to discard it. Default value is 0.01 (in GritBot it's 0.000001).
111
+ * - gain_as_pct (in)
112
+ * Whether the gain above should be taken in absolute terms (sd_full - (n1*sd1 + n2*sd2)/n), or as a percentage
113
+ * ( (sd_full - (n1*sd1 + n2*sd2)/n) / sd_full ) (Replace 'sd' with shannon entropy for categorical variables).
114
+ * Taking it in absolute terms will prefer making more splits on columns that have a large variance, while taking it
115
+ * as a percentage might be more restrictive on them and might create deeper trees in some columns.
116
+ * - follow_all (in)
117
+ * Whether to create new tree branches (and continue creating new splits from all of them) from every split that meets the
118
+ * minimum gain or not. Doing so (which GritBot doesn't) will make the procedure much slower, but can flag more observations
119
+ * as outliers (with a much larger false-positive rate). Default is 'false'.
120
+ * - z_norm (in)
121
+ * Maximum Z value that is considered as normal in a distribution. Default value is 2.67 (percentile 99)
122
+ * - z_outlier (in)
123
+ * Minimum Z value that can be considered as outlier in numerical columns. Not used for categorical or ordinal columns.
124
+ *
125
+ * Returns:
126
+ * Whether any outliers were identified in the data to which the model was fit.
127
+ */
128
+ bool fit_outliers_models(ModelOutputs &model_outputs,
129
+ double *restrict numeric_data, size_t ncols_numeric,
130
+ int *restrict categorical_data, size_t ncols_categ, int *restrict ncat,
131
+ int *restrict ordinal_data, size_t ncols_ord, int *restrict ncat_ord,
132
+ size_t nrows, char *restrict cols_ignore, int nthreads,
133
+ bool categ_as_bin, bool ord_as_bin, bool cat_bruteforce_subset, bool categ_from_maj, bool take_mid,
134
+ size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
135
+ double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
136
+ {
137
+
138
+ /* put parameters and data into structs to avoid passing too many function arguments each time */
139
+ double z_tail = z_outlier - z_norm;
140
+ ModelParams model_params = {
141
+ categ_as_bin, ord_as_bin, cat_bruteforce_subset, categ_from_maj, take_mid,
142
+ max_depth, max_perc_outliers, min_size_numeric, min_size_categ,
143
+ min_gain, gain_as_pct, follow_all, z_norm, z_outlier, z_tail,
144
+ std::vector<long double>()
145
+ };
146
+
147
+ size_t tot_cols = ncols_numeric + ncols_categ + ncols_ord;
148
+ InputData input_data = {
149
+ numeric_data, ncols_numeric, categorical_data, ncols_categ, ncat,
150
+ ordinal_data, ncols_ord, ncat_ord, nrows, tot_cols, std::vector<char>(),
151
+ std::vector<char>(), -1, std::vector<size_t>(),
152
+ };
153
+
154
+ model_outputs.ncat.assign(ncat, ncat + ncols_categ);
155
+ model_outputs.ncat_ord.assign(ncat_ord, ncat_ord + ncols_ord);
156
+ model_outputs.ncols_numeric = ncols_numeric;
157
+ model_outputs.ncols_categ = ncols_categ;
158
+ model_outputs.ncols_ord = ncols_ord;
159
+ model_outputs.max_depth = max_depth;
160
+ model_outputs.min_outlier_any_cl.resize(model_outputs.ncols_numeric, -HUGE_VAL);
161
+ model_outputs.max_outlier_any_cl.resize(model_outputs.ncols_numeric, HUGE_VAL);
162
+ model_outputs.cat_outlier_any_cl.resize(model_outputs.ncols_categ + model_outputs.ncols_ord);
163
+
164
+ if (tot_cols < (size_t)nthreads)
165
+ nthreads = (int) tot_cols;
166
+ #ifndef _OPENMP
167
+ std::vector<Workspace> workspace(1);
168
+ #else
169
+ std::vector<Workspace> workspace(nthreads);
170
+ #endif
171
+ workspace.shrink_to_fit();
172
+
173
+ /* in case the model was already fit from before */
174
+ model_outputs.all_clusters.clear();
175
+ model_outputs.all_trees.clear();
176
+ allocate_row_outputs(model_outputs, nrows, max_depth);
177
+
178
+ /* initialize info holders as needed */
179
+ bool found_outliers = false;
180
+ input_data.has_NA.resize(tot_cols, false);
181
+ input_data.skip_col.resize(tot_cols, false);
182
+ model_outputs.start_ix_cat_counts.resize(ncols_categ + ncols_ord + 1);
183
+ model_outputs.col_transf.resize(ncols_numeric, NoTransf);
184
+ model_outputs.transf_offset.resize(ncols_numeric);
185
+ model_outputs.sd_div.resize(ncols_numeric);
186
+ model_outputs.min_decimals_col.resize(ncols_numeric);
187
+
188
+ /* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
189
+ model_outputs.start_ix_cat_counts[0] = 0;
190
+ if (tot_cols > ncols_numeric) {
191
+ input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[0], input_data.ncat, input_data.ncols_categ,
192
+ (bool*) &input_data.skip_col[ncols_numeric]);
193
+ input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], input_data.ncat_ord, input_data.ncols_ord,
194
+ (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ], input_data.max_categ);
195
+ } else {
196
+ input_data.max_categ = 0;
197
+ }
198
+
199
+ /* now allocate arrays for proportions */
200
+ input_data.cat_counts.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord], 0);
201
+ model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
202
+ model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
203
+
204
+ /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
205
+ if (tot_cols > ncols_numeric) {
206
+ #pragma omp parallel
207
+ {
208
+ #pragma omp sections
209
+ {
210
+
211
+ #pragma omp section
212
+ {
213
+ if (ncols_categ > 0) {
214
+ calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
215
+ input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
216
+ (bool*) &input_data.has_NA[ncols_numeric], (bool*) &input_data.skip_col[input_data.ncols_numeric],
217
+ std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
218
+
219
+ check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
220
+ input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
221
+ (bool*) &input_data.skip_col[input_data.ncols_numeric],
222
+ std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
223
+ }
224
+
225
+
226
+ }
227
+
228
+ #pragma omp section
229
+ {
230
+ if (ncols_ord > 0) {
231
+ calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
232
+ input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
233
+ (bool*) &input_data.has_NA[input_data.ncols_numeric + input_data.ncols_categ],
234
+ (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
235
+ std::max((int)1, nthreads - (int)input_data.ncols_categ) );
236
+
237
+ check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
238
+ ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
239
+ (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
240
+ std::max((int)1, nthreads - (int)input_data.ncols_categ));
241
+ }
242
+ }
243
+ }
244
+
245
+ }
246
+
247
+
248
+ /* calculate proprotion limit and CI for each category of each column */
249
+ calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[0],
250
+ &input_data.cat_counts[0], input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
251
+ calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[input_data.ncols_categ],
252
+ &input_data.cat_counts[0], input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
253
+ }
254
+
255
+ /* for numerical columns, check if they have NAs or if total variance is too small */
256
+ check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
257
+ (bool*) &input_data.has_NA[0], (bool*) &input_data.skip_col[0],
258
+ model_outputs.min_decimals_col.data(), nthreads);
259
+
260
+ /* determine an approximate size for the output clusters, and reserve memory right away */
261
+ model_outputs.all_clusters.resize(tot_cols);
262
+ model_outputs.all_trees.resize(tot_cols);
263
+ #pragma omp parallel for shared(model_outputs, input_data, model_params, tot_cols)
264
+ for (size_t_for col = 0; col < tot_cols; col++) {
265
+ if (input_data.skip_col[col]) continue;
266
+ if (cols_ignore != NULL && cols_ignore[col]) continue;
267
+ model_outputs.all_clusters[col].reserve(tot_cols * std::min(2 * input_data.nrows, pow2(model_params.max_depth + 1)));
268
+ model_outputs.all_trees[col].reserve( square(model_params.max_depth) );
269
+ /* this is not exact as categoricals and ordinals can also be split multiple times */
270
+ }
271
+
272
+
273
+ /* now run the procedure on each column separately */
274
+ int tid;
275
+ nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
276
+ #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
277
+ for (size_t_for col = 0; col < tot_cols; col++) {
278
+
279
+ if (cols_ignore != NULL && cols_ignore[col]) continue;
280
+ if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
281
+ tid = omp_get_thread_num();
282
+
283
+ /* re-use thread-private memory if possible */
284
+ if (!check_workspace_is_allocated(workspace[tid]))
285
+ allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
286
+
287
+ /* numerical column */
288
+ if (col < input_data.ncols_numeric) {
289
+ process_numeric_col(model_outputs.all_clusters[col],
290
+ model_outputs.all_trees[col],
291
+ col,
292
+ workspace[tid],
293
+ input_data,
294
+ model_params, model_outputs);
295
+ calculate_cluster_minimums(model_outputs, col);
296
+ }
297
+
298
+ /* categorical column */
299
+ else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
300
+ process_categ_col(model_outputs.all_clusters[col],
301
+ model_outputs.all_trees[col],
302
+ col, false,
303
+ workspace[tid],
304
+ input_data,
305
+ model_params, model_outputs);
306
+ calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
307
+ }
308
+
309
+ /* ordinal column */
310
+ else {
311
+ process_categ_col(model_outputs.all_clusters[col],
312
+ model_outputs.all_trees[col],
313
+ col, true,
314
+ workspace[tid],
315
+ input_data,
316
+ model_params, model_outputs);
317
+ calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
318
+ }
319
+
320
+ /* shrink the dynamic vectors to what ended up used only */
321
+ #ifdef TEST_MODE_DEFINE
322
+ prune_unused_trees(model_outputs.all_trees[col]);
323
+ #endif
324
+ if (
325
+ model_outputs.all_clusters[col].size() == 0 ||
326
+ model_outputs.all_trees[col].size() == 0 ||
327
+ check_tree_is_not_needed(model_outputs.all_trees[col][0])
328
+ )
329
+ {
330
+ model_outputs.all_trees[col].clear();
331
+ model_outputs.all_clusters[col].clear();
332
+ }
333
+ model_outputs.all_trees[col].shrink_to_fit();
334
+ model_outputs.all_clusters[col].shrink_to_fit();
335
+
336
+ /* simplify single-elements in subset to 'equals' or 'not equals' */
337
+ simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
338
+ simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
339
+
340
+ /* remember only the best (rarest) value for each row */
341
+ #pragma omp critical
342
+ if (workspace[tid].col_has_outliers) {
343
+
344
+ found_outliers = true;
345
+ for (size_t row = 0; row < input_data.nrows; row++) {
346
+
347
+ if (workspace[tid].outlier_scores[row] < 1.0) {
348
+
349
+ if (
350
+ model_outputs.outlier_scores_final[row] >= 1.0 ||
351
+ (
352
+ workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
353
+ (
354
+ !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
355
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
356
+ )
357
+ ) ||
358
+ (
359
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
360
+ !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
361
+ ) ||
362
+ (
363
+ workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
364
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
365
+ ==
366
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
367
+ &&
368
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
369
+ <
370
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
371
+ ) ||
372
+ (
373
+ workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
374
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
375
+ ==
376
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
377
+ &&
378
+ model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
379
+ ==
380
+ model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
381
+ &&
382
+ workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
383
+ )
384
+ )
385
+ {
386
+ model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
387
+ model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
388
+ model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
389
+ model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
390
+ model_outputs.outlier_columns_final[row] = col;
391
+ }
392
+ }
393
+
394
+ }
395
+ }
396
+
397
+
398
+ }
399
+
400
+ /* once finished, determine how many decimals to report for numerical outliers */
401
+ if (found_outliers)
402
+ calc_min_decimals_to_print(model_outputs, input_data.numeric_data, nthreads);
403
+
404
+ #ifdef TEST_MODE_DEFINE
405
+ for (size_t col = 0; col < tot_cols; col++) {
406
+ std::cout << "col " << col << " has " << model_outputs.all_clusters[col].size() << " clusters [" << model_outputs.all_trees[col].size() << " trees]" << std::endl;
407
+ }
408
+
409
+ find_new_outliers(numeric_data,
410
+ categorical_data,
411
+ ordinal_data,
412
+ nrows, nthreads, model_outputs);
413
+
414
+
415
+ // /* extract data for only one row */
416
+ // std::vector<double> num_data_row(ncols_numeric);
417
+ // std::vector<int> cat_data_row(ncols_categ);
418
+ // std::vector<int> ord_data_row(ncols_ord);
419
+ // size_t chosen_row = 38;
420
+ // for (size_t rowcol = 0; rowcol < ncols_numeric; rowcol++)
421
+ // num_data_row.at(rowcol) = numeric_data[chosen_row + rowcol * nrows];
422
+ // for (size_t rowcol = 0; rowcol < ncols_categ; rowcol++)
423
+ // cat_data_row.at(rowcol) = categorical_data[chosen_row + rowcol * nrows];
424
+ // for (size_t rowcol = 0; rowcol < ncols_ord; rowcol++)
425
+ // ord_data_row.at(rowcol) = ordinal_data[chosen_row + rowcol * nrows];
426
+
427
+
428
+ // find_new_outliers(&num_data_row[0],
429
+ // &cat_data_row[0],
430
+ // &ord_data_row[0],
431
+ // 1, 1, model_outputs);
432
+ // calc_min_printable_digits(model_outputs);
433
+ #endif
434
+
435
+ return found_outliers;
436
+ }
437
+
438
+ void process_numeric_col(std::vector<Cluster> &cluster_root,
439
+ std::vector<ClusterTree> &tree_root,
440
+ size_t target_col_num,
441
+ Workspace &workspace,
442
+ InputData &input_data,
443
+ ModelParams &model_params,
444
+ ModelOutputs &model_outputs)
445
+ {
446
+ /* discard NAs and infinites */
447
+ workspace.target_col_num = target_col_num;
448
+ workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
449
+ workspace.orig_target_col = workspace.target_numeric_col;
450
+ workspace.end = input_data.nrows - 1;
451
+ workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_numeric_col, 0, workspace.end, true);
452
+ workspace.col_has_outliers = false;
453
+
454
+ /* check for problematic distributions - need to sort data first */
455
+ std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
456
+ [&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
457
+
458
+ long double running_mean = 0;
459
+ long double mean_prev = 0;
460
+ long double running_ssq = 0;
461
+ double xval;
462
+ for (size_t row = workspace.st; row <= workspace.end; row++) {
463
+ xval = workspace.target_numeric_col[workspace.ix_arr[row]];
464
+ running_mean += (xval - running_mean) / (long double)(row - workspace.st + 1);
465
+ running_ssq += (xval - running_mean) * (xval - mean_prev);
466
+ mean_prev = running_mean;
467
+ }
468
+
469
+ check_for_tails(&workspace.ix_arr[0], workspace.st, workspace.end, workspace.target_numeric_col,
470
+ model_params.z_norm, model_params.max_perc_outliers,
471
+ &workspace.buffer_transf_y[0], (double)running_mean,
472
+ (double)sqrtl(running_ssq / (long double)(workspace.end - workspace.st)),
473
+ &workspace.left_tail, &workspace.right_tail,
474
+ &workspace.exp_transf, &workspace.log_transf);
475
+
476
+ /* if it's double-tailed, skip it as this model doesn't work properly with this */
477
+ if ( (workspace.exp_transf || !isinf(workspace.left_tail)) && (workspace.log_transf || !isinf(workspace.right_tail)) ) return;
478
+
479
+ /* apply log or exp transformation if necessary */
480
+ if (workspace.exp_transf) {
481
+
482
+ workspace.orig_mean = (double) running_mean;
483
+ workspace.orig_sd = (double) sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
484
+ for (size_t row = workspace.st; row <= workspace.end; row++) {
485
+ workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
486
+ }
487
+ workspace.target_numeric_col = &workspace.buffer_transf_y[0];
488
+ model_outputs.col_transf[workspace.target_col_num] = Exp;
489
+ model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
490
+ model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
491
+
492
+
493
+ } else if (workspace.log_transf) {
494
+
495
+ if (workspace.target_numeric_col[workspace.ix_arr[workspace.st]] == 0) {
496
+ workspace.log_minval = -1;
497
+ } else {
498
+ workspace.log_minval = workspace.target_numeric_col[workspace.ix_arr[workspace.st]] - 1e-3;
499
+ }
500
+
501
+ for (size_t row = workspace.st; row <= workspace.end; row++) {
502
+ workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
503
+ }
504
+ workspace.target_numeric_col = &workspace.buffer_transf_y[0];
505
+ model_outputs.col_transf[workspace.target_col_num] = Log;
506
+ model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
507
+
508
+ }
509
+
510
+ /* create a cluster with no conditions */
511
+ workspace.clusters = &cluster_root;
512
+ workspace.tree = &tree_root;
513
+ std::fill(workspace.outlier_scores.begin(), workspace.outlier_scores.end(), (double)1.0);
514
+ workspace.tree->emplace_back(0, Root);
515
+
516
+ workspace.clusters->emplace_back(NoType, Root);
517
+ workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
518
+ workspace.end, &workspace.outlier_scores[0],
519
+ &workspace.outlier_clusters[0], &workspace.outlier_trees[0], &workspace.outlier_depth[0],
520
+ workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
521
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
522
+ workspace.orig_mean, workspace.orig_sd,
523
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
524
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
525
+ workspace.tree->back().clusters.push_back(0);
526
+
527
+ /* remove outliers if any were found */
528
+ if (workspace.has_outliers)
529
+ workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
530
+
531
+ /* update statistics if they've changed */
532
+ if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
533
+ workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
534
+ workspace.st, workspace.end, &workspace.mean_y);
535
+ else
536
+ workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
537
+
538
+ if (model_params.max_depth > 0) recursive_split_numeric(workspace, input_data, model_params, 0, false);
539
+ }
540
+
541
+ void recursive_split_numeric(Workspace &workspace,
542
+ InputData &input_data,
543
+ ModelParams &model_params,
544
+ size_t curr_depth, bool is_NA_branch)
545
+ {
546
+ workspace.best_gain = -HUGE_VAL;
547
+ workspace.column_type_best = NoType;
548
+ workspace.lev_has_outliers = false;
549
+ if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
550
+ workspace.st, workspace.end, &workspace.mean_y);
551
+
552
+ /* these are used to keep track of where to continue after calling a further recursion */
553
+ size_t ix1, ix2, ix3;
554
+ SplitType spl1, spl2;
555
+ size_t tree_from = workspace.tree->size() - 1;
556
+
557
+ /* when using 'follow_all' need to keep track of a lot more things */
558
+ std::unique_ptr<RecursionState> state_backup;
559
+ if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
560
+
561
+
562
+ /* procedure: split with each other column */
563
+
564
+ /* first numeric */
565
+ for (size_t col = 0; col < input_data.ncols_numeric; col++) {
566
+
567
+ if (col == workspace.target_col_num) continue;
568
+ if (input_data.skip_col[col]) continue;
569
+ split_numericx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
570
+ workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
571
+ model_params.take_mid, &workspace.buffer_sd[0], &(workspace.this_gain), &(workspace.this_split_point),
572
+ &(workspace.this_split_ix), &(workspace.this_split_NA));
573
+ if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
574
+
575
+ /* if the gain is not insignificant, check clusters created by this split */
576
+ if (workspace.this_gain >= model_params.min_gain) {
577
+
578
+ /* NA branch */
579
+ if (workspace.this_split_NA > workspace.st &&
580
+ (workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
581
+
582
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
583
+ workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
584
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
585
+ workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
586
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
587
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
588
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
589
+ workspace.orig_mean, workspace.orig_sd,
590
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
591
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
592
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
593
+
594
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
595
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
596
+ workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
597
+ backup_recursion_state(workspace, *state_backup);
598
+ workspace.end = workspace.this_split_NA - 1;
599
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
600
+ restore_recursion_state(workspace, *state_backup);
601
+ }
602
+
603
+ }
604
+
605
+ /* left branch */
606
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
607
+ workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
608
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
609
+ workspace.this_split_ix, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
610
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
611
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
612
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
613
+ workspace.orig_mean, workspace.orig_sd,
614
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
615
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
616
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
617
+
618
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
619
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
620
+ workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, LessOrEqual);
621
+ backup_recursion_state(workspace, *state_backup);
622
+ workspace.st = workspace.this_split_NA;
623
+ workspace.end = workspace.this_split_ix;
624
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
625
+ restore_recursion_state(workspace, *state_backup);
626
+ }
627
+
628
+
629
+ /* right branch */
630
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
631
+ workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
632
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix + 1,
633
+ workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
634
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
635
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
636
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
637
+ workspace.orig_mean, workspace.orig_sd,
638
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
639
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
640
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
641
+
642
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
643
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
644
+ workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, Greater);
645
+ backup_recursion_state(workspace, *state_backup);
646
+ workspace.st = workspace.this_split_ix + 1;
647
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
648
+ restore_recursion_state(workspace, *state_backup);
649
+ }
650
+
651
+
652
+ /* if this is the best split, remember it for later */
653
+ if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
654
+ workspace.best_gain = workspace.this_gain;
655
+ workspace.column_type_best = Numeric;
656
+ workspace.col_best = col;
657
+ workspace.split_point_best = workspace.this_split_point;
658
+ }
659
+
660
+ }
661
+
662
+ }
663
+
664
+ /* then categorical */
665
+ for (size_t col = 0; col < input_data.ncols_categ; col++) {
666
+
667
+ if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
668
+
669
+ split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
670
+ workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], &workspace.buffer_cat_cnt[0],
671
+ &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
672
+ (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
673
+ &(workspace.this_gain), &workspace.buffer_subset_categ[0], NULL);
674
+ if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
675
+
676
+ if (workspace.this_gain >= model_params.min_gain) {
677
+
678
+ /* data is not arranged inside the splitting function, need to now assign to the branches as determined */
679
+ divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
680
+ &workspace.buffer_subset_categ[0], input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
681
+ &(workspace.this_split_NA), &(workspace.this_split_ix));
682
+
683
+ /* NA branch */
684
+ if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
685
+
686
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
687
+ workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
688
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
689
+ workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
690
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
691
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
692
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
693
+ workspace.orig_mean, workspace.orig_sd,
694
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
695
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
696
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
697
+
698
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
699
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
700
+ workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
701
+ backup_recursion_state(workspace, *state_backup);
702
+ workspace.end = workspace.this_split_NA - 1;
703
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
704
+ restore_recursion_state(workspace, *state_backup);
705
+ }
706
+
707
+ }
708
+
709
+ /* left branch */
710
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
711
+ workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
712
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
713
+ workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
714
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
715
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
716
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
717
+ workspace.orig_mean, workspace.orig_sd,
718
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
719
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
720
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
721
+
722
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
723
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
724
+ workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
725
+ backup_recursion_state(workspace, *state_backup);
726
+ workspace.st = workspace.this_split_NA;
727
+ workspace.end = workspace.this_split_ix - 1;
728
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
729
+ restore_recursion_state(workspace, *state_backup);
730
+ }
731
+
732
+ /* right branch */
733
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
734
+ workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
735
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
736
+ workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
737
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
738
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
739
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
740
+ workspace.orig_mean, workspace.orig_sd,
741
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
742
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
743
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
744
+
745
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
746
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
747
+ workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
748
+ backup_recursion_state(workspace, *state_backup);
749
+ workspace.st = workspace.this_split_ix;
750
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
751
+ restore_recursion_state(workspace, *state_backup);
752
+ }
753
+
754
+ if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
755
+ workspace.best_gain = workspace.this_gain;
756
+ workspace.column_type_best = Categorical;
757
+ workspace.col_best = col;
758
+ memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
759
+ }
760
+
761
+ }
762
+
763
+ }
764
+
765
+ /* then ordinal */
766
+ for (size_t col = 0; col < input_data.ncols_ord; col++) {
767
+
768
+ if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
769
+
770
+ /* same code as for categorical, but this time with split level as int instead of boolean array as subset */
771
+ split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
772
+ workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], &workspace.buffer_cat_cnt[0],
773
+ &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
774
+ (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
775
+ &(workspace.this_gain), &workspace.buffer_subset_categ[0], &(workspace.this_split_lev));
776
+ if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
777
+
778
+ if (workspace.this_gain >= model_params.min_gain) {
779
+
780
+ divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
781
+ workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
782
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
783
+
784
+ if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
785
+
786
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
787
+ workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
788
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
789
+ workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
790
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
791
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
792
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
793
+ workspace.orig_mean, workspace.orig_sd,
794
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
795
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
796
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
797
+
798
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
799
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
800
+ workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
801
+ backup_recursion_state(workspace, *state_backup);
802
+ workspace.end = workspace.this_split_NA - 1;
803
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
804
+ restore_recursion_state(workspace, *state_backup);
805
+ }
806
+
807
+ }
808
+
809
+ /* left branch */
810
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
811
+ workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
812
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
813
+ workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
814
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
815
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
816
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
817
+ workspace.orig_mean, workspace.orig_sd,
818
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
819
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
820
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
821
+
822
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
823
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
824
+ workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, LessOrEqual);
825
+ backup_recursion_state(workspace, *state_backup);
826
+ workspace.st = workspace.this_split_NA;
827
+ workspace.end = workspace.this_split_ix - 1;
828
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
829
+ restore_recursion_state(workspace, *state_backup);
830
+ }
831
+
832
+
833
+
834
+ /* right branch */
835
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
836
+ workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
837
+ workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
838
+ workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
839
+ &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
840
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
841
+ workspace.log_transf, workspace.log_minval, workspace.exp_transf,
842
+ workspace.orig_mean, workspace.orig_sd,
843
+ workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
844
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
845
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
846
+
847
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
848
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
849
+ workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
850
+ backup_recursion_state(workspace, *state_backup);
851
+ workspace.st = workspace.this_split_ix;
852
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
853
+ restore_recursion_state(workspace, *state_backup);
854
+ }
855
+
856
+ if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
857
+ workspace.best_gain = workspace.this_gain;
858
+ workspace.column_type_best = Ordinal;
859
+ workspace.col_best = col;
860
+ workspace.split_lev_best = workspace.this_split_lev;
861
+ }
862
+
863
+ }
864
+
865
+ }
866
+
867
+ /* avoid unnecessary memory usage */
868
+ workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
869
+ (*workspace.tree)[tree_from].clusters.shrink_to_fit();
870
+ if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
871
+
872
+
873
+ /* continue splitting further if meeting threshold criteria */
874
+ if (workspace.best_gain >= model_params.min_gain && !model_params.follow_all) {
875
+
876
+ /* check if depth limit is reached */
877
+ curr_depth++;
878
+ if (curr_depth >= model_params.max_depth) return;
879
+
880
+ /* discard outliers if any */
881
+ if (workspace.lev_has_outliers)
882
+ workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
883
+
884
+ /* assign rows to their corresponding branch */
885
+ switch(workspace.column_type_best) {
886
+ case Numeric:
887
+ {
888
+ divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
889
+ workspace.st, workspace.end, workspace.split_point_best,
890
+ (bool)(input_data.has_NA[workspace.col_best]),
891
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
892
+ spl1 = LessOrEqual; spl2 = Greater;
893
+ set_tree_as_numeric(workspace.tree->back(), workspace.split_point_best, workspace.col_best);
894
+ break;
895
+ }
896
+
897
+ case Categorical:
898
+ {
899
+ divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
900
+ workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
901
+ (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
902
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
903
+ spl1 = InSubset; spl2 = NotInSubset;
904
+ set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
905
+ &workspace.buffer_subset_categ_best[0], workspace.col_best);
906
+ break;
907
+ }
908
+
909
+ case Ordinal:
910
+ {
911
+ divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
912
+ workspace.st, workspace.end, workspace.split_lev_best,
913
+ (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
914
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
915
+ spl1 = LessOrEqual; spl2 = Greater;
916
+ set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
917
+ break;
918
+ }
919
+
920
+
921
+ }
922
+
923
+ /* continue splitting recursively - need to remember from where */
924
+ ix1 = workspace.this_split_NA;
925
+ ix2 = workspace.this_split_ix;
926
+ ix3 = workspace.end;
927
+
928
+ /* NA branch */
929
+ if (workspace.st > workspace.this_split_NA &&
930
+ (workspace.st - workspace.this_split_NA) >= 2 * model_params.min_size_numeric) {
931
+
932
+ workspace.end = ix1 - 1;
933
+ (*workspace.tree)[tree_from].tree_NA = workspace.tree->size();
934
+ workspace.tree->emplace_back(tree_from, IsNa);
935
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth, true);
936
+ }
937
+
938
+ /* left branch */
939
+ if ((ix2 - ix1) >= 2 * model_params.min_size_numeric) {
940
+ workspace.st = ix1;
941
+ workspace.end = ix2 - 1;
942
+ (*workspace.tree)[tree_from].tree_left = workspace.tree->size();
943
+ workspace.tree->emplace_back(tree_from, spl1);
944
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth, is_NA_branch);
945
+ }
946
+
947
+ /* right branch */
948
+ if ((ix3 - ix2 + 1) >= 2 * model_params.min_size_numeric) {
949
+ workspace.st = ix2;
950
+ workspace.end = ix3;
951
+ (*workspace.tree)[tree_from].tree_right = workspace.tree->size();
952
+ workspace.tree->emplace_back(tree_from, spl2);
953
+ recursive_split_numeric(workspace, input_data, model_params, curr_depth, is_NA_branch);
954
+ }
955
+
956
+ }
957
+
958
+ /* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
959
+ if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
960
+
961
+ if (tree_from == 0) {
962
+ workspace.tree->clear();
963
+ } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
964
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
965
+ workspace.tree->pop_back();
966
+ } else {
967
+ switch((*workspace.tree)[tree_from].parent_branch) {
968
+
969
+ case IsNa:
970
+ {
971
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_NA = 0;
972
+ break;
973
+ }
974
+
975
+ case LessOrEqual:
976
+ {
977
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
978
+ break;
979
+ }
980
+
981
+ case Greater:
982
+ {
983
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
984
+ break;
985
+ }
986
+
987
+ case InSubset:
988
+ {
989
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
990
+ break;
991
+ }
992
+
993
+ case NotInSubset:
994
+ {
995
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
996
+ break;
997
+ }
998
+ }
999
+ workspace.tree->pop_back();
1000
+ }
1001
+ }
1002
+
1003
+ }
1004
+
1005
+ void process_categ_col(std::vector<Cluster> &cluster_root,
1006
+ std::vector<ClusterTree> &tree_root,
1007
+ size_t target_col_num, bool is_ord,
1008
+ Workspace &workspace,
1009
+ InputData &input_data,
1010
+ ModelParams &model_params,
1011
+ ModelOutputs &model_outputs)
1012
+ {
1013
+ if (model_params.max_depth <= 0) return;
1014
+
1015
+ /* extract necesary info from column and discard NAs */
1016
+ workspace.target_col_is_ord = is_ord;
1017
+ workspace.target_col_num = target_col_num - input_data.ncols_numeric;
1018
+ if (!workspace.target_col_is_ord) {
1019
+ workspace.target_categ_col = input_data.categorical_data + workspace.target_col_num * input_data.nrows;
1020
+ workspace.ncat_this = input_data.ncat[workspace.target_col_num];
1021
+ } else {
1022
+ workspace.target_categ_col = input_data.ordinal_data + (workspace.target_col_num - input_data.ncols_categ) * input_data.nrows;
1023
+ workspace.ncat_this = input_data.ncat_ord[workspace.target_col_num - input_data.ncols_categ];
1024
+ }
1025
+ workspace.untransf_target_col = workspace.target_categ_col;
1026
+ workspace.end = input_data.nrows - 1;
1027
+ workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_categ_col, 0, workspace.end);
1028
+ workspace.col_has_outliers = false;
1029
+ workspace.col_is_bin = workspace.ncat_this <= 2;
1030
+ workspace.prop_small_this = &model_params.prop_small[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1031
+ workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1032
+
1033
+ /* create cluster root and reset outlier scores for this column */
1034
+ workspace.clusters = &cluster_root;
1035
+ workspace.tree = &tree_root;
1036
+ std::fill(workspace.outlier_scores.begin(), workspace.outlier_scores.end(), (double)1.0);
1037
+ workspace.tree->emplace_back(0, Root);
1038
+
1039
+
1040
+ /* at first, see if there's a category with 1-2 observations among only categories with large counts */
1041
+ workspace.col_has_outliers = find_outlier_categories_no_cond(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1042
+ workspace.ncat_this, workspace.end - workspace.st + 1,
1043
+ &workspace.buffer_subset_categ[0], &(workspace.orig_mean));
1044
+
1045
+ /* if there is any such case, create a cluster for them */
1046
+ if (workspace.col_has_outliers) {
1047
+ workspace.tree->back().clusters.push_back(0);
1048
+ workspace.clusters->emplace_back(NoType, Root);
1049
+ define_categ_cluster_no_cond(workspace.untransf_target_col, &workspace.ix_arr[0], workspace.st, workspace.end, workspace.ncat_this,
1050
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1051
+ &workspace.outlier_depth[0], workspace.clusters->back(),
1052
+ &input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1053
+ &workspace.buffer_subset_categ[0], workspace.orig_mean);
1054
+ workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
1055
+ }
1056
+
1057
+ /* if no conditional outliers are required, stop there */
1058
+ if (model_params.max_depth == 0) return;
1059
+
1060
+ /* if the rest of the data is all one category, do not process it any further */
1061
+ if (workspace.ncat_this == 2 && workspace.col_has_outliers) return;
1062
+
1063
+ /* if there isn't a single catchable outlier category, skip */
1064
+ bool should_skip = true;
1065
+ for (int cat = 0; cat < workspace.ncat_this; cat++) {
1066
+
1067
+ if (workspace.prop_small_this[cat] > (long double)1 / (long double)(workspace.end - workspace.st + 1 - model_params.min_size_categ))
1068
+ should_skip = false;
1069
+ }
1070
+ if (should_skip) return;
1071
+
1072
+
1073
+ /* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
1074
+ if (
1075
+ (!model_params.categ_as_bin && !workspace.target_col_is_ord) ||
1076
+ (!model_params.ord_as_bin && workspace.target_col_is_ord) ||
1077
+ workspace.col_is_bin ||
1078
+ input_data.ncols_categ == (1 - ((workspace.target_col_is_ord)? 1 : 0))
1079
+ )
1080
+ {
1081
+
1082
+ /* calculate base information */
1083
+ workspace.base_info = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1084
+ workspace.ncat_this, workspace.end - workspace.st + 1);
1085
+ workspace.base_info_orig = workspace.base_info;
1086
+
1087
+ /* then split */
1088
+ recursive_split_categ(workspace, input_data, model_params, 0, false);
1089
+ }
1090
+
1091
+
1092
+ else {
1093
+ /* otherwise, process the column 1 category at a time */
1094
+ size_t st_orig = workspace.st;
1095
+ size_t end_orig = workspace.end;
1096
+ size_t cat_counts_bin[2];
1097
+ workspace.col_is_bin = true;
1098
+ workspace.already_split_main = false;
1099
+ workspace.base_info_orig = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
1100
+ workspace.ncat_this, workspace.end - workspace.st + 1);
1101
+ workspace.tree->back().column_type = NoType;
1102
+
1103
+
1104
+ for (int cat = 0; cat < workspace.ncat_this - ((workspace.target_col_is_ord)? 1 : 0); cat++) {
1105
+
1106
+ workspace.st = st_orig;
1107
+ workspace.end = end_orig;
1108
+
1109
+ /* convert to binary */
1110
+ if (!workspace.target_col_is_ord) {
1111
+
1112
+ for (size_t row = workspace.st; row <= workspace.end; row++) {
1113
+ workspace.buffer_bin_y[workspace.ix_arr[row]] = (workspace.untransf_target_col[workspace.ix_arr[row]] == cat)? 1 : 0;
1114
+ }
1115
+ cat_counts_bin[0] = workspace.end - workspace.st + 1 - input_data.cat_counts[ cat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1116
+ cat_counts_bin[1] = input_data.cat_counts[ cat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1117
+
1118
+ } else {
1119
+
1120
+ for (size_t row = workspace.st; row <= workspace.end; row++) {
1121
+ workspace.buffer_bin_y[workspace.ix_arr[row]] = (workspace.untransf_target_col[workspace.ix_arr[row]] <= cat)? 1 : 0;
1122
+ }
1123
+ cat_counts_bin[0] = 0;
1124
+ cat_counts_bin[1] = workspace.end - workspace.st + 1;
1125
+ for (int catcat = 0; catcat <= cat; catcat++) {
1126
+ cat_counts_bin[0] += input_data.cat_counts[ catcat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1127
+ cat_counts_bin[1] -= input_data.cat_counts[ catcat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
1128
+ }
1129
+
1130
+ }
1131
+
1132
+ if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
1133
+ workspace.target_categ_col = &workspace.buffer_bin_y[0];
1134
+ workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
1135
+ (*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
1136
+ workspace.tree->emplace_back(0, SubTrees);
1137
+ recursive_split_categ(workspace, input_data, model_params, 0, false);
1138
+ }
1139
+
1140
+ }
1141
+ (*workspace.tree)[0].binary_branches.shrink_to_fit();
1142
+
1143
+ }
1144
+
1145
+ }
1146
+
1147
+
1148
+ void recursive_split_categ(Workspace &workspace,
1149
+ InputData &input_data,
1150
+ ModelParams &model_params,
1151
+ size_t curr_depth, bool is_NA_branch)
1152
+ {
1153
+ /* idea is the same as its numeric counterpart, only splitting by another categorical
1154
+ is less clear how to do and offers different options */
1155
+ workspace.best_gain = -HUGE_VAL;
1156
+ workspace.column_type_best = NoType;
1157
+ workspace.lev_has_outliers = false;
1158
+ size_t ix1, ix2, ix3;
1159
+ SplitType spl1, spl2;
1160
+ size_t tree_from = workspace.tree->size() - 1;
1161
+
1162
+ /* when using 'follow_all' need to keep track of a lot more things */
1163
+ std::unique_ptr<RecursionState> state_backup;
1164
+ if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
1165
+
1166
+ if (curr_depth > 0) {
1167
+ workspace.base_info_orig = total_info(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st, workspace.end,
1168
+ workspace.ncat_this, &workspace.buffer_cat_cnt[0]);
1169
+
1170
+ /* check that there's still more than 1 category */
1171
+ size_t ncat_present = 0;
1172
+ for (int cat = 0; cat < workspace.ncat_this; cat++) {
1173
+ ncat_present += (workspace.buffer_cat_cnt[cat])? 1 : 0;
1174
+ if (ncat_present >= 2) break;
1175
+ }
1176
+ if (ncat_present < 2) goto drop_if_not_needed;
1177
+ if (workspace.col_is_bin && workspace.ncat_this > 2) {
1178
+ workspace.base_info = total_info(&workspace.ix_arr[0], workspace.target_categ_col, workspace.st, workspace.end,
1179
+ 2, &workspace.buffer_cat_cnt[0]);
1180
+ if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
1181
+ } else {
1182
+ workspace.base_info = workspace.base_info_orig;
1183
+ }
1184
+ }
1185
+
1186
+ /* split with each other column */
1187
+
1188
+
1189
+ /* first numeric */
1190
+ for (size_t col = 0; col < input_data.ncols_numeric; col++) {
1191
+
1192
+ if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
1193
+ if (input_data.skip_col[col]) continue;
1194
+ split_numericx_categy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
1195
+ workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
1196
+ &workspace.buffer_cat_cnt[0], (bool)(input_data.has_NA[col]), model_params.min_size_categ,
1197
+ model_params.take_mid, &(workspace.this_gain), &(workspace.this_split_point),
1198
+ &(workspace.this_split_ix), &(workspace.this_split_NA));
1199
+ if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1200
+
1201
+ if (workspace.this_gain >= model_params.min_gain) {
1202
+
1203
+ /* NA branch */
1204
+ if (workspace.this_split_NA > workspace.st &&
1205
+ (workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
1206
+
1207
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1208
+ workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
1209
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1210
+ &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1211
+ workspace.ncat_this, model_params.categ_from_maj,
1212
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1213
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1214
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1215
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1216
+ workspace.prop_small_this, workspace.prior_prob,
1217
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1218
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1219
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1220
+ if (workspace.drop_cluster) {
1221
+ workspace.clusters->pop_back();
1222
+ (*workspace.tree)[tree_from].clusters.pop_back();
1223
+ }
1224
+
1225
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1226
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1227
+ workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
1228
+ backup_recursion_state(workspace, *state_backup);
1229
+ workspace.end = workspace.this_split_NA - 1;
1230
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1231
+ restore_recursion_state(workspace, *state_backup);
1232
+ }
1233
+
1234
+ }
1235
+
1236
+ /* left branch */
1237
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1238
+ workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
1239
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1240
+ &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix,
1241
+ workspace.ncat_this, model_params.categ_from_maj,
1242
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1243
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1244
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1245
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1246
+ workspace.prop_small_this, workspace.prior_prob,
1247
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1248
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1249
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1250
+ if (workspace.drop_cluster) {
1251
+ workspace.clusters->pop_back();
1252
+ (*workspace.tree)[tree_from].clusters.pop_back();
1253
+ }
1254
+
1255
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1256
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1257
+ workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, LessOrEqual);
1258
+ backup_recursion_state(workspace, *state_backup);
1259
+ workspace.st = workspace.this_split_NA;
1260
+ workspace.end = workspace.this_split_ix;
1261
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1262
+ restore_recursion_state(workspace, *state_backup);
1263
+ }
1264
+
1265
+
1266
+ /* right branch */
1267
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1268
+ workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
1269
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1270
+ &workspace.ix_arr[0], workspace.this_split_ix + 1, workspace.end,
1271
+ workspace.ncat_this, model_params.categ_from_maj,
1272
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1273
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1274
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1275
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1276
+ workspace.prop_small_this, workspace.prior_prob,
1277
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1278
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1279
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1280
+ if (workspace.drop_cluster) {
1281
+ workspace.clusters->pop_back();
1282
+ (*workspace.tree)[tree_from].clusters.pop_back();
1283
+ }
1284
+
1285
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1286
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1287
+ workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, Greater);
1288
+ backup_recursion_state(workspace, *state_backup);
1289
+ workspace.st = workspace.this_split_ix + 1;
1290
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1291
+ restore_recursion_state(workspace, *state_backup);
1292
+ }
1293
+
1294
+
1295
+ /* if this is the best split, remember it for later */
1296
+ if (workspace.this_gain > workspace.best_gain) {
1297
+ workspace.best_gain = workspace.this_gain;
1298
+ workspace.column_type_best = Numeric;
1299
+ workspace.col_best = col;
1300
+ workspace.split_point_best = workspace.this_split_point;
1301
+ }
1302
+
1303
+ }
1304
+
1305
+ }
1306
+
1307
+
1308
+ /* then categorical */
1309
+ for (size_t col = 0; col < input_data.ncols_categ; col++) {
1310
+
1311
+ /* TODO: could make a pre-check that the splitting column up to this recursion still has
1312
+ more than 1 category, and skip for this and further recursions otherwise */
1313
+
1314
+ if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
1315
+ if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
1316
+
1317
+ if (workspace.col_is_bin) {
1318
+
1319
+ split_categx_biny(&workspace.ix_arr[0], workspace.st, workspace.end,
1320
+ input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1321
+ input_data.ncat[col], workspace.base_info, &workspace.buffer_cat_cnt[0],
1322
+ &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
1323
+ (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
1324
+ &(workspace.this_gain), &workspace.buffer_subset_categ[0]);
1325
+
1326
+ /* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
1327
+ if (
1328
+ !isinf(workspace.this_gain) &&
1329
+ (
1330
+ (!workspace.target_col_is_ord && input_data.ncat[workspace.target_col_num] > 2) ||
1331
+ (workspace.target_col_is_ord && input_data.ncat_ord[workspace.target_col_num - input_data.ncols_categ] > 2)
1332
+ )
1333
+ )
1334
+ {
1335
+ divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows,
1336
+ workspace.st, workspace.end, &workspace.buffer_subset_categ[0], input_data.ncat[col],
1337
+ (bool)input_data.has_NA[col + input_data.ncols_numeric],
1338
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
1339
+ workspace.this_gain = categ_gain_from_split(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st,
1340
+ workspace.this_split_NA, workspace.this_split_ix, workspace.end,
1341
+ workspace.ncat_this, &workspace.buffer_cat_cnt[0], workspace.base_info_orig);
1342
+ }
1343
+
1344
+ } else {
1345
+
1346
+ if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
1347
+ split_categx_categy_subset(&workspace.ix_arr[0], workspace.st, workspace.end,
1348
+ input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1349
+ input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
1350
+ &workspace.buffer_cat_sorted[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_cnt[0],
1351
+ (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
1352
+ &(workspace.this_gain), &workspace.buffer_subset_categ[0]);
1353
+ } else {
1354
+ split_categx_categy_separate(&workspace.ix_arr[0], workspace.st, workspace.end,
1355
+ input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
1356
+ input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
1357
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0],
1358
+ (bool)(input_data.has_NA[col + input_data.ncols_numeric]),
1359
+ model_params.min_size_categ, &(workspace.this_gain));
1360
+ }
1361
+
1362
+ }
1363
+
1364
+ if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1365
+ if (workspace.this_gain >= model_params.min_gain) {
1366
+
1367
+ /* NA branch */
1368
+ workspace.this_split_NA = move_NAs_to_front(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
1369
+ if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
1370
+
1371
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1372
+ workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
1373
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1374
+ &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1375
+ workspace.ncat_this, model_params.categ_from_maj,
1376
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1377
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1378
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1379
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1380
+ workspace.prop_small_this, workspace.prior_prob,
1381
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1382
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1383
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1384
+ if (workspace.drop_cluster) {
1385
+ workspace.clusters->pop_back();
1386
+ (*workspace.tree)[tree_from].clusters.pop_back();
1387
+ }
1388
+
1389
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1390
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1391
+ workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
1392
+ backup_recursion_state(workspace, *state_backup);
1393
+ workspace.end = workspace.this_split_NA - 1;
1394
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1395
+ restore_recursion_state(workspace, *state_backup);
1396
+ }
1397
+
1398
+ }
1399
+
1400
+ if (!model_params.cat_bruteforce_subset && !workspace.col_is_bin && input_data.ncat[col] > 2) {
1401
+
1402
+ /* sort by the splitting variable and iterate over to determine the split points */
1403
+ workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
1404
+ std::sort(&workspace.ix_arr[0] + workspace.this_split_NA, &workspace.ix_arr[0] + workspace.end + 1,
1405
+ [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
1406
+ workspace.this_split_ix = workspace.this_split_NA;
1407
+
1408
+ /* TODO: should instead use std::lower_bound to calculate the start and end indices of each category */
1409
+ for (size_t row = workspace.this_split_NA + 1; row <= workspace.end; row++) {
1410
+
1411
+ /* if the next observation is in a different category, then the split ends here */
1412
+ if (workspace.temp_ptr_x[workspace.ix_arr[row]] != workspace.temp_ptr_x[workspace.ix_arr[row-1]]) {
1413
+
1414
+ if ((row - workspace.this_split_ix) >= model_params.min_size_categ) {
1415
+
1416
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1417
+ workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
1418
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1419
+ &workspace.ix_arr[0], workspace.this_split_ix, row - 1,
1420
+ workspace.ncat_this, model_params.categ_from_maj,
1421
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1422
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1423
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1424
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1425
+ workspace.prop_small_this, workspace.prior_prob,
1426
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1427
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1428
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1429
+ if (workspace.drop_cluster) {
1430
+ workspace.clusters->pop_back();
1431
+ (*workspace.tree)[tree_from].clusters.pop_back();
1432
+ }
1433
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1434
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1435
+ workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.this_split_ix]]);
1436
+ backup_recursion_state(workspace, *state_backup);
1437
+ workspace.st = workspace.this_split_ix;
1438
+ workspace.end = row - 1;
1439
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1440
+ restore_recursion_state(workspace, *state_backup);
1441
+ }
1442
+ }
1443
+ workspace.this_split_ix = row;
1444
+ }
1445
+ }
1446
+ /* last category is given by the end indices */
1447
+ if ((workspace.end - workspace.this_split_ix) > model_params.min_size_categ) {
1448
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1449
+ workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
1450
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1451
+ &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1452
+ workspace.ncat_this, model_params.categ_from_maj,
1453
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1454
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1455
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1456
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1457
+ workspace.prop_small_this, workspace.prior_prob,
1458
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1459
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1460
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1461
+ if (workspace.drop_cluster) {
1462
+ workspace.clusters->pop_back();
1463
+ (*workspace.tree)[tree_from].clusters.pop_back();
1464
+ }
1465
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1466
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1467
+ workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
1468
+ backup_recursion_state(workspace, *state_backup);
1469
+ workspace.st = workspace.this_split_ix;
1470
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1471
+ restore_recursion_state(workspace, *state_backup);
1472
+ }
1473
+
1474
+ }
1475
+
1476
+ if (workspace.this_gain > workspace.best_gain) {
1477
+ workspace.best_gain = workspace.this_gain;
1478
+ workspace.column_type_best = Categorical;
1479
+ workspace.col_best = col;
1480
+ }
1481
+
1482
+
1483
+ } else {
1484
+
1485
+ /* split by subsets of categories */
1486
+
1487
+ if (input_data.ncat[col] == 2) {
1488
+
1489
+ workspace.buffer_subset_categ[0] = 1;
1490
+ workspace.buffer_subset_categ[1] = 0;
1491
+ divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1492
+ (int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
1493
+ if (
1494
+ (workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
1495
+ (workspace.this_split_ix - workspace.this_split_NA) < model_params.min_size_categ
1496
+ ) continue;
1497
+
1498
+ } else {
1499
+
1500
+ divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
1501
+ &workspace.buffer_subset_categ[0], input_data.ncat[col], false,
1502
+ &(workspace.this_split_NA), &(workspace.this_split_ix));
1503
+ }
1504
+
1505
+ /* left branch */
1506
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1507
+ workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
1508
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1509
+ &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
1510
+ workspace.ncat_this, model_params.categ_from_maj,
1511
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1512
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1513
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1514
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1515
+ workspace.prop_small_this, workspace.prior_prob,
1516
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1517
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1518
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1519
+ if (workspace.drop_cluster) {
1520
+ workspace.clusters->pop_back();
1521
+ (*workspace.tree)[tree_from].clusters.pop_back();
1522
+ }
1523
+
1524
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1525
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1526
+ workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
1527
+ backup_recursion_state(workspace, *state_backup);
1528
+ workspace.st = workspace.this_split_NA;
1529
+ workspace.end = workspace.this_split_ix - 1;
1530
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1531
+ restore_recursion_state(workspace, *state_backup);
1532
+ }
1533
+
1534
+ /* right branch */
1535
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1536
+ workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
1537
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1538
+ &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1539
+ workspace.ncat_this, model_params.categ_from_maj,
1540
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1541
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1542
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1543
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1544
+ workspace.prop_small_this, workspace.prior_prob,
1545
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1546
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1547
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1548
+ if (workspace.drop_cluster) {
1549
+ workspace.clusters->pop_back();
1550
+ (*workspace.tree)[tree_from].clusters.pop_back();
1551
+ }
1552
+
1553
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1554
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1555
+ workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
1556
+ backup_recursion_state(workspace, *state_backup);
1557
+ workspace.st = workspace.this_split_ix;
1558
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1559
+ restore_recursion_state(workspace, *state_backup);
1560
+ }
1561
+
1562
+ if (workspace.this_gain > workspace.best_gain) {
1563
+ workspace.best_gain = workspace.this_gain;
1564
+ workspace.column_type_best = Categorical;
1565
+ workspace.col_best = col;
1566
+ memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
1567
+ }
1568
+
1569
+ }
1570
+
1571
+ }
1572
+
1573
+ }
1574
+
1575
+
1576
+ /* then ordinal */
1577
+ for (size_t col = 0; col < input_data.ncols_ord; col++) {
1578
+
1579
+ if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
1580
+ if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
1581
+ if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
1582
+
1583
+ split_ordx_categy(&workspace.ix_arr[0], workspace.st, workspace.end,
1584
+ input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
1585
+ input_data.ncat_ord[col], workspace.ncat_this,
1586
+ workspace.base_info_orig, &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
1587
+ (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
1588
+ model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev));
1589
+ if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
1590
+
1591
+ if (workspace.this_gain >= model_params.min_gain) {
1592
+
1593
+ divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
1594
+ workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
1595
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
1596
+
1597
+ /* NA branch */
1598
+ if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
1599
+
1600
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1601
+ workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
1602
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1603
+ &workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
1604
+ workspace.ncat_this, model_params.categ_from_maj,
1605
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1606
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1607
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1608
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1609
+ workspace.prop_small_this, workspace.prior_prob,
1610
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1611
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1612
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1613
+ if (workspace.drop_cluster) {
1614
+ workspace.clusters->pop_back();
1615
+ (*workspace.tree)[tree_from].clusters.pop_back();
1616
+ }
1617
+
1618
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1619
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1620
+ workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
1621
+ backup_recursion_state(workspace, *state_backup);
1622
+ workspace.end = workspace.this_split_NA - 1;
1623
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
1624
+ restore_recursion_state(workspace, *state_backup);
1625
+ }
1626
+
1627
+ }
1628
+
1629
+ /* left branch */
1630
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1631
+ workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
1632
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1633
+ &workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
1634
+ workspace.ncat_this, model_params.categ_from_maj,
1635
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1636
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1637
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1638
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1639
+ workspace.prop_small_this, workspace.prior_prob,
1640
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1641
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1642
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1643
+ if (workspace.drop_cluster) {
1644
+ workspace.clusters->pop_back();
1645
+ (*workspace.tree)[tree_from].clusters.pop_back();
1646
+ }
1647
+
1648
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1649
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1650
+ workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, LessOrEqual);
1651
+ backup_recursion_state(workspace, *state_backup);
1652
+ workspace.st = workspace.this_split_NA;
1653
+ workspace.end = workspace.this_split_ix - 1;
1654
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1655
+ restore_recursion_state(workspace, *state_backup);
1656
+ }
1657
+
1658
+ /* right branch */
1659
+ (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
1660
+ workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
1661
+ workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
1662
+ &workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
1663
+ workspace.ncat_this, model_params.categ_from_maj,
1664
+ &workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
1665
+ &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
1666
+ workspace.clusters->size() - 1, tree_from, curr_depth + 1,
1667
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
1668
+ workspace.prop_small_this, workspace.prior_prob,
1669
+ &workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
1670
+ &workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
1671
+ workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
1672
+ if (workspace.drop_cluster) {
1673
+ workspace.clusters->pop_back();
1674
+ (*workspace.tree)[tree_from].clusters.pop_back();
1675
+ }
1676
+
1677
+ if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
1678
+ (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
1679
+ workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
1680
+ backup_recursion_state(workspace, *state_backup);
1681
+ workspace.st = workspace.this_split_ix;
1682
+ recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
1683
+ restore_recursion_state(workspace, *state_backup);
1684
+ }
1685
+
1686
+
1687
+ if (workspace.this_gain > workspace.best_gain) {
1688
+ workspace.best_gain = workspace.this_gain;
1689
+ workspace.column_type_best = Ordinal;
1690
+ workspace.col_best = col;
1691
+ workspace.split_lev_best = workspace.this_split_lev;
1692
+ }
1693
+
1694
+ }
1695
+
1696
+ }
1697
+
1698
+
1699
+ /* avoid unnecessary memory usage or repeats */
1700
+ workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
1701
+ (*workspace.tree)[tree_from].clusters.shrink_to_fit();
1702
+ if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
1703
+ if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
1704
+ workspace.already_split_main = true;
1705
+
1706
+
1707
+ /* if there is a non-insignificant gain, continue splitting from the branches of the best column */
1708
+ if (workspace.best_gain >= model_params.min_gain && !model_params.follow_all) {
1709
+
1710
+ curr_depth++;
1711
+ if (curr_depth >= model_params.max_depth) goto drop_if_not_needed;
1712
+
1713
+ /* discard outliers if any */
1714
+ if (workspace.lev_has_outliers)
1715
+ workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
1716
+
1717
+ /* assign rows to their corresponding branch */
1718
+ switch(workspace.column_type_best) {
1719
+ case Numeric:
1720
+ {
1721
+ divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
1722
+ workspace.st, workspace.end, workspace.split_point_best,
1723
+ (bool)(input_data.has_NA[workspace.col_best]),
1724
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
1725
+ spl1 = LessOrEqual; spl2 = Greater;
1726
+ set_tree_as_numeric(workspace.tree->back(), workspace.split_point_best, workspace.col_best);
1727
+ break;
1728
+ }
1729
+
1730
+ case Ordinal:
1731
+ {
1732
+ divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
1733
+ workspace.st, workspace.end, workspace.split_lev_best,
1734
+ (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
1735
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
1736
+ spl1 = LessOrEqual; spl2 = Greater;
1737
+ set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
1738
+ break;
1739
+ }
1740
+
1741
+ case Categorical:
1742
+ {
1743
+
1744
+ if (input_data.ncat[workspace.col_best] == 2) {
1745
+
1746
+ divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
1747
+ workspace.st, workspace.end, (int)0,
1748
+ (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
1749
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
1750
+ spl1 = InSubset; spl2 = NotInSubset;
1751
+ set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
1752
+
1753
+ } else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
1754
+
1755
+ divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
1756
+ workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
1757
+ (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
1758
+ &(workspace.this_split_NA), &(workspace.this_split_ix) );
1759
+ spl1 = InSubset; spl2 = NotInSubset;
1760
+ set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
1761
+ &workspace.buffer_subset_categ_best[0], workspace.col_best);
1762
+
1763
+ } else {
1764
+ spl1 = SingleCateg;
1765
+ workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
1766
+ std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
1767
+ [&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
1768
+ set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
1769
+
1770
+ for (size_t row = workspace.st; row <= workspace.end; row++) {
1771
+ if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] >= 0) {
1772
+ workspace.this_split_NA = row;
1773
+ break;
1774
+ }
1775
+ }
1776
+ }
1777
+ break;
1778
+ }
1779
+
1780
+
1781
+ }
1782
+
1783
+
1784
+ ix1 = workspace.this_split_NA;
1785
+ ix2 = workspace.this_split_ix;
1786
+ ix3 = workspace.end;
1787
+
1788
+ /* NA branch */
1789
+ if (workspace.st > workspace.this_split_NA &&
1790
+ (workspace.st - workspace.this_split_NA) >= 2 * model_params.min_size_categ) {
1791
+
1792
+ workspace.end = ix1 - 1;
1793
+ (*workspace.tree)[tree_from].tree_NA = workspace.tree->size();
1794
+ workspace.tree->emplace_back(tree_from, IsNa);
1795
+ recursive_split_categ(workspace, input_data, model_params, curr_depth, true);
1796
+ }
1797
+
1798
+ if (spl1 == SingleCateg) {
1799
+
1800
+ /* TODO: this should be done instead in a loop per category looking for the start and end positions
1801
+ in ix_arr of each category using std::lower_bound */
1802
+
1803
+ /* TODO: it's not necessary to back up everything like when using 'follow_all', only need 'col_best' and 'temp_ptr_x' */
1804
+ state_backup = std::unique_ptr<RecursionState>(new RecursionState);
1805
+ for (int cat = 1; cat < input_data.ncat[workspace.col_best]; cat++) {
1806
+
1807
+ /* TODO: this is inefficient when some categories are not present, should instead at first do a pass over 'ix_arr'
1808
+ to calculate the start and end indices of each category, then loop over that array instead */
1809
+ for (size_t row = ix1 + 1; row < ix3; row++) {
1810
+ if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] == cat) {
1811
+ if ((row - ix1) >= 2 * model_params.min_size_categ) {
1812
+ (*workspace.tree)[tree_from].binary_branches[cat-1] = workspace.tree->size();
1813
+ workspace.tree->emplace_back(tree_from, spl1);
1814
+ backup_recursion_state(workspace, *state_backup);
1815
+ workspace.st = ix1;
1816
+ workspace.end = row - 1;
1817
+ recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
1818
+ restore_recursion_state(workspace, *state_backup);
1819
+ }
1820
+ ix1 = row;
1821
+ break;
1822
+ }
1823
+ else if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] > cat) {
1824
+ ix1 = row;
1825
+ break;
1826
+ }
1827
+ }
1828
+
1829
+ }
1830
+ /* last category is given by the end index */
1831
+ if ((ix3 - ix1) >= 2 * model_params.min_size_categ) {
1832
+ (*workspace.tree)[tree_from].binary_branches[input_data.ncat[workspace.col_best]-1] = workspace.tree->size();
1833
+ workspace.tree->emplace_back(tree_from, spl1);
1834
+ workspace.st = ix1;
1835
+ workspace.end = ix3;
1836
+ recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
1837
+ } else {
1838
+ (*workspace.tree)[tree_from].binary_branches.push_back(0);
1839
+ }
1840
+
1841
+ } else {
1842
+ /* numeric, ordinal, and subset split */
1843
+
1844
+ /* left branch */
1845
+ if ((ix2 - ix1) >= 2 * model_params.min_size_categ) {
1846
+ workspace.st = ix1;
1847
+ workspace.end = ix2 - 1;
1848
+ (*workspace.tree)[tree_from].tree_left = workspace.tree->size();
1849
+ workspace.tree->emplace_back(tree_from, spl1);
1850
+ recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
1851
+ }
1852
+
1853
+ /* right branch */
1854
+ if ((ix3 - ix2) > 2 * model_params.min_size_categ) {
1855
+ workspace.st = ix2;
1856
+ workspace.end = ix3;
1857
+ (*workspace.tree)[tree_from].tree_right = workspace.tree->size();
1858
+ workspace.tree->emplace_back(tree_from, spl2);
1859
+ recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
1860
+ }
1861
+
1862
+ }
1863
+
1864
+
1865
+ }
1866
+
1867
+
1868
+ /* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
1869
+ drop_if_not_needed:
1870
+ if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
1871
+
1872
+ if (tree_from == 0) {
1873
+ workspace.tree->clear();
1874
+ } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
1875
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
1876
+ workspace.tree->pop_back();
1877
+ } else {
1878
+ switch((*workspace.tree)[tree_from].parent_branch) {
1879
+
1880
+ case IsNa:
1881
+ {
1882
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_NA = 0;
1883
+ workspace.tree->pop_back();
1884
+ break;
1885
+ }
1886
+
1887
+ case LessOrEqual:
1888
+ {
1889
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
1890
+ workspace.tree->pop_back();
1891
+ break;
1892
+ }
1893
+
1894
+ case Greater:
1895
+ {
1896
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
1897
+ workspace.tree->pop_back();
1898
+ break;
1899
+ }
1900
+
1901
+ case InSubset:
1902
+ {
1903
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
1904
+ workspace.tree->pop_back();
1905
+ break;
1906
+ }
1907
+
1908
+ case NotInSubset:
1909
+ {
1910
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
1911
+ workspace.tree->pop_back();
1912
+ break;
1913
+ }
1914
+
1915
+ case SingleCateg:
1916
+ {
1917
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].binary_branches.back() = 0;
1918
+ workspace.tree->pop_back();
1919
+ break;
1920
+ }
1921
+
1922
+ case SubTrees:
1923
+ {
1924
+ (*workspace.tree)[(*workspace.tree)[tree_from].parent].binary_branches.pop_back();
1925
+ workspace.tree->pop_back();
1926
+ break;
1927
+ }
1928
+ }
1929
+ }
1930
+ }
1931
+
1932
+ }