outliertree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +674 -0
- data/NOTICE.txt +14 -0
- data/README.md +107 -0
- data/ext/outliertree/ext.cpp +260 -0
- data/ext/outliertree/extconf.rb +21 -0
- data/lib/outliertree.rb +17 -0
- data/lib/outliertree/dataset.rb +35 -0
- data/lib/outliertree/model.rb +128 -0
- data/lib/outliertree/result.rb +190 -0
- data/lib/outliertree/version.rb +3 -0
- data/vendor/outliertree/LICENSE +674 -0
- data/vendor/outliertree/README.md +155 -0
- data/vendor/outliertree/src/Makevars +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +123 -0
- data/vendor/outliertree/src/Rwrapper.cpp +1225 -0
- data/vendor/outliertree/src/cat_outlier.cpp +328 -0
- data/vendor/outliertree/src/clusters.cpp +972 -0
- data/vendor/outliertree/src/fit_model.cpp +1932 -0
- data/vendor/outliertree/src/misc.cpp +685 -0
- data/vendor/outliertree/src/outlier_tree.hpp +758 -0
- data/vendor/outliertree/src/predict.cpp +706 -0
- data/vendor/outliertree/src/split.cpp +1098 -0
- metadata +150 -0
@@ -0,0 +1,1932 @@
|
|
1
|
+
/********************************************************************************************************************
|
2
|
+
* Explainable outlier detection
|
3
|
+
*
|
4
|
+
* Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
|
5
|
+
* each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
|
6
|
+
* are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
|
7
|
+
* to "predict" (will not generate a score for each observation).
|
8
|
+
* Splits are based on gain, while outlierness is based on confidence intervals.
|
9
|
+
* Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
|
10
|
+
* Cortes, David. "Explainable outlier detection through decision tree conditioning."
|
11
|
+
* arXiv preprint arXiv:2001.00636 (2020).
|
12
|
+
*
|
13
|
+
*
|
14
|
+
* Copyright 2020 David Cortes.
|
15
|
+
*
|
16
|
+
* Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
|
17
|
+
* such as R or Python.
|
18
|
+
*
|
19
|
+
* This file is part of OutlierTree.
|
20
|
+
*
|
21
|
+
* OutlierTree is free software: you can redistribute it and/or modify
|
22
|
+
* it under the terms of the GNU General Public License as published by
|
23
|
+
* the Free Software Foundation, either version 3 of the License, or
|
24
|
+
* (at your option) any later version.
|
25
|
+
*
|
26
|
+
* OutlierTree is distributed in the hope that it will be useful,
|
27
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
28
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
29
|
+
* GNU General Public License for more details.
|
30
|
+
*
|
31
|
+
* You should have received a copy of the GNU General Public License
|
32
|
+
* along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
|
33
|
+
********************************************************************************************************************/
|
34
|
+
#include "outlier_tree.hpp"
|
35
|
+
|
36
|
+
/* Fit outliers model based on conditional distributions obtained through decision-tree splitting
|
37
|
+
*
|
38
|
+
* Note1: the function here will not perform any data validation - it must be done from outside already.
|
39
|
+
* Note2: the data types (double/int) were chosen due to R's internal representations of data structures,
|
40
|
+
* which only supports those types.
|
41
|
+
*
|
42
|
+
* Parameters:
|
43
|
+
* - model_outputs (out)
|
44
|
+
* Struct with the model outputs required for prediction time (trees and clusters) and information about identified outliers
|
45
|
+
* required to display their statistics. If there was any previous information from fitting the model to other data, it will
|
46
|
+
* be overwritten.
|
47
|
+
* - numeric_data[n * m1] (in)
|
48
|
+
* Array with numerical columns in the data. Must be ordered by columns like Fortran arrays.
|
49
|
+
* Missing values should be encoded as NaN. Infinite values in most sections are treated as NaN too.
|
50
|
+
* Binary or boolean columns must be passed as categorical.
|
51
|
+
* If there are no numerical columns, pass NULL.
|
52
|
+
* - ncols_numeric (in)
|
53
|
+
* Number of numeric columns in the array 'numeric_data'.
|
54
|
+
* - categorical_data[n * m2] (in)
|
55
|
+
* Array with categorical columns in the data. Must be ordered by columns like Fortran arrays.
|
56
|
+
* Negative numbers will be interpreted as missing values. Numeration must start at zero and be
|
57
|
+
* contiguous (i.e. if there's category 2, must also have category 1).
|
58
|
+
* If there are no categorical columns, pass NULL.
|
59
|
+
* - ncols_categ (in)
|
60
|
+
* Number of categorical columns in the array 'categorical_data'.
|
61
|
+
* - ncat[m2] (in)
|
62
|
+
* Number of categories in each categorical column. If there are no categorical columns, pass NULL.
|
63
|
+
* - ordinal_data[n * m3] (in)
|
64
|
+
* Array with ordinal categorical columns in the data. Must be ordered by columns like Fortran arrays.
|
65
|
+
* Same rules as for categorical data. Note that the order will only be taken into consideration when
|
66
|
+
* producing splits by these columns, but outliers are still detected in the same way as for categoricals.
|
67
|
+
* Binary or boolean columns must be passed as categorical (i.e. minimum categories in a column is 3).
|
68
|
+
* If there are no ordinal columns, pass NULL.
|
69
|
+
* - ncols_ord (in)
|
70
|
+
* Number of ordinal columns in the array 'ordinal_data'.
|
71
|
+
* - ncat_ord[m3] (in)
|
72
|
+
* Number of categories in each ordinal column. If there are no ordinal columns, pass NULL.
|
73
|
+
* - nrows (in)
|
74
|
+
* Number of rows in the arrays passed above.
|
75
|
+
* - cols_ignore[m1 + m2 + m3] (in)
|
76
|
+
* Boolean array indicating which columns should only be used as splitting criterion for other columns,
|
77
|
+
* while being ignored at the moment of finding outlier values in them. Pass NULL if outliers are to be
|
78
|
+
* searched for in all columns (this is the default).
|
79
|
+
* - nthreads (in)
|
80
|
+
* Number of parallel threads to use. Should not be higher than the number of columns.
|
81
|
+
* Note that the more threads used, the more memory will need to be allocated.
|
82
|
+
* - categ_as_bin (in)
|
83
|
+
* Whether to binarize categorical columns at each category to split them by another categorical column.
|
84
|
+
* If this is false and 'cat_bruteforce_subset' is also false, then when splitting a categorical or ordinal
|
85
|
+
* variable by another categorical, it will have one branch per category of the splitting column. Ignored
|
86
|
+
* when splitting by numerical and ordinal. Overrides 'cat_bruteforce_subset' when passing true.
|
87
|
+
* - ord_as_bin (in)
|
88
|
+
* Same as above, but binarization is by less/greater than a level in the order.
|
89
|
+
* - cat_bruteforce_subset (in)
|
90
|
+
* Whether to do a brute-force search over all possible binary splits of grouped subsets of categories when
|
91
|
+
* splitting a categorical or ordinal column by another categorical column. If this is false and 'categ_as_bin'
|
92
|
+
* is also false, then when splitting a categorical or ordinal variable by another categorical, it will have
|
93
|
+
* one branch per category of the splitting column. Ignored when splitting by numerical and ordinal.
|
94
|
+
* Will be ignored when passing 'categ_as_bin' = true.
|
95
|
+
* - categ_from_maj (in)
|
96
|
+
* Whether to flag outliers in categorical variables according to the number of observations not belonging to
|
97
|
+
* the majority class (formula will be (n-n_maj)/(n * p_prior) < 1/(z_outlier^2) for each category). If passing
|
98
|
+
* 'false', will instead look for outliers in categorical variables based on being a minority and having a gap
|
99
|
+
* with respect to other categories, even if there is no dominant majority.
|
100
|
+
* - max_depth (in)
|
101
|
+
* Max depth of decision trees that generate conditional distributions (subsets of the data) in which to look
|
102
|
+
* for outliers.
|
103
|
+
* - max_perc_outliers (in)
|
104
|
+
* Model parameter. Approximate maximum percentage of outlier observations in each cluster. Default value is 0.01.
|
105
|
+
* - min_size_numeric (in)
|
106
|
+
* Minimum size that numeric clusters and splits on numeric variables can have. Default value is 35.
|
107
|
+
* - min_size_categ (in)
|
108
|
+
* Same but for categoricals. Default value is 75.
|
109
|
+
* - min_gain (in)
|
110
|
+
* Minimum gain that a split must produce in order not to discard it. Default value is 0.01 (in GritBot it's 0.000001).
|
111
|
+
* - gain_as_pct (in)
|
112
|
+
* Whether the gain above should be taken in absolute terms (sd_full - (n1*sd1 + n2*sd2)/n), or as a percentage
|
113
|
+
* ( (sd_full - (n1*sd1 + n2*sd2)/n) / sd_full ) (Replace 'sd' with shannon entropy for categorical variables).
|
114
|
+
* Taking it in absolute terms will prefer making more splits on columns that have a large variance, while taking it
|
115
|
+
* as a percentage might be more restrictive on them and might create deeper trees in some columns.
|
116
|
+
* - follow_all (in)
|
117
|
+
* Whether to create new tree branches (and continue creating new splits from all of them) from every split that meets the
|
118
|
+
* minimum gain or not. Doing so (which GritBot doesn't) will make the procedure much slower, but can flag more observations
|
119
|
+
* as outliers (with a much larger false-positive rate). Default is 'false'.
|
120
|
+
* - z_norm (in)
|
121
|
+
* Maximum Z value that is considered as normal in a distribution. Default value is 2.67 (percentile 99)
|
122
|
+
* - z_outlier (in)
|
123
|
+
* Minimum Z value that can be considered as outlier in numerical columns. Not used for categorical or ordinal columns.
|
124
|
+
*
|
125
|
+
* Returns:
|
126
|
+
* Whether any outliers were identified in the data to which the model was fit.
|
127
|
+
*/
|
128
|
+
bool fit_outliers_models(ModelOutputs &model_outputs,
                         double *restrict numeric_data,     size_t ncols_numeric,
                         int    *restrict categorical_data, size_t ncols_categ,   int *restrict ncat,
                         int    *restrict ordinal_data,     size_t ncols_ord,     int *restrict ncat_ord,
                         size_t nrows, char *restrict cols_ignore, int nthreads,
                         bool   categ_as_bin, bool ord_as_bin, bool cat_bruteforce_subset, bool categ_from_maj, bool take_mid,
                         size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
                         double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
{
    /* Overview of the steps below:
     *   1. pack parameters and data pointers into structs,
     *   2. reset the per-model outputs (the model object may have been fit before),
     *   3. compute category counts / proportions and per-column skip flags,
     *   4. process every target column in parallel, one workspace per thread,
     *   5. inside a critical section, keep for each row only the preferred outlier cluster.
     * Returns whether any column flagged at least one outlier. */

    /* put parameters and data into structs to avoid passing too many function arguments each time */
    double z_tail = z_outlier - z_norm;
    ModelParams model_params = {
                                    categ_as_bin, ord_as_bin, cat_bruteforce_subset, categ_from_maj, take_mid,
                                    max_depth, max_perc_outliers, min_size_numeric, min_size_categ,
                                    min_gain, gain_as_pct, follow_all, z_norm, z_outlier, z_tail,
                                    std::vector<long double>()
                                };

    size_t tot_cols = ncols_numeric + ncols_categ + ncols_ord;
    InputData input_data = {
                                numeric_data, ncols_numeric, categorical_data, ncols_categ, ncat,
                                ordinal_data, ncols_ord, ncat_ord, nrows, tot_cols, std::vector<char>(),
                                std::vector<char>(), -1, std::vector<size_t>(),
                            };

    model_outputs.ncat.assign(ncat, ncat + ncols_categ);
    model_outputs.ncat_ord.assign(ncat_ord, ncat_ord + ncols_ord);
    model_outputs.ncols_numeric = ncols_numeric;
    model_outputs.ncols_categ   = ncols_categ;
    model_outputs.ncols_ord     = ncols_ord;
    model_outputs.max_depth     = max_depth;

    /* 'assign' (not 'resize') so that values left over from a previous fit are reset:
       'resize' only initializes elements that are newly added to the vector */
    model_outputs.min_outlier_any_cl.assign(model_outputs.ncols_numeric, -HUGE_VAL);
    model_outputs.max_outlier_any_cl.assign(model_outputs.ncols_numeric,  HUGE_VAL);
    model_outputs.cat_outlier_any_cl.clear();
    model_outputs.cat_outlier_any_cl.resize(model_outputs.ncols_categ + model_outputs.ncols_ord);

    /* never use more threads than columns, and never fewer than one
       (a negative 'nthreads' wraps to a huge value in the cast and is capped to 'tot_cols') */
    if (tot_cols < (size_t)nthreads)
        nthreads = (int) tot_cols;
    if (nthreads < 1)
        nthreads = 1;
    #ifndef _OPENMP
        std::vector<Workspace> workspace(1);
    #else
        std::vector<Workspace> workspace(nthreads);
    #endif
    workspace.shrink_to_fit();

    /* in case the model was already fit from before */
    model_outputs.all_clusters.clear();
    model_outputs.all_trees.clear();
    allocate_row_outputs(model_outputs, nrows, max_depth);

    /* initialize info holders as needed */
    bool found_outliers = false;
    input_data.has_NA.resize(tot_cols, false);
    input_data.skip_col.resize(tot_cols, false);
    model_outputs.start_ix_cat_counts.resize(ncols_categ + ncols_ord + 1);
    /* reset transformations too, for the same refit reason as above */
    model_outputs.col_transf.assign(ncols_numeric, NoTransf);
    model_outputs.transf_offset.resize(ncols_numeric);
    model_outputs.sd_div.resize(ncols_numeric);
    model_outputs.min_decimals_col.resize(ncols_numeric);

    /* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
    model_outputs.start_ix_cat_counts[0] = 0;
    if (tot_cols > ncols_numeric) {
        input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[0], input_data.ncat, input_data.ncols_categ,
                                                          (bool*) &input_data.skip_col[ncols_numeric]);
        input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], input_data.ncat_ord, input_data.ncols_ord,
                                                          (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ], input_data.max_categ);
    } else {
        input_data.max_categ = 0;
    }

    /* now allocate arrays for proportions */
    input_data.cat_counts.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord], 0);
    model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
    model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);

    /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
    if (tot_cols > ncols_numeric) {
        #pragma omp parallel
        {
            #pragma omp sections
            {

                /* plain categorical columns */
                #pragma omp section
                {
                    if (ncols_categ > 0) {
                        calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
                                                 input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
                                                 (bool*) &input_data.has_NA[ncols_numeric], (bool*) &input_data.skip_col[input_data.ncols_numeric],
                                                 std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );

                        check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
                                                   input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
                                                   (bool*) &input_data.skip_col[input_data.ncols_numeric],
                                                   std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
                    }
                }

                /* ordinal columns */
                #pragma omp section
                {
                    if (ncols_ord > 0) {
                        calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
                                                 input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
                                                 (bool*) &input_data.has_NA[input_data.ncols_numeric + input_data.ncols_categ],
                                                 (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
                                                 std::max((int)1, nthreads - (int)input_data.ncols_categ) );

                        check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
                                                   ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
                                                   (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
                                                   std::max((int)1, nthreads - (int)input_data.ncols_categ));
                    }
                }
            }

        }

        /* calculate proportion limit and CI for each category of each column */
        calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[0],
                                      &input_data.cat_counts[0], input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
        calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[input_data.ncols_categ],
                                      &input_data.cat_counts[0], input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
    }

    /* for numerical columns, check if they have NAs or if total variance is too small */
    check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
                              (bool*) &input_data.has_NA[0], (bool*) &input_data.skip_col[0],
                              model_outputs.min_decimals_col.data(), nthreads);

    /* determine an approximate size for the output clusters, and reserve memory right away */
    model_outputs.all_clusters.resize(tot_cols);
    model_outputs.all_trees.resize(tot_cols);
    #pragma omp parallel for shared(model_outputs, input_data, model_params, tot_cols)
    for (size_t_for col = 0; col < tot_cols; col++) {
        if (input_data.skip_col[col]) continue;
        if (cols_ignore != NULL && cols_ignore[col]) continue;
        model_outputs.all_clusters[col].reserve(tot_cols * std::min(2 * input_data.nrows, pow2(model_params.max_depth + 1)));
        model_outputs.all_trees[col].reserve( square(model_params.max_depth) );
        /* this is not exact as categoricals and ordinals can also be split multiple times */
    }


    /* now run the procedure on each column separately */
    int tid;
    /* keep the thread count within [1, tot_cols]; 'num_threads' requires a positive value */
    nthreads = std::max(1, std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord)));
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
    for (size_t_for col = 0; col < tot_cols; col++) {

        if (cols_ignore != NULL && cols_ignore[col]) continue;
        if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
        tid = omp_get_thread_num();

        /* re-use thread-private memory if possible */
        if (!check_workspace_is_allocated(workspace[tid]))
            allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);

        /* numerical column */
        if (col < input_data.ncols_numeric) {
            process_numeric_col(model_outputs.all_clusters[col],
                                model_outputs.all_trees[col],
                                col,
                                workspace[tid],
                                input_data,
                                model_params, model_outputs);
            calculate_cluster_minimums(model_outputs, col);
        }

        /* categorical column */
        else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
            process_categ_col(model_outputs.all_clusters[col],
                              model_outputs.all_trees[col],
                              col, false,
                              workspace[tid],
                              input_data,
                              model_params, model_outputs);
            calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
        }

        /* ordinal column */
        else {
            process_categ_col(model_outputs.all_clusters[col],
                              model_outputs.all_trees[col],
                              col, true,
                              workspace[tid],
                              input_data,
                              model_params, model_outputs);
            calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
        }

        /* shrink the dynamic vectors to what ended up used only */
        #ifdef TEST_MODE_DEFINE
        prune_unused_trees(model_outputs.all_trees[col]);
        #endif
        if (
            model_outputs.all_clusters[col].size() == 0 ||
            model_outputs.all_trees[col].size() == 0 ||
            check_tree_is_not_needed(model_outputs.all_trees[col][0])
            )
        {
            model_outputs.all_trees[col].clear();
            model_outputs.all_clusters[col].clear();
        }
        model_outputs.all_trees[col].shrink_to_fit();
        model_outputs.all_clusters[col].shrink_to_fit();

        /* simplify single-elements in subset to 'equals' or 'not equals' */
        simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
        simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);

        /* remember only the best (rarest) value for each row.
           The stored outlier of a row is replaced by this column's candidate when any of these holds,
           tested in this order (the order matters: the stored cluster is only looked up once the row
           is known to have one):
             - the row has no stored outlier yet (final score >= 1),
             - the candidate is at a smaller depth and does not swap a non-NA branch for an NA branch,
             - the stored cluster is in an NA branch and the candidate is not,
             - same depth and same NA-branch status, and the candidate cluster is larger,
             - same depth, size and NA-branch status, and the candidate score is lower. */
        #pragma omp critical
        if (workspace[tid].col_has_outliers) {

            found_outliers = true;
            for (size_t row = 0; row < input_data.nrows; row++) {

                if (workspace[tid].outlier_scores[row] < 1.0) {

                    if (
                        model_outputs.outlier_scores_final[row] >= 1.0 ||
                        (
                            workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
                            (
                                !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
                                model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
                            )
                        ) ||
                        (
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
                            !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
                        ) ||
                        (
                            workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
                                ==
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
                                &&
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
                                <
                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
                        ) ||
                        (
                            workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
                                ==
                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
                                &&
                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
                                ==
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
                                &&
                            workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
                        )
                    )
                    {
                        model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
                        model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
                        model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
                        model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
                        model_outputs.outlier_columns_final[row] = col;
                    }
                }

            }
        }


    }

    /* once finished, determine how many decimals to report for numerical outliers */
    if (found_outliers)
        calc_min_decimals_to_print(model_outputs, input_data.numeric_data, nthreads);

    #ifdef TEST_MODE_DEFINE
    for (size_t col = 0; col < tot_cols; col++) {
        std::cout << "col " << col << " has " << model_outputs.all_clusters[col].size() << " clusters [" << model_outputs.all_trees[col].size() << " trees]" << std::endl;
    }

    find_new_outliers(numeric_data,
                      categorical_data,
                      ordinal_data,
                      nrows, nthreads, model_outputs);


    // /* extract data for only one row */
    // std::vector<double> num_data_row(ncols_numeric);
    // std::vector<int> cat_data_row(ncols_categ);
    // std::vector<int> ord_data_row(ncols_ord);
    // size_t chosen_row = 38;
    // for (size_t rowcol = 0; rowcol < ncols_numeric; rowcol++)
    //     num_data_row.at(rowcol) = numeric_data[chosen_row + rowcol * nrows];
    // for (size_t rowcol = 0; rowcol < ncols_categ; rowcol++)
    //     cat_data_row.at(rowcol) = categorical_data[chosen_row + rowcol * nrows];
    // for (size_t rowcol = 0; rowcol < ncols_ord; rowcol++)
    //     ord_data_row.at(rowcol) = ordinal_data[chosen_row + rowcol * nrows];


    // find_new_outliers(&num_data_row[0],
    //                   &cat_data_row[0],
    //                   &ord_data_row[0],
    //                   1, 1, model_outputs);
    // calc_min_printable_digits(model_outputs);
    #endif

    return found_outliers;
}
|
437
|
+
|
438
|
+
void process_numeric_col(std::vector<Cluster> &cluster_root,
|
439
|
+
std::vector<ClusterTree> &tree_root,
|
440
|
+
size_t target_col_num,
|
441
|
+
Workspace &workspace,
|
442
|
+
InputData &input_data,
|
443
|
+
ModelParams &model_params,
|
444
|
+
ModelOutputs &model_outputs)
|
445
|
+
{
|
446
|
+
/* discard NAs and infinites */
|
447
|
+
workspace.target_col_num = target_col_num;
|
448
|
+
workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
|
449
|
+
workspace.orig_target_col = workspace.target_numeric_col;
|
450
|
+
workspace.end = input_data.nrows - 1;
|
451
|
+
workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_numeric_col, 0, workspace.end, true);
|
452
|
+
workspace.col_has_outliers = false;
|
453
|
+
|
454
|
+
/* check for problematic distributions - need to sort data first */
|
455
|
+
std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
|
456
|
+
[&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
|
457
|
+
|
458
|
+
long double running_mean = 0;
|
459
|
+
long double mean_prev = 0;
|
460
|
+
long double running_ssq = 0;
|
461
|
+
double xval;
|
462
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
463
|
+
xval = workspace.target_numeric_col[workspace.ix_arr[row]];
|
464
|
+
running_mean += (xval - running_mean) / (long double)(row - workspace.st + 1);
|
465
|
+
running_ssq += (xval - running_mean) * (xval - mean_prev);
|
466
|
+
mean_prev = running_mean;
|
467
|
+
}
|
468
|
+
|
469
|
+
check_for_tails(&workspace.ix_arr[0], workspace.st, workspace.end, workspace.target_numeric_col,
|
470
|
+
model_params.z_norm, model_params.max_perc_outliers,
|
471
|
+
&workspace.buffer_transf_y[0], (double)running_mean,
|
472
|
+
(double)sqrtl(running_ssq / (long double)(workspace.end - workspace.st)),
|
473
|
+
&workspace.left_tail, &workspace.right_tail,
|
474
|
+
&workspace.exp_transf, &workspace.log_transf);
|
475
|
+
|
476
|
+
/* if it's double-tailed, skip it as this model doesn't work properly with this */
|
477
|
+
if ( (workspace.exp_transf || !isinf(workspace.left_tail)) && (workspace.log_transf || !isinf(workspace.right_tail)) ) return;
|
478
|
+
|
479
|
+
/* apply log or exp transformation if necessary */
|
480
|
+
if (workspace.exp_transf) {
|
481
|
+
|
482
|
+
workspace.orig_mean = (double) running_mean;
|
483
|
+
workspace.orig_sd = (double) sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
|
484
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
485
|
+
workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
|
486
|
+
}
|
487
|
+
workspace.target_numeric_col = &workspace.buffer_transf_y[0];
|
488
|
+
model_outputs.col_transf[workspace.target_col_num] = Exp;
|
489
|
+
model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
|
490
|
+
model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
|
491
|
+
|
492
|
+
|
493
|
+
} else if (workspace.log_transf) {
|
494
|
+
|
495
|
+
if (workspace.target_numeric_col[workspace.ix_arr[workspace.st]] == 0) {
|
496
|
+
workspace.log_minval = -1;
|
497
|
+
} else {
|
498
|
+
workspace.log_minval = workspace.target_numeric_col[workspace.ix_arr[workspace.st]] - 1e-3;
|
499
|
+
}
|
500
|
+
|
501
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
502
|
+
workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
|
503
|
+
}
|
504
|
+
workspace.target_numeric_col = &workspace.buffer_transf_y[0];
|
505
|
+
model_outputs.col_transf[workspace.target_col_num] = Log;
|
506
|
+
model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
|
507
|
+
|
508
|
+
}
|
509
|
+
|
510
|
+
/* create a cluster with no conditions */
|
511
|
+
workspace.clusters = &cluster_root;
|
512
|
+
workspace.tree = &tree_root;
|
513
|
+
std::fill(workspace.outlier_scores.begin(), workspace.outlier_scores.end(), (double)1.0);
|
514
|
+
workspace.tree->emplace_back(0, Root);
|
515
|
+
|
516
|
+
workspace.clusters->emplace_back(NoType, Root);
|
517
|
+
workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
|
518
|
+
workspace.end, &workspace.outlier_scores[0],
|
519
|
+
&workspace.outlier_clusters[0], &workspace.outlier_trees[0], &workspace.outlier_depth[0],
|
520
|
+
workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
|
521
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
522
|
+
workspace.orig_mean, workspace.orig_sd,
|
523
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
524
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
525
|
+
workspace.tree->back().clusters.push_back(0);
|
526
|
+
|
527
|
+
/* remove outliers if any were found */
|
528
|
+
if (workspace.has_outliers)
|
529
|
+
workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
|
530
|
+
|
531
|
+
/* update statistics if they've changed */
|
532
|
+
if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
|
533
|
+
workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
|
534
|
+
workspace.st, workspace.end, &workspace.mean_y);
|
535
|
+
else
|
536
|
+
workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
|
537
|
+
|
538
|
+
if (model_params.max_depth > 0) recursive_split_numeric(workspace, input_data, model_params, 0, false);
|
539
|
+
}
|
540
|
+
|
541
|
+
void recursive_split_numeric(Workspace &workspace,
|
542
|
+
InputData &input_data,
|
543
|
+
ModelParams &model_params,
|
544
|
+
size_t curr_depth, bool is_NA_branch)
|
545
|
+
{
|
546
|
+
workspace.best_gain = -HUGE_VAL;
|
547
|
+
workspace.column_type_best = NoType;
|
548
|
+
workspace.lev_has_outliers = false;
|
549
|
+
if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
|
550
|
+
workspace.st, workspace.end, &workspace.mean_y);
|
551
|
+
|
552
|
+
/* these are used to keep track of where to continue after calling a further recursion */
|
553
|
+
size_t ix1, ix2, ix3;
|
554
|
+
SplitType spl1, spl2;
|
555
|
+
size_t tree_from = workspace.tree->size() - 1;
|
556
|
+
|
557
|
+
/* when using 'follow_all' need to keep track of a lot more things */
|
558
|
+
std::unique_ptr<RecursionState> state_backup;
|
559
|
+
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
560
|
+
|
561
|
+
|
562
|
+
/* procedure: split with each other column */
|
563
|
+
|
564
|
+
/* first numeric */
|
565
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++) {
|
566
|
+
|
567
|
+
if (col == workspace.target_col_num) continue;
|
568
|
+
if (input_data.skip_col[col]) continue;
|
569
|
+
split_numericx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
570
|
+
workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
|
571
|
+
model_params.take_mid, &workspace.buffer_sd[0], &(workspace.this_gain), &(workspace.this_split_point),
|
572
|
+
&(workspace.this_split_ix), &(workspace.this_split_NA));
|
573
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
574
|
+
|
575
|
+
/* if the gain is not insignificant, check clusters created by this split */
|
576
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
577
|
+
|
578
|
+
/* NA branch */
|
579
|
+
if (workspace.this_split_NA > workspace.st &&
|
580
|
+
(workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
|
581
|
+
|
582
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
583
|
+
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
584
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
|
585
|
+
workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
586
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
587
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
588
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
589
|
+
workspace.orig_mean, workspace.orig_sd,
|
590
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
591
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
592
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
593
|
+
|
594
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
595
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
596
|
+
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
597
|
+
backup_recursion_state(workspace, *state_backup);
|
598
|
+
workspace.end = workspace.this_split_NA - 1;
|
599
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
600
|
+
restore_recursion_state(workspace, *state_backup);
|
601
|
+
}
|
602
|
+
|
603
|
+
}
|
604
|
+
|
605
|
+
/* left branch */
|
606
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
607
|
+
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
608
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
|
609
|
+
workspace.this_split_ix, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
610
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
611
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
612
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
613
|
+
workspace.orig_mean, workspace.orig_sd,
|
614
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
615
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
616
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
617
|
+
|
618
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
619
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
620
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, LessOrEqual);
|
621
|
+
backup_recursion_state(workspace, *state_backup);
|
622
|
+
workspace.st = workspace.this_split_NA;
|
623
|
+
workspace.end = workspace.this_split_ix;
|
624
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
625
|
+
restore_recursion_state(workspace, *state_backup);
|
626
|
+
}
|
627
|
+
|
628
|
+
|
629
|
+
/* right branch */
|
630
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
631
|
+
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
632
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix + 1,
|
633
|
+
workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
634
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
635
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
636
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
637
|
+
workspace.orig_mean, workspace.orig_sd,
|
638
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
639
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
640
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
641
|
+
|
642
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
643
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
644
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, Greater);
|
645
|
+
backup_recursion_state(workspace, *state_backup);
|
646
|
+
workspace.st = workspace.this_split_ix + 1;
|
647
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
648
|
+
restore_recursion_state(workspace, *state_backup);
|
649
|
+
}
|
650
|
+
|
651
|
+
|
652
|
+
/* if this is the best split, remember it for later */
|
653
|
+
if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
|
654
|
+
workspace.best_gain = workspace.this_gain;
|
655
|
+
workspace.column_type_best = Numeric;
|
656
|
+
workspace.col_best = col;
|
657
|
+
workspace.split_point_best = workspace.this_split_point;
|
658
|
+
}
|
659
|
+
|
660
|
+
}
|
661
|
+
|
662
|
+
}
|
663
|
+
|
664
|
+
/* then categorical */
|
665
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++) {
|
666
|
+
|
667
|
+
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
668
|
+
|
669
|
+
split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
|
670
|
+
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], &workspace.buffer_cat_cnt[0],
|
671
|
+
&workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
|
672
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
|
673
|
+
&(workspace.this_gain), &workspace.buffer_subset_categ[0], NULL);
|
674
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
675
|
+
|
676
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
677
|
+
|
678
|
+
/* data is not arranged inside the splitting function, need to now assign to the branches as determined */
|
679
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
|
680
|
+
&workspace.buffer_subset_categ[0], input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
|
681
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
682
|
+
|
683
|
+
/* NA branch */
|
684
|
+
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
|
685
|
+
|
686
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
687
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
688
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
|
689
|
+
workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
690
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
691
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
692
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
693
|
+
workspace.orig_mean, workspace.orig_sd,
|
694
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
695
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
696
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
697
|
+
|
698
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
699
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
700
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
701
|
+
backup_recursion_state(workspace, *state_backup);
|
702
|
+
workspace.end = workspace.this_split_NA - 1;
|
703
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
704
|
+
restore_recursion_state(workspace, *state_backup);
|
705
|
+
}
|
706
|
+
|
707
|
+
}
|
708
|
+
|
709
|
+
/* left branch */
|
710
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
711
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
|
712
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
|
713
|
+
workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
714
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
715
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
716
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
717
|
+
workspace.orig_mean, workspace.orig_sd,
|
718
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
719
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
720
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
721
|
+
|
722
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
723
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
724
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
|
725
|
+
backup_recursion_state(workspace, *state_backup);
|
726
|
+
workspace.st = workspace.this_split_NA;
|
727
|
+
workspace.end = workspace.this_split_ix - 1;
|
728
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
729
|
+
restore_recursion_state(workspace, *state_backup);
|
730
|
+
}
|
731
|
+
|
732
|
+
/* right branch */
|
733
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
734
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
|
735
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
|
736
|
+
workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
737
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
738
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
739
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
740
|
+
workspace.orig_mean, workspace.orig_sd,
|
741
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
742
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
743
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
744
|
+
|
745
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
746
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
747
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
|
748
|
+
backup_recursion_state(workspace, *state_backup);
|
749
|
+
workspace.st = workspace.this_split_ix;
|
750
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
751
|
+
restore_recursion_state(workspace, *state_backup);
|
752
|
+
}
|
753
|
+
|
754
|
+
if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
|
755
|
+
workspace.best_gain = workspace.this_gain;
|
756
|
+
workspace.column_type_best = Categorical;
|
757
|
+
workspace.col_best = col;
|
758
|
+
memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
|
759
|
+
}
|
760
|
+
|
761
|
+
}
|
762
|
+
|
763
|
+
}
|
764
|
+
|
765
|
+
/* then ordinal */
|
766
|
+
for (size_t col = 0; col < input_data.ncols_ord; col++) {
|
767
|
+
|
768
|
+
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
769
|
+
|
770
|
+
/* same code as for categorical, but this time with split level as int instead of boolean array as subset */
|
771
|
+
split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
|
772
|
+
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], &workspace.buffer_cat_cnt[0],
|
773
|
+
&workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
|
774
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
|
775
|
+
&(workspace.this_gain), &workspace.buffer_subset_categ[0], &(workspace.this_split_lev));
|
776
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
777
|
+
|
778
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
779
|
+
|
780
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
781
|
+
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
782
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
783
|
+
|
784
|
+
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
|
785
|
+
|
786
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
787
|
+
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
788
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
|
789
|
+
workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
790
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
791
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
792
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
793
|
+
workspace.orig_mean, workspace.orig_sd,
|
794
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
795
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
796
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
797
|
+
|
798
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
799
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
800
|
+
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
801
|
+
backup_recursion_state(workspace, *state_backup);
|
802
|
+
workspace.end = workspace.this_split_NA - 1;
|
803
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
804
|
+
restore_recursion_state(workspace, *state_backup);
|
805
|
+
}
|
806
|
+
|
807
|
+
}
|
808
|
+
|
809
|
+
/* left branch */
|
810
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
811
|
+
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
812
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
|
813
|
+
workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
814
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
815
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
816
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
817
|
+
workspace.orig_mean, workspace.orig_sd,
|
818
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
819
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
820
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
821
|
+
|
822
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
823
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
824
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, LessOrEqual);
|
825
|
+
backup_recursion_state(workspace, *state_backup);
|
826
|
+
workspace.st = workspace.this_split_NA;
|
827
|
+
workspace.end = workspace.this_split_ix - 1;
|
828
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
829
|
+
restore_recursion_state(workspace, *state_backup);
|
830
|
+
}
|
831
|
+
|
832
|
+
|
833
|
+
|
834
|
+
/* right branch */
|
835
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
836
|
+
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
837
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
|
838
|
+
workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
|
839
|
+
&workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
840
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
841
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
842
|
+
workspace.orig_mean, workspace.orig_sd,
|
843
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
844
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
845
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
846
|
+
|
847
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
848
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
849
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
850
|
+
backup_recursion_state(workspace, *state_backup);
|
851
|
+
workspace.st = workspace.this_split_ix;
|
852
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
853
|
+
restore_recursion_state(workspace, *state_backup);
|
854
|
+
}
|
855
|
+
|
856
|
+
if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
|
857
|
+
workspace.best_gain = workspace.this_gain;
|
858
|
+
workspace.column_type_best = Ordinal;
|
859
|
+
workspace.col_best = col;
|
860
|
+
workspace.split_lev_best = workspace.this_split_lev;
|
861
|
+
}
|
862
|
+
|
863
|
+
}
|
864
|
+
|
865
|
+
}
|
866
|
+
|
867
|
+
/* avoid unnecessary memory usage */
|
868
|
+
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
869
|
+
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
870
|
+
if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
871
|
+
|
872
|
+
|
873
|
+
/* continue splitting further if meeting threshold criteria */
|
874
|
+
if (workspace.best_gain >= model_params.min_gain && !model_params.follow_all) {
|
875
|
+
|
876
|
+
/* check if depth limit is reached */
|
877
|
+
curr_depth++;
|
878
|
+
if (curr_depth >= model_params.max_depth) return;
|
879
|
+
|
880
|
+
/* discard outliers if any */
|
881
|
+
if (workspace.lev_has_outliers)
|
882
|
+
workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
|
883
|
+
|
884
|
+
/* assign rows to their corresponding branch */
|
885
|
+
switch(workspace.column_type_best) {
|
886
|
+
case Numeric:
|
887
|
+
{
|
888
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
|
889
|
+
workspace.st, workspace.end, workspace.split_point_best,
|
890
|
+
(bool)(input_data.has_NA[workspace.col_best]),
|
891
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
892
|
+
spl1 = LessOrEqual; spl2 = Greater;
|
893
|
+
set_tree_as_numeric(workspace.tree->back(), workspace.split_point_best, workspace.col_best);
|
894
|
+
break;
|
895
|
+
}
|
896
|
+
|
897
|
+
case Categorical:
|
898
|
+
{
|
899
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
|
900
|
+
workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
|
901
|
+
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
902
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
903
|
+
spl1 = InSubset; spl2 = NotInSubset;
|
904
|
+
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
905
|
+
&workspace.buffer_subset_categ_best[0], workspace.col_best);
|
906
|
+
break;
|
907
|
+
}
|
908
|
+
|
909
|
+
case Ordinal:
|
910
|
+
{
|
911
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
912
|
+
workspace.st, workspace.end, workspace.split_lev_best,
|
913
|
+
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
914
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
915
|
+
spl1 = LessOrEqual; spl2 = Greater;
|
916
|
+
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
917
|
+
break;
|
918
|
+
}
|
919
|
+
|
920
|
+
|
921
|
+
}
|
922
|
+
|
923
|
+
/* continue splitting recursively - need to remember from where */
|
924
|
+
ix1 = workspace.this_split_NA;
|
925
|
+
ix2 = workspace.this_split_ix;
|
926
|
+
ix3 = workspace.end;
|
927
|
+
|
928
|
+
/* NA branch */
|
929
|
+
if (workspace.st > workspace.this_split_NA &&
|
930
|
+
(workspace.st - workspace.this_split_NA) >= 2 * model_params.min_size_numeric) {
|
931
|
+
|
932
|
+
workspace.end = ix1 - 1;
|
933
|
+
(*workspace.tree)[tree_from].tree_NA = workspace.tree->size();
|
934
|
+
workspace.tree->emplace_back(tree_from, IsNa);
|
935
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth, true);
|
936
|
+
}
|
937
|
+
|
938
|
+
/* left branch */
|
939
|
+
if ((ix2 - ix1) >= 2 * model_params.min_size_numeric) {
|
940
|
+
workspace.st = ix1;
|
941
|
+
workspace.end = ix2 - 1;
|
942
|
+
(*workspace.tree)[tree_from].tree_left = workspace.tree->size();
|
943
|
+
workspace.tree->emplace_back(tree_from, spl1);
|
944
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
945
|
+
}
|
946
|
+
|
947
|
+
/* right branch */
|
948
|
+
if ((ix3 - ix2 + 1) >= 2 * model_params.min_size_numeric) {
|
949
|
+
workspace.st = ix2;
|
950
|
+
workspace.end = ix3;
|
951
|
+
(*workspace.tree)[tree_from].tree_right = workspace.tree->size();
|
952
|
+
workspace.tree->emplace_back(tree_from, spl2);
|
953
|
+
recursive_split_numeric(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
954
|
+
}
|
955
|
+
|
956
|
+
}
|
957
|
+
|
958
|
+
/* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
|
959
|
+
if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
|
960
|
+
|
961
|
+
if (tree_from == 0) {
|
962
|
+
workspace.tree->clear();
|
963
|
+
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
|
964
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
965
|
+
workspace.tree->pop_back();
|
966
|
+
} else {
|
967
|
+
switch((*workspace.tree)[tree_from].parent_branch) {
|
968
|
+
|
969
|
+
case IsNa:
|
970
|
+
{
|
971
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_NA = 0;
|
972
|
+
break;
|
973
|
+
}
|
974
|
+
|
975
|
+
case LessOrEqual:
|
976
|
+
{
|
977
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
|
978
|
+
break;
|
979
|
+
}
|
980
|
+
|
981
|
+
case Greater:
|
982
|
+
{
|
983
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
984
|
+
break;
|
985
|
+
}
|
986
|
+
|
987
|
+
case InSubset:
|
988
|
+
{
|
989
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
|
990
|
+
break;
|
991
|
+
}
|
992
|
+
|
993
|
+
case NotInSubset:
|
994
|
+
{
|
995
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
996
|
+
break;
|
997
|
+
}
|
998
|
+
}
|
999
|
+
workspace.tree->pop_back();
|
1000
|
+
}
|
1001
|
+
}
|
1002
|
+
|
1003
|
+
}
|
1004
|
+
|
1005
|
+
void process_categ_col(std::vector<Cluster> &cluster_root,
|
1006
|
+
std::vector<ClusterTree> &tree_root,
|
1007
|
+
size_t target_col_num, bool is_ord,
|
1008
|
+
Workspace &workspace,
|
1009
|
+
InputData &input_data,
|
1010
|
+
ModelParams &model_params,
|
1011
|
+
ModelOutputs &model_outputs)
|
1012
|
+
{
|
1013
|
+
if (model_params.max_depth <= 0) return;
|
1014
|
+
|
1015
|
+
/* extract necessary info from column and discard NAs */
|
1016
|
+
workspace.target_col_is_ord = is_ord;
|
1017
|
+
workspace.target_col_num = target_col_num - input_data.ncols_numeric;
|
1018
|
+
if (!workspace.target_col_is_ord) {
|
1019
|
+
workspace.target_categ_col = input_data.categorical_data + workspace.target_col_num * input_data.nrows;
|
1020
|
+
workspace.ncat_this = input_data.ncat[workspace.target_col_num];
|
1021
|
+
} else {
|
1022
|
+
workspace.target_categ_col = input_data.ordinal_data + (workspace.target_col_num - input_data.ncols_categ) * input_data.nrows;
|
1023
|
+
workspace.ncat_this = input_data.ncat_ord[workspace.target_col_num - input_data.ncols_categ];
|
1024
|
+
}
|
1025
|
+
workspace.untransf_target_col = workspace.target_categ_col;
|
1026
|
+
workspace.end = input_data.nrows - 1;
|
1027
|
+
workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_categ_col, 0, workspace.end);
|
1028
|
+
workspace.col_has_outliers = false;
|
1029
|
+
workspace.col_is_bin = workspace.ncat_this <= 2;
|
1030
|
+
workspace.prop_small_this = &model_params.prop_small[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1031
|
+
workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1032
|
+
|
1033
|
+
/* create cluster root and reset outlier scores for this column */
|
1034
|
+
workspace.clusters = &cluster_root;
|
1035
|
+
workspace.tree = &tree_root;
|
1036
|
+
std::fill(workspace.outlier_scores.begin(), workspace.outlier_scores.end(), (double)1.0);
|
1037
|
+
workspace.tree->emplace_back(0, Root);
|
1038
|
+
|
1039
|
+
|
1040
|
+
/* at first, see if there's a category with 1-2 observations among only categories with large counts */
|
1041
|
+
workspace.col_has_outliers = find_outlier_categories_no_cond(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
|
1042
|
+
workspace.ncat_this, workspace.end - workspace.st + 1,
|
1043
|
+
&workspace.buffer_subset_categ[0], &(workspace.orig_mean));
|
1044
|
+
|
1045
|
+
/* if there is any such case, create a cluster for them */
|
1046
|
+
if (workspace.col_has_outliers) {
|
1047
|
+
workspace.tree->back().clusters.push_back(0);
|
1048
|
+
workspace.clusters->emplace_back(NoType, Root);
|
1049
|
+
define_categ_cluster_no_cond(workspace.untransf_target_col, &workspace.ix_arr[0], workspace.st, workspace.end, workspace.ncat_this,
|
1050
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1051
|
+
&workspace.outlier_depth[0], workspace.clusters->back(),
|
1052
|
+
&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
|
1053
|
+
&workspace.buffer_subset_categ[0], workspace.orig_mean);
|
1054
|
+
workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
/* if no conditional outliers are required, stop there */
|
1058
|
+
if (model_params.max_depth == 0) return;
|
1059
|
+
|
1060
|
+
/* if the rest of the data is all one category, do not process it any further */
|
1061
|
+
if (workspace.ncat_this == 2 && workspace.col_has_outliers) return;
|
1062
|
+
|
1063
|
+
/* if there isn't a single catchable outlier category, skip */
|
1064
|
+
bool should_skip = true;
|
1065
|
+
for (int cat = 0; cat < workspace.ncat_this; cat++) {
|
1066
|
+
|
1067
|
+
if (workspace.prop_small_this[cat] > (long double)1 / (long double)(workspace.end - workspace.st + 1 - model_params.min_size_categ))
|
1068
|
+
should_skip = false;
|
1069
|
+
}
|
1070
|
+
if (should_skip) return;
|
1071
|
+
|
1072
|
+
|
1073
|
+
/* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
|
1074
|
+
if (
|
1075
|
+
(!model_params.categ_as_bin && !workspace.target_col_is_ord) ||
|
1076
|
+
(!model_params.ord_as_bin && workspace.target_col_is_ord) ||
|
1077
|
+
workspace.col_is_bin ||
|
1078
|
+
input_data.ncols_categ == (1 - ((workspace.target_col_is_ord)? 1 : 0))
|
1079
|
+
)
|
1080
|
+
{
|
1081
|
+
|
1082
|
+
/* calculate base information */
|
1083
|
+
workspace.base_info = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
|
1084
|
+
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1085
|
+
workspace.base_info_orig = workspace.base_info;
|
1086
|
+
|
1087
|
+
/* then split */
|
1088
|
+
recursive_split_categ(workspace, input_data, model_params, 0, false);
|
1089
|
+
}
|
1090
|
+
|
1091
|
+
|
1092
|
+
else {
|
1093
|
+
/* otherwise, process the column 1 category at a time */
|
1094
|
+
size_t st_orig = workspace.st;
|
1095
|
+
size_t end_orig = workspace.end;
|
1096
|
+
size_t cat_counts_bin[2];
|
1097
|
+
workspace.col_is_bin = true;
|
1098
|
+
workspace.already_split_main = false;
|
1099
|
+
workspace.base_info_orig = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
|
1100
|
+
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1101
|
+
workspace.tree->back().column_type = NoType;
|
1102
|
+
|
1103
|
+
|
1104
|
+
for (int cat = 0; cat < workspace.ncat_this - ((workspace.target_col_is_ord)? 1 : 0); cat++) {
|
1105
|
+
|
1106
|
+
workspace.st = st_orig;
|
1107
|
+
workspace.end = end_orig;
|
1108
|
+
|
1109
|
+
/* convert to binary */
|
1110
|
+
if (!workspace.target_col_is_ord) {
|
1111
|
+
|
1112
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
1113
|
+
workspace.buffer_bin_y[workspace.ix_arr[row]] = (workspace.untransf_target_col[workspace.ix_arr[row]] == cat)? 1 : 0;
|
1114
|
+
}
|
1115
|
+
cat_counts_bin[0] = workspace.end - workspace.st + 1 - input_data.cat_counts[ cat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1116
|
+
cat_counts_bin[1] = input_data.cat_counts[ cat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1117
|
+
|
1118
|
+
} else {
|
1119
|
+
|
1120
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
1121
|
+
workspace.buffer_bin_y[workspace.ix_arr[row]] = (workspace.untransf_target_col[workspace.ix_arr[row]] <= cat)? 1 : 0;
|
1122
|
+
}
|
1123
|
+
cat_counts_bin[0] = 0;
|
1124
|
+
cat_counts_bin[1] = workspace.end - workspace.st + 1;
|
1125
|
+
for (int catcat = 0; catcat <= cat; catcat++) {
|
1126
|
+
cat_counts_bin[0] += input_data.cat_counts[ catcat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1127
|
+
cat_counts_bin[1] -= input_data.cat_counts[ catcat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1128
|
+
}
|
1129
|
+
|
1130
|
+
}
|
1131
|
+
|
1132
|
+
if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
|
1133
|
+
workspace.target_categ_col = &workspace.buffer_bin_y[0];
|
1134
|
+
workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
|
1135
|
+
(*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
|
1136
|
+
workspace.tree->emplace_back(0, SubTrees);
|
1137
|
+
recursive_split_categ(workspace, input_data, model_params, 0, false);
|
1138
|
+
}
|
1139
|
+
|
1140
|
+
}
|
1141
|
+
(*workspace.tree)[0].binary_branches.shrink_to_fit();
|
1142
|
+
|
1143
|
+
}
|
1144
|
+
|
1145
|
+
}
|
1146
|
+
|
1147
|
+
|
1148
|
+
void recursive_split_categ(Workspace &workspace,
|
1149
|
+
InputData &input_data,
|
1150
|
+
ModelParams &model_params,
|
1151
|
+
size_t curr_depth, bool is_NA_branch)
|
1152
|
+
{
|
1153
|
+
/* idea is the same as its numeric counterpart, only splitting by another categorical
|
1154
|
+
is less clear how to do and offers different options */
|
1155
|
+
workspace.best_gain = -HUGE_VAL;
|
1156
|
+
workspace.column_type_best = NoType;
|
1157
|
+
workspace.lev_has_outliers = false;
|
1158
|
+
size_t ix1, ix2, ix3;
|
1159
|
+
SplitType spl1, spl2;
|
1160
|
+
size_t tree_from = workspace.tree->size() - 1;
|
1161
|
+
|
1162
|
+
/* when using 'follow_all' need to keep track of a lot more things */
|
1163
|
+
std::unique_ptr<RecursionState> state_backup;
|
1164
|
+
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
1165
|
+
|
1166
|
+
if (curr_depth > 0) {
|
1167
|
+
workspace.base_info_orig = total_info(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st, workspace.end,
|
1168
|
+
workspace.ncat_this, &workspace.buffer_cat_cnt[0]);
|
1169
|
+
|
1170
|
+
/* check that there's still more than 1 category */
|
1171
|
+
size_t ncat_present = 0;
|
1172
|
+
for (int cat = 0; cat < workspace.ncat_this; cat++) {
|
1173
|
+
ncat_present += (workspace.buffer_cat_cnt[cat])? 1 : 0;
|
1174
|
+
if (ncat_present >= 2) break;
|
1175
|
+
}
|
1176
|
+
if (ncat_present < 2) goto drop_if_not_needed;
|
1177
|
+
if (workspace.col_is_bin && workspace.ncat_this > 2) {
|
1178
|
+
workspace.base_info = total_info(&workspace.ix_arr[0], workspace.target_categ_col, workspace.st, workspace.end,
|
1179
|
+
2, &workspace.buffer_cat_cnt[0]);
|
1180
|
+
if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
|
1181
|
+
} else {
|
1182
|
+
workspace.base_info = workspace.base_info_orig;
|
1183
|
+
}
|
1184
|
+
}
|
1185
|
+
|
1186
|
+
/* split with each other column */
|
1187
|
+
|
1188
|
+
|
1189
|
+
/* first numeric */
|
1190
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++) {
|
1191
|
+
|
1192
|
+
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1193
|
+
if (input_data.skip_col[col]) continue;
|
1194
|
+
split_numericx_categy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
1195
|
+
workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
|
1196
|
+
&workspace.buffer_cat_cnt[0], (bool)(input_data.has_NA[col]), model_params.min_size_categ,
|
1197
|
+
model_params.take_mid, &(workspace.this_gain), &(workspace.this_split_point),
|
1198
|
+
&(workspace.this_split_ix), &(workspace.this_split_NA));
|
1199
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1200
|
+
|
1201
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
1202
|
+
|
1203
|
+
/* NA branch */
|
1204
|
+
if (workspace.this_split_NA > workspace.st &&
|
1205
|
+
(workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1206
|
+
|
1207
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1208
|
+
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
1209
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1210
|
+
&workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
|
1211
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1212
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1213
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1214
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1215
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1216
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1217
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1218
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1219
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1220
|
+
if (workspace.drop_cluster) {
|
1221
|
+
workspace.clusters->pop_back();
|
1222
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1223
|
+
}
|
1224
|
+
|
1225
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1226
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1227
|
+
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
1228
|
+
backup_recursion_state(workspace, *state_backup);
|
1229
|
+
workspace.end = workspace.this_split_NA - 1;
|
1230
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1231
|
+
restore_recursion_state(workspace, *state_backup);
|
1232
|
+
}
|
1233
|
+
|
1234
|
+
}
|
1235
|
+
|
1236
|
+
/* left branch */
|
1237
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1238
|
+
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
1239
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1240
|
+
&workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix,
|
1241
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1242
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1243
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1244
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1245
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1246
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1247
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1248
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1249
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1250
|
+
if (workspace.drop_cluster) {
|
1251
|
+
workspace.clusters->pop_back();
|
1252
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1256
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1257
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, LessOrEqual);
|
1258
|
+
backup_recursion_state(workspace, *state_backup);
|
1259
|
+
workspace.st = workspace.this_split_NA;
|
1260
|
+
workspace.end = workspace.this_split_ix;
|
1261
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1262
|
+
restore_recursion_state(workspace, *state_backup);
|
1263
|
+
}
|
1264
|
+
|
1265
|
+
|
1266
|
+
/* right branch */
|
1267
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1268
|
+
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
1269
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1270
|
+
&workspace.ix_arr[0], workspace.this_split_ix + 1, workspace.end,
|
1271
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1272
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1273
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1274
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1275
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1276
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1277
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1278
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1279
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1280
|
+
if (workspace.drop_cluster) {
|
1281
|
+
workspace.clusters->pop_back();
|
1282
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1283
|
+
}
|
1284
|
+
|
1285
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1286
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1287
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, Greater);
|
1288
|
+
backup_recursion_state(workspace, *state_backup);
|
1289
|
+
workspace.st = workspace.this_split_ix + 1;
|
1290
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1291
|
+
restore_recursion_state(workspace, *state_backup);
|
1292
|
+
}
|
1293
|
+
|
1294
|
+
|
1295
|
+
/* if this is the best split, remember it for later */
|
1296
|
+
if (workspace.this_gain > workspace.best_gain) {
|
1297
|
+
workspace.best_gain = workspace.this_gain;
|
1298
|
+
workspace.column_type_best = Numeric;
|
1299
|
+
workspace.col_best = col;
|
1300
|
+
workspace.split_point_best = workspace.this_split_point;
|
1301
|
+
}
|
1302
|
+
|
1303
|
+
}
|
1304
|
+
|
1305
|
+
}
|
1306
|
+
|
1307
|
+
|
1308
|
+
/* then categorical */
|
1309
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++) {
|
1310
|
+
|
1311
|
+
/* TODO: could make a pre-check that the splitting column up to this recursion still has
|
1312
|
+
more than 1 category, and skip for this and further recursions otherwise */
|
1313
|
+
|
1314
|
+
if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
|
1315
|
+
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
1316
|
+
|
1317
|
+
if (workspace.col_is_bin) {
|
1318
|
+
|
1319
|
+
split_categx_biny(&workspace.ix_arr[0], workspace.st, workspace.end,
|
1320
|
+
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1321
|
+
input_data.ncat[col], workspace.base_info, &workspace.buffer_cat_cnt[0],
|
1322
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
|
1323
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1324
|
+
&(workspace.this_gain), &workspace.buffer_subset_categ[0]);
|
1325
|
+
|
1326
|
+
/* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
|
1327
|
+
if (
|
1328
|
+
!isinf(workspace.this_gain) &&
|
1329
|
+
(
|
1330
|
+
(!workspace.target_col_is_ord && input_data.ncat[workspace.target_col_num] > 2) ||
|
1331
|
+
(workspace.target_col_is_ord && input_data.ncat_ord[workspace.target_col_num - input_data.ncols_categ] > 2)
|
1332
|
+
)
|
1333
|
+
)
|
1334
|
+
{
|
1335
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows,
|
1336
|
+
workspace.st, workspace.end, &workspace.buffer_subset_categ[0], input_data.ncat[col],
|
1337
|
+
(bool)input_data.has_NA[col + input_data.ncols_numeric],
|
1338
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1339
|
+
workspace.this_gain = categ_gain_from_split(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st,
|
1340
|
+
workspace.this_split_NA, workspace.this_split_ix, workspace.end,
|
1341
|
+
workspace.ncat_this, &workspace.buffer_cat_cnt[0], workspace.base_info_orig);
|
1342
|
+
}
|
1343
|
+
|
1344
|
+
} else {
|
1345
|
+
|
1346
|
+
if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
|
1347
|
+
split_categx_categy_subset(&workspace.ix_arr[0], workspace.st, workspace.end,
|
1348
|
+
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1349
|
+
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1350
|
+
&workspace.buffer_cat_sorted[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_cnt[0],
|
1351
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1352
|
+
&(workspace.this_gain), &workspace.buffer_subset_categ[0]);
|
1353
|
+
} else {
|
1354
|
+
split_categx_categy_separate(&workspace.ix_arr[0], workspace.st, workspace.end,
|
1355
|
+
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1356
|
+
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1357
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0],
|
1358
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric]),
|
1359
|
+
model_params.min_size_categ, &(workspace.this_gain));
|
1360
|
+
}
|
1361
|
+
|
1362
|
+
}
|
1363
|
+
|
1364
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1365
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
1366
|
+
|
1367
|
+
/* NA branch */
|
1368
|
+
workspace.this_split_NA = move_NAs_to_front(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
|
1369
|
+
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1370
|
+
|
1371
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1372
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
1373
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1374
|
+
&workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
|
1375
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1376
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1377
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1378
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1379
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1380
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1381
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1382
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1383
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1384
|
+
if (workspace.drop_cluster) {
|
1385
|
+
workspace.clusters->pop_back();
|
1386
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1387
|
+
}
|
1388
|
+
|
1389
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1390
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1391
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
1392
|
+
backup_recursion_state(workspace, *state_backup);
|
1393
|
+
workspace.end = workspace.this_split_NA - 1;
|
1394
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1395
|
+
restore_recursion_state(workspace, *state_backup);
|
1396
|
+
}
|
1397
|
+
|
1398
|
+
}
|
1399
|
+
|
1400
|
+
if (!model_params.cat_bruteforce_subset && !workspace.col_is_bin && input_data.ncat[col] > 2) {
|
1401
|
+
|
1402
|
+
/* sort by the splitting variable and iterate over to determine the split points */
|
1403
|
+
workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
|
1404
|
+
std::sort(&workspace.ix_arr[0] + workspace.this_split_NA, &workspace.ix_arr[0] + workspace.end + 1,
|
1405
|
+
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1406
|
+
workspace.this_split_ix = workspace.this_split_NA;
|
1407
|
+
|
1408
|
+
/* TODO: should instead use std::lower_bound to calculate the start and end indices of each category */
|
1409
|
+
for (size_t row = workspace.this_split_NA + 1; row <= workspace.end; row++) {
|
1410
|
+
|
1411
|
+
/* if the next observation is in a different category, then the split ends here */
|
1412
|
+
if (workspace.temp_ptr_x[workspace.ix_arr[row]] != workspace.temp_ptr_x[workspace.ix_arr[row-1]]) {
|
1413
|
+
|
1414
|
+
if ((row - workspace.this_split_ix) >= model_params.min_size_categ) {
|
1415
|
+
|
1416
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1417
|
+
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
|
1418
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1419
|
+
&workspace.ix_arr[0], workspace.this_split_ix, row - 1,
|
1420
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1421
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1422
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1423
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1424
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1425
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1426
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1427
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1428
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1429
|
+
if (workspace.drop_cluster) {
|
1430
|
+
workspace.clusters->pop_back();
|
1431
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1432
|
+
}
|
1433
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1434
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1435
|
+
workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.this_split_ix]]);
|
1436
|
+
backup_recursion_state(workspace, *state_backup);
|
1437
|
+
workspace.st = workspace.this_split_ix;
|
1438
|
+
workspace.end = row - 1;
|
1439
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1440
|
+
restore_recursion_state(workspace, *state_backup);
|
1441
|
+
}
|
1442
|
+
}
|
1443
|
+
workspace.this_split_ix = row;
|
1444
|
+
}
|
1445
|
+
}
|
1446
|
+
/* last category is given by the end indices */
|
1447
|
+
if ((workspace.end - workspace.this_split_ix) > model_params.min_size_categ) {
|
1448
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1449
|
+
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
|
1450
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1451
|
+
&workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
|
1452
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1453
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1454
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1455
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1456
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1457
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1458
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1459
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1460
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1461
|
+
if (workspace.drop_cluster) {
|
1462
|
+
workspace.clusters->pop_back();
|
1463
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1464
|
+
}
|
1465
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1466
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1467
|
+
workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
|
1468
|
+
backup_recursion_state(workspace, *state_backup);
|
1469
|
+
workspace.st = workspace.this_split_ix;
|
1470
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1471
|
+
restore_recursion_state(workspace, *state_backup);
|
1472
|
+
}
|
1473
|
+
|
1474
|
+
}
|
1475
|
+
|
1476
|
+
if (workspace.this_gain > workspace.best_gain) {
|
1477
|
+
workspace.best_gain = workspace.this_gain;
|
1478
|
+
workspace.column_type_best = Categorical;
|
1479
|
+
workspace.col_best = col;
|
1480
|
+
}
|
1481
|
+
|
1482
|
+
|
1483
|
+
} else {
|
1484
|
+
|
1485
|
+
/* split by subsets of categories */
|
1486
|
+
|
1487
|
+
if (input_data.ncat[col] == 2) {
|
1488
|
+
|
1489
|
+
workspace.buffer_subset_categ[0] = 1;
|
1490
|
+
workspace.buffer_subset_categ[1] = 0;
|
1491
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1492
|
+
(int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
|
1493
|
+
if (
|
1494
|
+
(workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
|
1495
|
+
(workspace.this_split_ix - workspace.this_split_NA) < model_params.min_size_categ
|
1496
|
+
) continue;
|
1497
|
+
|
1498
|
+
} else {
|
1499
|
+
|
1500
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1501
|
+
&workspace.buffer_subset_categ[0], input_data.ncat[col], false,
|
1502
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
1503
|
+
}
|
1504
|
+
|
1505
|
+
/* left branch */
|
1506
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1507
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
|
1508
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1509
|
+
&workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
|
1510
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1511
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1512
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1513
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1514
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1515
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1516
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1517
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1518
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1519
|
+
if (workspace.drop_cluster) {
|
1520
|
+
workspace.clusters->pop_back();
|
1521
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1522
|
+
}
|
1523
|
+
|
1524
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1525
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1526
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
|
1527
|
+
backup_recursion_state(workspace, *state_backup);
|
1528
|
+
workspace.st = workspace.this_split_NA;
|
1529
|
+
workspace.end = workspace.this_split_ix - 1;
|
1530
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1531
|
+
restore_recursion_state(workspace, *state_backup);
|
1532
|
+
}
|
1533
|
+
|
1534
|
+
/* right branch */
|
1535
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1536
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
|
1537
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1538
|
+
&workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
|
1539
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1540
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1541
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1542
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1543
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1544
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1545
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1546
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1547
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1548
|
+
if (workspace.drop_cluster) {
|
1549
|
+
workspace.clusters->pop_back();
|
1550
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1551
|
+
}
|
1552
|
+
|
1553
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1554
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1555
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
|
1556
|
+
backup_recursion_state(workspace, *state_backup);
|
1557
|
+
workspace.st = workspace.this_split_ix;
|
1558
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1559
|
+
restore_recursion_state(workspace, *state_backup);
|
1560
|
+
}
|
1561
|
+
|
1562
|
+
if (workspace.this_gain > workspace.best_gain) {
|
1563
|
+
workspace.best_gain = workspace.this_gain;
|
1564
|
+
workspace.column_type_best = Categorical;
|
1565
|
+
workspace.col_best = col;
|
1566
|
+
memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
|
1567
|
+
}
|
1568
|
+
|
1569
|
+
}
|
1570
|
+
|
1571
|
+
}
|
1572
|
+
|
1573
|
+
}
|
1574
|
+
|
1575
|
+
|
1576
|
+
/* then ordinal */
|
1577
|
+
for (size_t col = 0; col < input_data.ncols_ord; col++) {
|
1578
|
+
|
1579
|
+
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1580
|
+
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
1581
|
+
if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
|
1582
|
+
|
1583
|
+
split_ordx_categy(&workspace.ix_arr[0], workspace.st, workspace.end,
|
1584
|
+
input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
|
1585
|
+
input_data.ncat_ord[col], workspace.ncat_this,
|
1586
|
+
workspace.base_info_orig, &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
|
1587
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
|
1588
|
+
model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev));
|
1589
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1590
|
+
|
1591
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
1592
|
+
|
1593
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
1594
|
+
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
1595
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1596
|
+
|
1597
|
+
/* NA branch */
|
1598
|
+
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1599
|
+
|
1600
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1601
|
+
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
1602
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1603
|
+
&workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
|
1604
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1605
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1606
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1607
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1608
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1609
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1610
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1611
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1612
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1613
|
+
if (workspace.drop_cluster) {
|
1614
|
+
workspace.clusters->pop_back();
|
1615
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1616
|
+
}
|
1617
|
+
|
1618
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1619
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1620
|
+
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
1621
|
+
backup_recursion_state(workspace, *state_backup);
|
1622
|
+
workspace.end = workspace.this_split_NA - 1;
|
1623
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1624
|
+
restore_recursion_state(workspace, *state_backup);
|
1625
|
+
}
|
1626
|
+
|
1627
|
+
}
|
1628
|
+
|
1629
|
+
/* left branch */
|
1630
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1631
|
+
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
1632
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1633
|
+
&workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
|
1634
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1635
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1636
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1637
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1638
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1639
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1640
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1641
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1642
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1643
|
+
if (workspace.drop_cluster) {
|
1644
|
+
workspace.clusters->pop_back();
|
1645
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1646
|
+
}
|
1647
|
+
|
1648
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1649
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1650
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, LessOrEqual);
|
1651
|
+
backup_recursion_state(workspace, *state_backup);
|
1652
|
+
workspace.st = workspace.this_split_NA;
|
1653
|
+
workspace.end = workspace.this_split_ix - 1;
|
1654
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1655
|
+
restore_recursion_state(workspace, *state_backup);
|
1656
|
+
}
|
1657
|
+
|
1658
|
+
/* right branch */
|
1659
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1660
|
+
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
1661
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1662
|
+
&workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
|
1663
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1664
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1665
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1666
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1667
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1668
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1669
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1670
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1671
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1672
|
+
if (workspace.drop_cluster) {
|
1673
|
+
workspace.clusters->pop_back();
|
1674
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1675
|
+
}
|
1676
|
+
|
1677
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1678
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1679
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
1680
|
+
backup_recursion_state(workspace, *state_backup);
|
1681
|
+
workspace.st = workspace.this_split_ix;
|
1682
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1683
|
+
restore_recursion_state(workspace, *state_backup);
|
1684
|
+
}
|
1685
|
+
|
1686
|
+
|
1687
|
+
if (workspace.this_gain > workspace.best_gain) {
|
1688
|
+
workspace.best_gain = workspace.this_gain;
|
1689
|
+
workspace.column_type_best = Ordinal;
|
1690
|
+
workspace.col_best = col;
|
1691
|
+
workspace.split_lev_best = workspace.this_split_lev;
|
1692
|
+
}
|
1693
|
+
|
1694
|
+
}
|
1695
|
+
|
1696
|
+
}
|
1697
|
+
|
1698
|
+
|
1699
|
+
/* avoid unnecessary memory usage or repeats */
|
1700
|
+
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
1701
|
+
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
1702
|
+
if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
1703
|
+
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
|
1704
|
+
workspace.already_split_main = true;
|
1705
|
+
|
1706
|
+
|
1707
|
+
/* if the best gain found reaches the minimum required gain (and not all branches are being followed), continue splitting from the branches of the best column */
|
1708
|
+
if (workspace.best_gain >= model_params.min_gain && !model_params.follow_all) {
|
1709
|
+
|
1710
|
+
curr_depth++;
|
1711
|
+
if (curr_depth >= model_params.max_depth) goto drop_if_not_needed;
|
1712
|
+
|
1713
|
+
/* discard outliers if any */
|
1714
|
+
if (workspace.lev_has_outliers)
|
1715
|
+
workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
|
1716
|
+
|
1717
|
+
/* assign rows to their corresponding branch */
|
1718
|
+
switch(workspace.column_type_best) {
|
1719
|
+
case Numeric:
|
1720
|
+
{
|
1721
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
|
1722
|
+
workspace.st, workspace.end, workspace.split_point_best,
|
1723
|
+
(bool)(input_data.has_NA[workspace.col_best]),
|
1724
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1725
|
+
spl1 = LessOrEqual; spl2 = Greater;
|
1726
|
+
set_tree_as_numeric(workspace.tree->back(), workspace.split_point_best, workspace.col_best);
|
1727
|
+
break;
|
1728
|
+
}
|
1729
|
+
|
1730
|
+
case Ordinal:
|
1731
|
+
{
|
1732
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
1733
|
+
workspace.st, workspace.end, workspace.split_lev_best,
|
1734
|
+
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
1735
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1736
|
+
spl1 = LessOrEqual; spl2 = Greater;
|
1737
|
+
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
1738
|
+
break;
|
1739
|
+
}
|
1740
|
+
|
1741
|
+
case Categorical:
|
1742
|
+
{
|
1743
|
+
|
1744
|
+
if (input_data.ncat[workspace.col_best] == 2) {
|
1745
|
+
|
1746
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1747
|
+
workspace.st, workspace.end, (int)0,
|
1748
|
+
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1749
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1750
|
+
spl1 = InSubset; spl2 = NotInSubset;
|
1751
|
+
set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
|
1752
|
+
|
1753
|
+
} else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
|
1754
|
+
|
1755
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1756
|
+
workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
|
1757
|
+
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1758
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1759
|
+
spl1 = InSubset; spl2 = NotInSubset;
|
1760
|
+
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
1761
|
+
&workspace.buffer_subset_categ_best[0], workspace.col_best);
|
1762
|
+
|
1763
|
+
} else {
|
1764
|
+
spl1 = SingleCateg;
|
1765
|
+
workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
|
1766
|
+
std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
|
1767
|
+
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1768
|
+
set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
|
1769
|
+
|
1770
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
1771
|
+
if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] >= 0) {
|
1772
|
+
workspace.this_split_NA = row;
|
1773
|
+
break;
|
1774
|
+
}
|
1775
|
+
}
|
1776
|
+
}
|
1777
|
+
break;
|
1778
|
+
}
|
1779
|
+
|
1780
|
+
|
1781
|
+
}
|
1782
|
+
|
1783
|
+
|
1784
|
+
ix1 = workspace.this_split_NA;
|
1785
|
+
ix2 = workspace.this_split_ix;
|
1786
|
+
ix3 = workspace.end;
|
1787
|
+
|
1788
|
+
/* NA branch */
|
1789
|
+
if (workspace.st > workspace.this_split_NA &&
|
1790
|
+
(workspace.st - workspace.this_split_NA) >= 2 * model_params.min_size_categ) {
|
1791
|
+
|
1792
|
+
workspace.end = ix1 - 1;
|
1793
|
+
(*workspace.tree)[tree_from].tree_NA = workspace.tree->size();
|
1794
|
+
workspace.tree->emplace_back(tree_from, IsNa);
|
1795
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, true);
|
1796
|
+
}
|
1797
|
+
|
1798
|
+
if (spl1 == SingleCateg) {
|
1799
|
+
|
1800
|
+
/* TODO: this should be done instead in a loop per category looking for the start and end positions
|
1801
|
+
in ix_arr of each category using std::lower_bound */
|
1802
|
+
|
1803
|
+
/* TODO: it's not necessary to backup everything like when using 'follow_all', only need 'best_col' and 'temp_ptr_x' */
|
1804
|
+
state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
1805
|
+
for (int cat = 1; cat < input_data.ncat[workspace.col_best]; cat++) {
|
1806
|
+
|
1807
|
+
/* TODO: this is inefficient when some categories are not present, should instead at first do a pass over 'ix_arr'
|
1808
|
+
to calculate the start and end indices of each category, then loop over that array instead */
|
1809
|
+
for (size_t row = ix1 + 1; row < ix3; row++) {
|
1810
|
+
if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] == cat) {
|
1811
|
+
if ((row - ix1) >= 2 * model_params.min_size_categ) {
|
1812
|
+
(*workspace.tree)[tree_from].binary_branches[cat-1] = workspace.tree->size();
|
1813
|
+
workspace.tree->emplace_back(tree_from, spl1);
|
1814
|
+
backup_recursion_state(workspace, *state_backup);
|
1815
|
+
workspace.st = ix1;
|
1816
|
+
workspace.end = row - 1;
|
1817
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
1818
|
+
restore_recursion_state(workspace, *state_backup);
|
1819
|
+
}
|
1820
|
+
ix1 = row;
|
1821
|
+
break;
|
1822
|
+
}
|
1823
|
+
else if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] > cat) {
|
1824
|
+
ix1 = row;
|
1825
|
+
break;
|
1826
|
+
}
|
1827
|
+
}
|
1828
|
+
|
1829
|
+
}
|
1830
|
+
/* last category is given by the end index */
|
1831
|
+
if ((ix3 - ix1) >= 2 * model_params.min_size_categ) {
|
1832
|
+
(*workspace.tree)[tree_from].binary_branches[input_data.ncat[workspace.col_best]-1] = workspace.tree->size();
|
1833
|
+
workspace.tree->emplace_back(tree_from, spl1);
|
1834
|
+
workspace.st = ix1;
|
1835
|
+
workspace.end = ix3;
|
1836
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
1837
|
+
} else {
|
1838
|
+
(*workspace.tree)[tree_from].binary_branches.push_back(0);
|
1839
|
+
}
|
1840
|
+
|
1841
|
+
} else {
|
1842
|
+
/* numeric, ordinal, and subset split */
|
1843
|
+
|
1844
|
+
/* left branch */
|
1845
|
+
if ((ix2 - ix1) >= 2 * model_params.min_size_categ) {
|
1846
|
+
workspace.st = ix1;
|
1847
|
+
workspace.end = ix2 - 1;
|
1848
|
+
(*workspace.tree)[tree_from].tree_left = workspace.tree->size();
|
1849
|
+
workspace.tree->emplace_back(tree_from, spl1);
|
1850
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
1851
|
+
}
|
1852
|
+
|
1853
|
+
/* right branch */
|
1854
|
+
if ((ix3 - ix2) > 2 * model_params.min_size_categ) {
|
1855
|
+
workspace.st = ix2;
|
1856
|
+
workspace.end = ix3;
|
1857
|
+
(*workspace.tree)[tree_from].tree_right = workspace.tree->size();
|
1858
|
+
workspace.tree->emplace_back(tree_from, spl2);
|
1859
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
1860
|
+
}
|
1861
|
+
|
1862
|
+
}
|
1863
|
+
|
1864
|
+
|
1865
|
+
}
|
1866
|
+
|
1867
|
+
|
1868
|
+
/* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
|
1869
|
+
drop_if_not_needed:
|
1870
|
+
if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
|
1871
|
+
|
1872
|
+
if (tree_from == 0) {
|
1873
|
+
workspace.tree->clear();
|
1874
|
+
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
|
1875
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
1876
|
+
workspace.tree->pop_back();
|
1877
|
+
} else {
|
1878
|
+
switch((*workspace.tree)[tree_from].parent_branch) {
|
1879
|
+
|
1880
|
+
case IsNa:
|
1881
|
+
{
|
1882
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_NA = 0;
|
1883
|
+
workspace.tree->pop_back();
|
1884
|
+
break;
|
1885
|
+
}
|
1886
|
+
|
1887
|
+
case LessOrEqual:
|
1888
|
+
{
|
1889
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
|
1890
|
+
workspace.tree->pop_back();
|
1891
|
+
break;
|
1892
|
+
}
|
1893
|
+
|
1894
|
+
case Greater:
|
1895
|
+
{
|
1896
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
1897
|
+
workspace.tree->pop_back();
|
1898
|
+
break;
|
1899
|
+
}
|
1900
|
+
|
1901
|
+
case InSubset:
|
1902
|
+
{
|
1903
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
|
1904
|
+
workspace.tree->pop_back();
|
1905
|
+
break;
|
1906
|
+
}
|
1907
|
+
|
1908
|
+
case NotInSubset:
|
1909
|
+
{
|
1910
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
1911
|
+
workspace.tree->pop_back();
|
1912
|
+
break;
|
1913
|
+
}
|
1914
|
+
|
1915
|
+
case SingleCateg:
|
1916
|
+
{
|
1917
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].binary_branches.back() = 0;
|
1918
|
+
workspace.tree->pop_back();
|
1919
|
+
break;
|
1920
|
+
}
|
1921
|
+
|
1922
|
+
case SubTrees:
|
1923
|
+
{
|
1924
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].binary_branches.pop_back();
|
1925
|
+
workspace.tree->pop_back();
|
1926
|
+
break;
|
1927
|
+
}
|
1928
|
+
}
|
1929
|
+
}
|
1930
|
+
}
|
1931
|
+
|
1932
|
+
}
|