outliertree 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +674 -0
- data/NOTICE.txt +14 -0
- data/README.md +107 -0
- data/ext/outliertree/ext.cpp +260 -0
- data/ext/outliertree/extconf.rb +21 -0
- data/lib/outliertree.rb +17 -0
- data/lib/outliertree/dataset.rb +35 -0
- data/lib/outliertree/model.rb +128 -0
- data/lib/outliertree/result.rb +190 -0
- data/lib/outliertree/version.rb +3 -0
- data/vendor/outliertree/LICENSE +674 -0
- data/vendor/outliertree/README.md +155 -0
- data/vendor/outliertree/src/Makevars +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +123 -0
- data/vendor/outliertree/src/Rwrapper.cpp +1225 -0
- data/vendor/outliertree/src/cat_outlier.cpp +328 -0
- data/vendor/outliertree/src/clusters.cpp +972 -0
- data/vendor/outliertree/src/fit_model.cpp +1932 -0
- data/vendor/outliertree/src/misc.cpp +685 -0
- data/vendor/outliertree/src/outlier_tree.hpp +758 -0
- data/vendor/outliertree/src/predict.cpp +706 -0
- data/vendor/outliertree/src/split.cpp +1098 -0
- metadata +150 -0
@@ -0,0 +1,1932 @@
|
|
1
|
+
/********************************************************************************************************************
|
2
|
+
* Explainable outlier detection
|
3
|
+
*
|
4
|
+
* Tries to detect outliers by generating decision trees that attempt to predict the values of each column based on
|
5
|
+
* each other column, testing in each branch of every tried split (if it meets some minimum criteria) whether there
|
6
|
+
* are observations that seem too distant from the others in a 1-D distribution for the column that the split tries
|
7
|
+
* to "predict" (will not generate a score for each observation).
|
8
|
+
* Splits are based on gain, while outlierness is based on confidence intervals.
|
9
|
+
* Similar in spirit to the GritBot software developed by RuleQuest research. Reference article is:
|
10
|
+
* Cortes, David. "Explainable outlier detection through decision tree conditioning."
|
11
|
+
* arXiv preprint arXiv:2001.00636 (2020).
|
12
|
+
*
|
13
|
+
*
|
14
|
+
* Copyright 2020 David Cortes.
|
15
|
+
*
|
16
|
+
* Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
|
17
|
+
* such as R or Python.
|
18
|
+
*
|
19
|
+
* This file is part of OutlierTree.
|
20
|
+
*
|
21
|
+
* OutlierTree is free software: you can redistribute it and/or modify
|
22
|
+
* it under the terms of the GNU General Public License as published by
|
23
|
+
* the Free Software Foundation, either version 3 of the License, or
|
24
|
+
* (at your option) any later version.
|
25
|
+
*
|
26
|
+
* OutlierTree is distributed in the hope that it will be useful,
|
27
|
+
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
28
|
+
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
29
|
+
* GNU General Public License for more details.
|
30
|
+
*
|
31
|
+
* You should have received a copy of the GNU General Public License
|
32
|
+
* along with OutlierTree. If not, see <https://www.gnu.org/licenses/>.
|
33
|
+
********************************************************************************************************************/
|
34
|
+
#include "outlier_tree.hpp"
|
35
|
+
|
36
|
+
/* Fit outliers model based on conditional distributions obtained through decision-tree splitting
|
37
|
+
*
|
38
|
+
* Note1: the function here will not perform any data validation - it must be done from outside already.
|
39
|
+
* Note2: the data types (double/int) were chosen due to R's internal representations of data structures,
|
40
|
+
* which only supports those types.
|
41
|
+
*
|
42
|
+
* Parameters:
|
43
|
+
* - model_outputs (out)
|
44
|
+
* Struct with the model outputs required for prediction time (trees and clusters) and information about identified outliers
|
45
|
+
* required to display their statistics. If there was any previous information from fitting the model to other data, it will
|
46
|
+
* be overwritten.
|
47
|
+
* - numeric_data[n * m1] (in)
|
48
|
+
* Array with numerical columns in the data. Must be ordered by columns like Fortran arrays.
|
49
|
+
* Missing values should be encoded as NaN. Infinite values in most sections are treated as NaN too.
|
50
|
+
* Binary or boolean columns must be passed as categorical.
|
51
|
+
* If there are no numerical columns, pass NULL.
|
52
|
+
* - ncols_numeric (in)
|
53
|
+
* Number of numeric columns in the array 'numeric_data'.
|
54
|
+
* - categorical_data[n * m2] (in)
|
55
|
+
* Array with categorical columns in the data. Must be ordered by columns like Fortran arrays.
|
56
|
+
* Negative numbers will be interpreted as missing values. Numeration must start at zero and be
|
57
|
+
* contiguous (i.e. if there's category 2, must also have category 1).
|
58
|
+
* If there are no categorical columns, pass NULL.
|
59
|
+
* - ncols_categ (in)
|
60
|
+
* Number of categorical columns in the array 'categorical_data'.
|
61
|
+
* - ncat[m2] (in)
|
62
|
+
* Number of categories in each categorical column. If there are no categorical columns, pass NULL.
|
63
|
+
* - ordinal_data[n * m3] (in)
|
64
|
+
* Array with ordinal categorical columns in the data. Must be ordered by columns like Fortran arrays.
|
65
|
+
* Same rules as for categorical data. Note that the order will only be taken into consideration when
|
66
|
+
* producing splits by these columns, but outliers are still detected in the same way as for categoricals.
|
67
|
+
* Binary or boolean columns must be passed as categorical (i.e. minimum categories in a column is 3).
|
68
|
+
* If there are no ordinal columns, pass NULL.
|
69
|
+
* - ncols_ord (in)
|
70
|
+
* Number of ordinal columns in the array 'ordinal_data'.
|
71
|
+
* - ncat_ord[m3] (in)
|
72
|
+
* Number of categories in each ordinal column. If there are no ordinal columns, pass NULL.
|
73
|
+
* - nrows (in)
|
74
|
+
* Number of rows in the arrays passed above.
|
75
|
+
* - cols_ignore[m1 + m2 + m3] (in)
|
76
|
+
* Boolean array indicating which columns should only be used as splitting criterion for other columns,
|
77
|
+
* while being ignored at the moment of finding outlier values in them. Pass NULL if outliers are to be
|
78
|
+
* searched for in all columns (this is the default).
|
79
|
+
* - nthreads (in)
|
80
|
+
* Number of parallel threads to use. Should not be higher than the number of columns.
|
81
|
+
* Note that the more threads used, the more memory will need to be allocated.
|
82
|
+
* - categ_as_bin (in)
|
83
|
+
* Whether to binarize categorical columns at each category to split them by another categorical column.
|
84
|
+
* If this is false and 'cat_bruteforce_subset' is also false, then when splitting a categorical or ordinal
|
85
|
+
* variable by another categorical, it will have one branch per category of the splitting column. Ignored
|
86
|
+
* when splitting by numerical and ordinal. Overrides 'cat_bruteforce_subset' when passing true.
|
87
|
+
* - ord_as_bin (in)
|
88
|
+
* Same as above, but binarization is by less/greater than a level in the order.
|
89
|
+
* - cat_bruteforce_subset (in)
|
90
|
+
* Whether to do a brute-force search over all possible binary splits of grouped subsets of categories when
|
91
|
+
* splitting a categorical or ordinal column by another categorical column. If this is false and 'categ_as_bin'
|
92
|
+
* is also false, then when splitting a categorical or ordinal variable by another categorical, it will have
|
93
|
+
* one branch per category of the splitting column. Ignored when splitting by numerical and ordinal.
|
94
|
+
* Will be ignored when passing 'categ_as_bin' = true.
|
95
|
+
* - categ_from_maj (in)
|
96
|
+
* Whether to flag outliers in categorical variables according to the number of observations not belonging to
|
97
|
+
* the majority class (formula will be (n-n_maj)/(n * p_prior) < 1/(z_outlier^2) for each category). If passing
|
98
|
+
* 'false', will instead look for outliers in categorical variables based on being a minority and having a gap
|
99
|
+
* with respect to other categories, even if there is no dominant majority.
|
100
|
+
* - max_depth (in)
|
101
|
+
* Max depth of decision trees that generate conditional distributions (subsets of the data) in which to look
|
102
|
+
* for outliers.
|
103
|
+
* - max_perc_outliers (in)
|
104
|
+
* Model parameter. Approximate maximum percentage of outlier observations in each cluster. Default value is 0.01.
|
105
|
+
* - min_size_numeric (in)
|
106
|
+
* Minimum size that numeric clusters and splits on numeric variables can have. Default value is 35.
|
107
|
+
* - min_size_categ (in)
|
108
|
+
* Same but for categoricals. Default value is 75.
|
109
|
+
* - min_gain (in)
|
110
|
+
* Minimum gain that a split must produce in order not to discard it. Default value is 0.01 (in GritBot it's 0.000001).
|
111
|
+
* - gain_as_pct (in)
|
112
|
+
* Whether the gain above should be taken in absolute terms (sd_full - (n1*sd1 + n2*sd2)/n), or as a percentage
|
113
|
+
* ( (sd_full - (n1*sd1 + n2*sd2)/n) / sd_full ) (Replace 'sd' with shannon entropy for categorical variables).
|
114
|
+
* Taking it in absolute terms will prefer making more splits on columns that have a large variance, while taking it
|
115
|
+
* as a percentage might be more restrictive on them and might create deeper trees in some columns.
|
116
|
+
* - follow_all (in)
|
117
|
+
* Whether to create new tree branches (and continue creating new splits from all of them) from every split that meets the
|
118
|
+
* minimum gain or not. Doing so (which GritBot doesn't) will make the procedure much slower, but can flag more observations
|
119
|
+
* as outliers (with a much larger false-positive rate). Default is 'false'.
|
120
|
+
* - z_norm (in)
|
121
|
+
* Maximum Z value that is considered as normal in a distribution. Default value is 2.67 (percentile 99)
|
122
|
+
* - z_outlier (in)
|
123
|
+
* Minimum Z value that can be considered as outlier in numerical columns. Not used for categorical or ordinal columns.
|
124
|
+
*
|
125
|
+
* Returns:
|
126
|
+
* Whether any outliers were identified in the data to which the model was fit.
|
127
|
+
*/
|
128
|
+
bool fit_outliers_models(ModelOutputs &model_outputs,
                         double *restrict numeric_data, size_t ncols_numeric,
                         int *restrict categorical_data, size_t ncols_categ, int *restrict ncat,
                         int *restrict ordinal_data, size_t ncols_ord, int *restrict ncat_ord,
                         size_t nrows, char *restrict cols_ignore, int nthreads,
                         bool categ_as_bin, bool ord_as_bin, bool cat_bruteforce_subset, bool categ_from_maj, bool take_mid,
                         size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
                         double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
{
    /* Overview of the steps performed below:
     *   1. Pack the arguments into 'ModelParams' / 'InputData' structs.
     *   2. Reset the per-column holders in 'model_outputs' (the object may come from a previous fit).
     *   3. Compute category offsets, counts and proportion limits for categorical/ordinal columns.
     *   4. Flag numeric columns through 'check_missing_no_variance'.
     *   5. Process every column as a target (in parallel), then merge each column's per-row
     *      outlier information into the '*_final' arrays of 'model_outputs'.
     * Columns are indexed as [numeric | categorical | ordinal] throughout. */

    /* put parameters and data into structs to avoid passing too many function arguments each time */
    /* NOTE(review): 'take_mid' is only stored here and later forwarded to the numeric split routines;
       its exact meaning is defined there - confirm before documenting it in the header comment. */
    double z_tail = z_outlier - z_norm;
    ModelParams model_params = {
        categ_as_bin, ord_as_bin, cat_bruteforce_subset, categ_from_maj, take_mid,
        max_depth, max_perc_outliers, min_size_numeric, min_size_categ,
        min_gain, gain_as_pct, follow_all, z_norm, z_outlier, z_tail,
        std::vector<long double>()
    };

    size_t tot_cols = ncols_numeric + ncols_categ + ncols_ord;
    InputData input_data = {
        numeric_data, ncols_numeric, categorical_data, ncols_categ, ncat,
        ordinal_data, ncols_ord, ncat_ord, nrows, tot_cols, std::vector<char>(),
        std::vector<char>(), -1, std::vector<size_t>(),
    };

    model_outputs.ncat.assign(ncat, ncat + ncols_categ);
    model_outputs.ncat_ord.assign(ncat_ord, ncat_ord + ncols_ord);
    model_outputs.ncols_numeric = ncols_numeric;
    model_outputs.ncols_categ = ncols_categ;
    model_outputs.ncols_ord = ncols_ord;
    model_outputs.max_depth = max_depth;

    /* 'assign' (rather than 'resize') so that values left over from a previous fit of the same
       'model_outputs' object are overwritten - 'resize' would only initialize newly-added elements */
    model_outputs.min_outlier_any_cl.assign(model_outputs.ncols_numeric, -HUGE_VAL);
    model_outputs.max_outlier_any_cl.assign(model_outputs.ncols_numeric, HUGE_VAL);
    model_outputs.cat_outlier_any_cl.clear();
    model_outputs.cat_outlier_any_cl.resize(model_outputs.ncols_categ + model_outputs.ncols_ord);

    /* never use more threads than there are columns; without OpenMP a single workspace is enough */
    if (tot_cols < (size_t)nthreads)
        nthreads = (int) tot_cols;
    #ifndef _OPENMP
        std::vector<Workspace> workspace(1);
    #else
        std::vector<Workspace> workspace(nthreads);
    #endif
    workspace.shrink_to_fit();

    /* in case the model was already fit from before */
    model_outputs.all_clusters.clear();
    model_outputs.all_trees.clear();
    allocate_row_outputs(model_outputs, nrows, max_depth);

    /* initialize info holders as needed */
    bool found_outliers = false;
    input_data.has_NA.resize(tot_cols, false);
    input_data.skip_col.resize(tot_cols, false);
    model_outputs.start_ix_cat_counts.resize(ncols_categ + ncols_ord + 1);
    /* same reasoning as above: a stale Log/Exp flag from an earlier fit must not survive */
    model_outputs.col_transf.assign(ncols_numeric, NoTransf);
    model_outputs.transf_offset.resize(ncols_numeric);
    model_outputs.sd_div.resize(ncols_numeric);
    model_outputs.min_decimals_col.resize(ncols_numeric);

    /* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
    model_outputs.start_ix_cat_counts[0] = 0;
    if (tot_cols > ncols_numeric) {
        input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[0], input_data.ncat, input_data.ncols_categ,
                                                          (bool*) &input_data.skip_col[ncols_numeric]);
        input_data.max_categ = calculate_category_indices(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], input_data.ncat_ord, input_data.ncols_ord,
                                                          (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ], input_data.max_categ);
    } else {
        input_data.max_categ = 0;
    }

    /* now allocate arrays for proportions */
    input_data.cat_counts.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord], 0);
    model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
    model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);

    /* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
    if (tot_cols > ncols_numeric) {
        #pragma omp parallel
        {
            #pragma omp sections
            {

                /* section 1: categorical columns */
                #pragma omp section
                {
                    if (ncols_categ > 0) {
                        calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
                                                 input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
                                                 (bool*) &input_data.has_NA[ncols_numeric], (bool*) &input_data.skip_col[input_data.ncols_numeric],
                                                 std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );

                        check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[0], &input_data.cat_counts[0], input_data.ncat,
                                                   input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
                                                   (bool*) &input_data.skip_col[input_data.ncols_numeric],
                                                   std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
                    }


                }

                /* section 2: ordinal columns (their offsets start right after the categorical ones) */
                #pragma omp section
                {
                    if (ncols_ord > 0) {
                        calculate_all_cat_counts(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
                                                 input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
                                                 (bool*) &input_data.has_NA[input_data.ncols_numeric + input_data.ncols_categ],
                                                 (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
                                                 std::max((int)1, nthreads - (int)input_data.ncols_categ) );

                        check_cat_col_unsplittable(&model_outputs.start_ix_cat_counts[input_data.ncols_categ], &input_data.cat_counts[0], input_data.ncat_ord,
                                                   ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
                                                   (bool*) &input_data.skip_col[input_data.ncols_numeric + input_data.ncols_categ],
                                                   std::max((int)1, nthreads - (int)input_data.ncols_categ));
                    }
                }
            }

        }


        /* calculate proportion limit and CI for each category of each column */
        calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[0],
                                      &input_data.cat_counts[0], input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
        calculate_lowerlim_proportion(&model_params.prop_small[0], &model_outputs.prop_categ[0], &model_outputs.start_ix_cat_counts[input_data.ncols_categ],
                                      &input_data.cat_counts[0], input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
    }

    /* for numerical columns, check if they have NAs or if total variance is too small */
    check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
                              (bool*) &input_data.has_NA[0], (bool*) &input_data.skip_col[0],
                              model_outputs.min_decimals_col.data(), nthreads);

    /* determine an approximate size for the output clusters, and reserve memory right away */
    model_outputs.all_clusters.resize(tot_cols);
    model_outputs.all_trees.resize(tot_cols);
    #pragma omp parallel for shared(model_outputs, input_data, model_params, tot_cols)
    for (size_t_for col = 0; col < tot_cols; col++) {
        if (input_data.skip_col[col]) continue;
        if (cols_ignore != NULL && cols_ignore[col]) continue;
        model_outputs.all_clusters[col].reserve(tot_cols * std::min(2 * input_data.nrows, pow2(model_params.max_depth + 1)));
        model_outputs.all_trees[col].reserve( square(model_params.max_depth) );
        /* this is not exact as categoricals and ordinals can also be split multiple times */
    }


    /* now run the procedure on each column separately */
    int tid;
    nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
    for (size_t_for col = 0; col < tot_cols; col++) {

        /* note: 'skip_col' only excludes numeric columns from being used as targets here */
        if (cols_ignore != NULL && cols_ignore[col]) continue;
        if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
        tid = omp_get_thread_num();

        /* re-use thread-private memory if possible */
        if (!check_workspace_is_allocated(workspace[tid]))
            allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);

        /* numerical column */
        if (col < input_data.ncols_numeric) {
            process_numeric_col(model_outputs.all_clusters[col],
                                model_outputs.all_trees[col],
                                col,
                                workspace[tid],
                                input_data,
                                model_params, model_outputs);
            calculate_cluster_minimums(model_outputs, col);
        }

        /* categorical column */
        else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
            process_categ_col(model_outputs.all_clusters[col],
                              model_outputs.all_trees[col],
                              col, false,
                              workspace[tid],
                              input_data,
                              model_params, model_outputs);
            calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
        }

        /* ordinal column */
        else {
            process_categ_col(model_outputs.all_clusters[col],
                              model_outputs.all_trees[col],
                              col, true,
                              workspace[tid],
                              input_data,
                              model_params, model_outputs);
            calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
        }

        /* shrink the dynamic vectors to what ended up used only */
        #ifdef TEST_MODE_DEFINE
        prune_unused_trees(model_outputs.all_trees[col]);
        #endif
        if (
            model_outputs.all_clusters[col].size() == 0 ||
            model_outputs.all_trees[col].size() == 0 ||
            check_tree_is_not_needed(model_outputs.all_trees[col][0])
            )
        {
            model_outputs.all_trees[col].clear();
            model_outputs.all_clusters[col].clear();
        }
        model_outputs.all_trees[col].shrink_to_fit();
        model_outputs.all_clusters[col].shrink_to_fit();

        /* simplify single-elements in subset to 'equals' or 'not equals' */
        simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
        simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);

        /* remember only the best (rarest) value for each row */
        /* The '*_final' arrays are shared across threads, hence the critical section.
           A row's stored outlier is replaced by this column's candidate when any of these holds:
             (a) nothing was stored for the row yet (final score >= 1) - this test comes first so
                 that the stored column/cluster indices are only dereferenced when they are set;
             (b) the candidate is at a smaller depth, unless only the candidate's cluster
                 has an NA branch;
             (c) the stored cluster has an NA branch and the candidate's does not;
             (d) same depth and same NA-branch status, candidate's cluster is larger;
             (e) same depth, cluster size and NA-branch status, candidate's score is lower. */
        #pragma omp critical
        if (workspace[tid].col_has_outliers) {

            found_outliers = true;
            for (size_t row = 0; row < input_data.nrows; row++) {

                if (workspace[tid].outlier_scores[row] < 1.0) {

                    if (
                        model_outputs.outlier_scores_final[row] >= 1.0 ||
                        (
                            workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
                            (
                                !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
                                model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
                            )
                        ) ||
                        (
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
                            !model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
                        ) ||
                        (
                            workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
                                ==
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
                                &&
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
                                <
                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
                        ) ||
                        (
                            workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
                                ==
                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
                                &&
                            model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
                                ==
                            model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
                                &&
                            workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
                        )
                    )
                    {
                        model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
                        model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
                        model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
                        model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
                        model_outputs.outlier_columns_final[row] = col;
                    }
                }

            }
        }


    }

    /* once finished, determine how many decimals to report for numerical outliers */
    if (found_outliers)
        calc_min_decimals_to_print(model_outputs, input_data.numeric_data, nthreads);

    #ifdef TEST_MODE_DEFINE
    for (size_t col = 0; col < tot_cols; col++) {
        std::cout << "col " << col << " has " << model_outputs.all_clusters[col].size() << " clusters [" << model_outputs.all_trees[col].size() << " trees]" << std::endl;
    }

    find_new_outliers(numeric_data,
                      categorical_data,
                      ordinal_data,
                      nrows, nthreads, model_outputs);


    // /* extract data for only one row */
    // std::vector<double> num_data_row(ncols_numeric);
    // std::vector<int> cat_data_row(ncols_categ);
    // std::vector<int> ord_data_row(ncols_ord);
    // size_t chosen_row = 38;
    // for (size_t rowcol = 0; rowcol < ncols_numeric; rowcol++)
    //     num_data_row.at(rowcol) = numeric_data[chosen_row + rowcol * nrows];
    // for (size_t rowcol = 0; rowcol < ncols_categ; rowcol++)
    //     cat_data_row.at(rowcol) = categorical_data[chosen_row + rowcol * nrows];
    // for (size_t rowcol = 0; rowcol < ncols_ord; rowcol++)
    //     ord_data_row.at(rowcol) = ordinal_data[chosen_row + rowcol * nrows];


    // find_new_outliers(&num_data_row[0],
    //                   &cat_data_row[0],
    //                   &ord_data_row[0],
    //                   1, 1, model_outputs);
    // calc_min_printable_digits(model_outputs);
    #endif

    return found_outliers;
}
|
437
|
+
|
438
|
+
void process_numeric_col(std::vector<Cluster> &cluster_root,
|
439
|
+
std::vector<ClusterTree> &tree_root,
|
440
|
+
size_t target_col_num,
|
441
|
+
Workspace &workspace,
|
442
|
+
InputData &input_data,
|
443
|
+
ModelParams &model_params,
|
444
|
+
ModelOutputs &model_outputs)
|
445
|
+
{
|
446
|
+
/* discard NAs and infinites */
|
447
|
+
workspace.target_col_num = target_col_num;
|
448
|
+
workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
|
449
|
+
workspace.orig_target_col = workspace.target_numeric_col;
|
450
|
+
workspace.end = input_data.nrows - 1;
|
451
|
+
workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_numeric_col, 0, workspace.end, true);
|
452
|
+
workspace.col_has_outliers = false;
|
453
|
+
|
454
|
+
/* check for problematic distributions - need to sort data first */
|
455
|
+
std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
|
456
|
+
[&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
|
457
|
+
|
458
|
+
long double running_mean = 0;
|
459
|
+
long double mean_prev = 0;
|
460
|
+
long double running_ssq = 0;
|
461
|
+
double xval;
|
462
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
463
|
+
xval = workspace.target_numeric_col[workspace.ix_arr[row]];
|
464
|
+
running_mean += (xval - running_mean) / (long double)(row - workspace.st + 1);
|
465
|
+
running_ssq += (xval - running_mean) * (xval - mean_prev);
|
466
|
+
mean_prev = running_mean;
|
467
|
+
}
|
468
|
+
|
469
|
+
check_for_tails(&workspace.ix_arr[0], workspace.st, workspace.end, workspace.target_numeric_col,
|
470
|
+
model_params.z_norm, model_params.max_perc_outliers,
|
471
|
+
&workspace.buffer_transf_y[0], (double)running_mean,
|
472
|
+
(double)sqrtl(running_ssq / (long double)(workspace.end - workspace.st)),
|
473
|
+
&workspace.left_tail, &workspace.right_tail,
|
474
|
+
&workspace.exp_transf, &workspace.log_transf);
|
475
|
+
|
476
|
+
/* if it's double-tailed, skip it as this model doesn't work properly with this */
|
477
|
+
if ( (workspace.exp_transf || !isinf(workspace.left_tail)) && (workspace.log_transf || !isinf(workspace.right_tail)) ) return;
|
478
|
+
|
479
|
+
/* apply log or exp transformation if necessary */
|
480
|
+
if (workspace.exp_transf) {
|
481
|
+
|
482
|
+
workspace.orig_mean = (double) running_mean;
|
483
|
+
workspace.orig_sd = (double) sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
|
484
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
485
|
+
workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
|
486
|
+
}
|
487
|
+
workspace.target_numeric_col = &workspace.buffer_transf_y[0];
|
488
|
+
model_outputs.col_transf[workspace.target_col_num] = Exp;
|
489
|
+
model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
|
490
|
+
model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
|
491
|
+
|
492
|
+
|
493
|
+
} else if (workspace.log_transf) {
|
494
|
+
|
495
|
+
if (workspace.target_numeric_col[workspace.ix_arr[workspace.st]] == 0) {
|
496
|
+
workspace.log_minval = -1;
|
497
|
+
} else {
|
498
|
+
workspace.log_minval = workspace.target_numeric_col[workspace.ix_arr[workspace.st]] - 1e-3;
|
499
|
+
}
|
500
|
+
|
501
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
502
|
+
workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
|
503
|
+
}
|
504
|
+
workspace.target_numeric_col = &workspace.buffer_transf_y[0];
|
505
|
+
model_outputs.col_transf[workspace.target_col_num] = Log;
|
506
|
+
model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
|
507
|
+
|
508
|
+
}
|
509
|
+
|
510
|
+
/* create a cluster with no conditions */
|
511
|
+
workspace.clusters = &cluster_root;
|
512
|
+
workspace.tree = &tree_root;
|
513
|
+
std::fill(workspace.outlier_scores.begin(), workspace.outlier_scores.end(), (double)1.0);
|
514
|
+
workspace.tree->emplace_back(0, Root);
|
515
|
+
|
516
|
+
workspace.clusters->emplace_back(NoType, Root);
|
517
|
+
workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
|
518
|
+
workspace.end, &workspace.outlier_scores[0],
|
519
|
+
&workspace.outlier_clusters[0], &workspace.outlier_trees[0], &workspace.outlier_depth[0],
|
520
|
+
workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
|
521
|
+
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
522
|
+
workspace.orig_mean, workspace.orig_sd,
|
523
|
+
workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
|
524
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
|
525
|
+
workspace.tree->back().clusters.push_back(0);
|
526
|
+
|
527
|
+
/* remove outliers if any were found */
|
528
|
+
if (workspace.has_outliers)
|
529
|
+
workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
|
530
|
+
|
531
|
+
/* update statistics if they've changed */
|
532
|
+
if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
|
533
|
+
workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
|
534
|
+
workspace.st, workspace.end, &workspace.mean_y);
|
535
|
+
else
|
536
|
+
workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
|
537
|
+
|
538
|
+
if (model_params.max_depth > 0) recursive_split_numeric(workspace, input_data, model_params, 0, false);
|
539
|
+
}
|
540
|
+
|
541
|
+
/*  Recursively split the rows ix_arr[st..end] (inclusive) for a numeric target column.
*
*   For the tree node that is last in 'workspace.tree' at the time of the call, every non-skipped
*   numeric, categorical and ordinal column is tried as a splitting column. For each split whose gain
*   reaches 'model_params.min_gain', one cluster per resulting branch (NA / left / right) is appended to
*   'workspace.clusters', registered in the node's 'clusters' list, and passed to
*   'define_numerical_cluster', whose return value is stored in 'workspace.has_outliers'.
*
*   Two modes, selected by 'model_params.follow_all':
*   - follow_all == true : every branch of every accepted split is recursed into immediately (as long as
*     curr_depth + 1 < max_depth). The recursion state is backed up before and restored after each call.
*   - follow_all == false: only the split with the highest gain is remembered; after all columns were
*     tried, the rows are re-partitioned according to it and each branch holding at least
*     2 * min_size_numeric rows is recursed into.
*
*   At the end, if 'check_tree_is_not_needed' reports the node as unnecessary, it is unlinked from its
*   parent and removed from 'workspace.tree'.
*
*   Parameters
*   - workspace    : working state for the column being processed (row indices, buffers, current
*                    range, tree and cluster vectors). Modified in place.
*   - input_data   : column data and metadata (column counts, NA flags, category counts, skip flags).
*   - model_params : thresholds and options (min_gain, min_size_numeric, max_depth, follow_all, ...).
*   - curr_depth   : depth of the node being split; the top-level call passes 0.
*   - is_NA_branch : flag forwarded to the left/right clusters created here and to deeper recursions;
*                    NA clusters and NA recursions always pass 'true'.
*/
void recursive_split_numeric(Workspace &workspace,
                             InputData &input_data,
                             ModelParams &model_params,
                             size_t curr_depth, bool is_NA_branch)
{
    workspace.best_gain = -HUGE_VAL;
    workspace.column_type_best = NoType;
    workspace.lev_has_outliers = false;
    /* below the root, the target's mean and SD are recomputed for the current row range */
    if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
                                                 workspace.st, workspace.end, &workspace.mean_y);

    /* these are used to keep track of where to continue after calling a further recursion */
    size_t ix1, ix2, ix3;
    SplitType spl1, spl2;
    /* index of the node being split: deeper calls append to the tree, so 'back()' will move */
    size_t tree_from = workspace.tree->size() - 1;

    /* when using 'follow_all' need to keep track of a lot more things */
    std::unique_ptr<RecursionState> state_backup;
    if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);


    /* procedure: split with each other column */

    /* first numeric */
    for (size_t col = 0; col < input_data.ncols_numeric; col++) {

        if (col == workspace.target_col_num) continue;
        if (input_data.skip_col[col]) continue;
        split_numericx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
                                workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
                                model_params.take_mid, &workspace.buffer_sd[0], &(workspace.this_gain), &(workspace.this_split_point),
                                &(workspace.this_split_ix), &(workspace.this_split_NA));
        if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;

        /* if the gain is not insignificant, check clusters created by this split */
        if (workspace.this_gain >= model_params.min_gain) {

            /* NA branch: rows [st, this_split_NA - 1] */
            if (workspace.this_split_NA > workspace.st &&
                (workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {

                (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
                workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
                workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
                                                                  workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                                  &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                                  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                                  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                  workspace.orig_mean, workspace.orig_sd,
                                                                  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
                workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

                if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                    (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                    workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
                    backup_recursion_state(workspace, *state_backup);
                    workspace.end = workspace.this_split_NA - 1;
                    recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
                    restore_recursion_state(workspace, *state_backup);
                }

            }

            /* left branch: rows [this_split_NA, this_split_ix] (split index is inclusive here) */
            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
            workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
                                                              workspace.this_split_ix, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                              &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                              workspace.orig_mean, workspace.orig_sd,
                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, LessOrEqual);
                backup_recursion_state(workspace, *state_backup);
                workspace.st = workspace.this_split_NA;
                workspace.end = workspace.this_split_ix;
                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
                restore_recursion_state(workspace, *state_backup);
            }


            /* right branch: rows [this_split_ix + 1, end] */
            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
            workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix + 1,
                                                              workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                              &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                              workspace.orig_mean, workspace.orig_sd,
                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, Greater);
                backup_recursion_state(workspace, *state_backup);
                workspace.st = workspace.this_split_ix + 1;
                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
                restore_recursion_state(workspace, *state_backup);
            }


            /* if this is the best split, remember it for later */
            if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
                workspace.best_gain = workspace.this_gain;
                workspace.column_type_best = Numeric;
                workspace.col_best = col;
                workspace.split_point_best = workspace.this_split_point;
            }

        }

    }

    /* then categorical */
    for (size_t col = 0; col < input_data.ncols_categ; col++) {

        /* 'skip_col' and 'has_NA' are indexed with numeric columns first, then categorical */
        if (input_data.skip_col[col + input_data.ncols_numeric]) continue;

        split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
                              workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], &workspace.buffer_cat_cnt[0],
                              &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
                              (bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
                              &(workspace.this_gain), &workspace.buffer_subset_categ[0], NULL);
        if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;

        if (workspace.this_gain >= model_params.min_gain) {

            /* data is not arranged inside the splitting function, need to now assign to the branches as determined */
            divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
                                &workspace.buffer_subset_categ[0], input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
                                &(workspace.this_split_NA), &(workspace.this_split_ix));

            /* NA branch: rows [st, this_split_NA - 1] */
            if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {

                (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
                workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
                workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
                                                                  workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                                  &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                                  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                                  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                  workspace.orig_mean, workspace.orig_sd,
                                                                  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
                workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

                if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                    (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                    workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
                    backup_recursion_state(workspace, *state_backup);
                    workspace.end = workspace.this_split_NA - 1;
                    recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
                    restore_recursion_state(workspace, *state_backup);
                }

            }

            /* left branch: rows [this_split_NA, this_split_ix - 1] (split index is exclusive here) */
            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
            workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
                                                              workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                              &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                              workspace.orig_mean, workspace.orig_sd,
                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
                backup_recursion_state(workspace, *state_backup);
                workspace.st = workspace.this_split_NA;
                workspace.end = workspace.this_split_ix - 1;
                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
                restore_recursion_state(workspace, *state_backup);
            }

            /* right branch: rows [this_split_ix, end] */
            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
            workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
                                                              workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                              &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                              workspace.orig_mean, workspace.orig_sd,
                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
                backup_recursion_state(workspace, *state_backup);
                workspace.st = workspace.this_split_ix;
                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
                restore_recursion_state(workspace, *state_backup);
            }

            /* the subset buffer is reused by later columns, so the best subset must be copied out */
            if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
                workspace.best_gain = workspace.this_gain;
                workspace.column_type_best = Categorical;
                workspace.col_best = col;
                memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
            }

        }

    }

    /* then ordinal */
    for (size_t col = 0; col < input_data.ncols_ord; col++) {

        /* 'skip_col' and 'has_NA' are indexed numeric first, then categorical, then ordinal */
        if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;

        /* same code as for categorical, but this time with split level as int instead of boolean array as subset */
        split_categx_numericy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
                              workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], &workspace.buffer_cat_cnt[0],
                              &workspace.buffer_cat_sum[0], &workspace.buffer_cat_sum_sq[0], &workspace.buffer_cat_sorted[0],
                              (bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
                              &(workspace.this_gain), &workspace.buffer_subset_categ[0], &(workspace.this_split_lev));
        if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;

        if (workspace.this_gain >= model_params.min_gain) {

            divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
                                workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
                                &(workspace.this_split_NA), &(workspace.this_split_ix) );

            /* NA branch: rows [st, this_split_NA - 1] */
            if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {

                (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
                workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
                workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.st,
                                                                  workspace.this_split_NA - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                                  &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                                  workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                                  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                  workspace.orig_mean, workspace.orig_sd,
                                                                  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
                workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

                if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                    (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                    workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
                    backup_recursion_state(workspace, *state_backup);
                    workspace.end = workspace.this_split_NA - 1;
                    recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
                    restore_recursion_state(workspace, *state_backup);
                }

            }

            /* left branch: rows [this_split_NA, this_split_ix - 1] */
            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
            workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_NA,
                                                              workspace.this_split_ix - 1, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                              &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                              workspace.orig_mean, workspace.orig_sd,
                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, LessOrEqual);
                backup_recursion_state(workspace, *state_backup);
                workspace.st = workspace.this_split_NA;
                workspace.end = workspace.this_split_ix - 1;
                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
                restore_recursion_state(workspace, *state_backup);
            }



            /* right branch: rows [this_split_ix, end] */
            (*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
            workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
            workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, &workspace.ix_arr[0], workspace.this_split_ix,
                                                              workspace.end, &workspace.outlier_scores[0], &workspace.outlier_clusters[0],
                                                              &workspace.outlier_trees[0], &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
                                                              workspace.clusters->size() - 1, tree_from, curr_depth + 1,
                                                              workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                              workspace.orig_mean, workspace.orig_sd,
                                                              workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
            workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;

            if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
                (*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
                workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
                backup_recursion_state(workspace, *state_backup);
                workspace.st = workspace.this_split_ix;
                recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
                restore_recursion_state(workspace, *state_backup);
            }

            if (workspace.this_gain > workspace.best_gain && !model_params.follow_all) {
                workspace.best_gain = workspace.this_gain;
                workspace.column_type_best = Ordinal;
                workspace.col_best = col;
                workspace.split_lev_best = workspace.this_split_lev;
            }

        }

    }

    /* avoid unnecessary memory usage */
    workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
    (*workspace.tree)[tree_from].clusters.shrink_to_fit();
    if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();


    /* continue splitting further if meeting threshold criteria */
    if (workspace.best_gain >= model_params.min_gain && !model_params.follow_all) {

        /* check if depth limit is reached */
        curr_depth++;
        if (curr_depth >= model_params.max_depth) return;

        /* discard outliers if any */
        if (workspace.lev_has_outliers)
            workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);

        /* assign rows to their corresponding branch */
        switch(workspace.column_type_best) {
            case Numeric:
            {
                divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
                                    workspace.st, workspace.end, workspace.split_point_best,
                                    (bool)(input_data.has_NA[workspace.col_best]),
                                    &(workspace.this_split_NA), &(workspace.this_split_ix) );
                spl1 = LessOrEqual; spl2 = Greater;
                set_tree_as_numeric(workspace.tree->back(), workspace.split_point_best, workspace.col_best);
                break;
            }

            case Categorical:
            {
                divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
                                    workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
                                    (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
                                    &(workspace.this_split_NA), &(workspace.this_split_ix) );
                spl1 = InSubset; spl2 = NotInSubset;
                set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
                                        &workspace.buffer_subset_categ_best[0], workspace.col_best);
                break;
            }

            case Ordinal:
            {
                divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
                                    workspace.st, workspace.end, workspace.split_lev_best,
                                    (bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
                                    &(workspace.this_split_NA), &(workspace.this_split_ix) );
                spl1 = LessOrEqual; spl2 = Greater;
                set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
                break;
            }


        }

        /* continue splitting recursively - need to remember from where
           (deeper calls overwrite the workspace split indices, so keep local copies) */
        ix1 = workspace.this_split_NA;
        ix2 = workspace.this_split_ix;
        ix3 = workspace.end;

        /* NA branch: the NA rows are [st, this_split_NA - 1], so the branch is non-empty only when
           this_split_NA lies past st. Both are unsigned, hence the ordering check before subtracting. */
        if (workspace.this_split_NA > workspace.st &&
            (workspace.this_split_NA - workspace.st) >= 2 * model_params.min_size_numeric) {

            workspace.end = ix1 - 1;
            (*workspace.tree)[tree_from].tree_NA = workspace.tree->size();
            workspace.tree->emplace_back(tree_from, IsNa);
            recursive_split_numeric(workspace, input_data, model_params, curr_depth, true);
        }

        /* left branch: rows [ix1, ix2 - 1] */
        if ((ix2 - ix1) >= 2 * model_params.min_size_numeric) {
            workspace.st = ix1;
            workspace.end = ix2 - 1;
            (*workspace.tree)[tree_from].tree_left = workspace.tree->size();
            workspace.tree->emplace_back(tree_from, spl1);
            recursive_split_numeric(workspace, input_data, model_params, curr_depth, is_NA_branch);
        }

        /* right branch: rows [ix2, ix3] */
        if ((ix3 - ix2 + 1) >= 2 * model_params.min_size_numeric) {
            workspace.st = ix2;
            workspace.end = ix3;
            (*workspace.tree)[tree_from].tree_right = workspace.tree->size();
            workspace.tree->emplace_back(tree_from, spl2);
            recursive_split_numeric(workspace, input_data, model_params, curr_depth, is_NA_branch);
        }

    }

    /* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
    if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {

        if (tree_from == 0) {
            workspace.tree->clear();
        } else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
            /* parent tracks children through 'all_branches' (follow_all mode): drop the last entry */
            (*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
            workspace.tree->pop_back();
        } else {
            /* parent tracks children through tree_NA / tree_left / tree_right: reset the matching link to 0 */
            switch((*workspace.tree)[tree_from].parent_branch) {

                case IsNa:
                {
                    (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_NA = 0;
                    break;
                }

                case LessOrEqual:
                {
                    (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
                    break;
                }

                case Greater:
                {
                    (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
                    break;
                }

                case InSubset:
                {
                    (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
                    break;
                }

                case NotInSubset:
                {
                    (*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
                    break;
                }
            }
            workspace.tree->pop_back();
        }
    }

}
|
1004
|
+
|
1005
|
+
void process_categ_col(std::vector<Cluster> &cluster_root,
|
1006
|
+
std::vector<ClusterTree> &tree_root,
|
1007
|
+
size_t target_col_num, bool is_ord,
|
1008
|
+
Workspace &workspace,
|
1009
|
+
InputData &input_data,
|
1010
|
+
ModelParams &model_params,
|
1011
|
+
ModelOutputs &model_outputs)
|
1012
|
+
{
|
1013
|
+
if (model_params.max_depth <= 0) return;
|
1014
|
+
|
1015
|
+
/* extract necesary info from column and discard NAs */
|
1016
|
+
workspace.target_col_is_ord = is_ord;
|
1017
|
+
workspace.target_col_num = target_col_num - input_data.ncols_numeric;
|
1018
|
+
if (!workspace.target_col_is_ord) {
|
1019
|
+
workspace.target_categ_col = input_data.categorical_data + workspace.target_col_num * input_data.nrows;
|
1020
|
+
workspace.ncat_this = input_data.ncat[workspace.target_col_num];
|
1021
|
+
} else {
|
1022
|
+
workspace.target_categ_col = input_data.ordinal_data + (workspace.target_col_num - input_data.ncols_categ) * input_data.nrows;
|
1023
|
+
workspace.ncat_this = input_data.ncat_ord[workspace.target_col_num - input_data.ncols_categ];
|
1024
|
+
}
|
1025
|
+
workspace.untransf_target_col = workspace.target_categ_col;
|
1026
|
+
workspace.end = input_data.nrows - 1;
|
1027
|
+
workspace.st = move_NAs_to_front(&workspace.ix_arr[0], workspace.target_categ_col, 0, workspace.end);
|
1028
|
+
workspace.col_has_outliers = false;
|
1029
|
+
workspace.col_is_bin = workspace.ncat_this <= 2;
|
1030
|
+
workspace.prop_small_this = &model_params.prop_small[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1031
|
+
workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1032
|
+
|
1033
|
+
/* create cluster root and reset outlier scores for this column */
|
1034
|
+
workspace.clusters = &cluster_root;
|
1035
|
+
workspace.tree = &tree_root;
|
1036
|
+
std::fill(workspace.outlier_scores.begin(), workspace.outlier_scores.end(), (double)1.0);
|
1037
|
+
workspace.tree->emplace_back(0, Root);
|
1038
|
+
|
1039
|
+
|
1040
|
+
/* at first, see if there's a category with 1-2 observations among only categories with large counts */
|
1041
|
+
workspace.col_has_outliers = find_outlier_categories_no_cond(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
|
1042
|
+
workspace.ncat_this, workspace.end - workspace.st + 1,
|
1043
|
+
&workspace.buffer_subset_categ[0], &(workspace.orig_mean));
|
1044
|
+
|
1045
|
+
/* if there is any such case, create a cluster for them */
|
1046
|
+
if (workspace.col_has_outliers) {
|
1047
|
+
workspace.tree->back().clusters.push_back(0);
|
1048
|
+
workspace.clusters->emplace_back(NoType, Root);
|
1049
|
+
define_categ_cluster_no_cond(workspace.untransf_target_col, &workspace.ix_arr[0], workspace.st, workspace.end, workspace.ncat_this,
|
1050
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1051
|
+
&workspace.outlier_depth[0], workspace.clusters->back(),
|
1052
|
+
&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
|
1053
|
+
&workspace.buffer_subset_categ[0], workspace.orig_mean);
|
1054
|
+
workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
|
1055
|
+
}
|
1056
|
+
|
1057
|
+
/* if no conditional outliers are required, stop there */
|
1058
|
+
if (model_params.max_depth == 0) return;
|
1059
|
+
|
1060
|
+
/* if the rest of the data is all one category, do not process it any further */
|
1061
|
+
if (workspace.ncat_this == 2 && workspace.col_has_outliers) return;
|
1062
|
+
|
1063
|
+
/* if there isn't a single catchable outlier category, skip */
|
1064
|
+
bool should_skip = true;
|
1065
|
+
for (int cat = 0; cat < workspace.ncat_this; cat++) {
|
1066
|
+
|
1067
|
+
if (workspace.prop_small_this[cat] > (long double)1 / (long double)(workspace.end - workspace.st + 1 - model_params.min_size_categ))
|
1068
|
+
should_skip = false;
|
1069
|
+
}
|
1070
|
+
if (should_skip) return;
|
1071
|
+
|
1072
|
+
|
1073
|
+
/* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
|
1074
|
+
if (
|
1075
|
+
(!model_params.categ_as_bin && !workspace.target_col_is_ord) ||
|
1076
|
+
(!model_params.ord_as_bin && workspace.target_col_is_ord) ||
|
1077
|
+
workspace.col_is_bin ||
|
1078
|
+
input_data.ncols_categ == (1 - ((workspace.target_col_is_ord)? 1 : 0))
|
1079
|
+
)
|
1080
|
+
{
|
1081
|
+
|
1082
|
+
/* calculate base information */
|
1083
|
+
workspace.base_info = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
|
1084
|
+
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1085
|
+
workspace.base_info_orig = workspace.base_info;
|
1086
|
+
|
1087
|
+
/* then split */
|
1088
|
+
recursive_split_categ(workspace, input_data, model_params, 0, false);
|
1089
|
+
}
|
1090
|
+
|
1091
|
+
|
1092
|
+
else {
|
1093
|
+
/* otherwise, process the column 1 category at a time */
|
1094
|
+
size_t st_orig = workspace.st;
|
1095
|
+
size_t end_orig = workspace.end;
|
1096
|
+
size_t cat_counts_bin[2];
|
1097
|
+
workspace.col_is_bin = true;
|
1098
|
+
workspace.already_split_main = false;
|
1099
|
+
workspace.base_info_orig = total_info(&input_data.cat_counts[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ],
|
1100
|
+
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1101
|
+
workspace.tree->back().column_type = NoType;
|
1102
|
+
|
1103
|
+
|
1104
|
+
for (int cat = 0; cat < workspace.ncat_this - ((workspace.target_col_is_ord)? 1 : 0); cat++) {
|
1105
|
+
|
1106
|
+
workspace.st = st_orig;
|
1107
|
+
workspace.end = end_orig;
|
1108
|
+
|
1109
|
+
/* convert to binary */
|
1110
|
+
if (!workspace.target_col_is_ord) {
|
1111
|
+
|
1112
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
1113
|
+
workspace.buffer_bin_y[workspace.ix_arr[row]] = (workspace.untransf_target_col[workspace.ix_arr[row]] == cat)? 1 : 0;
|
1114
|
+
}
|
1115
|
+
cat_counts_bin[0] = workspace.end - workspace.st + 1 - input_data.cat_counts[ cat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1116
|
+
cat_counts_bin[1] = input_data.cat_counts[ cat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1117
|
+
|
1118
|
+
} else {
|
1119
|
+
|
1120
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
1121
|
+
workspace.buffer_bin_y[workspace.ix_arr[row]] = (workspace.untransf_target_col[workspace.ix_arr[row]] <= cat)? 1 : 0;
|
1122
|
+
}
|
1123
|
+
cat_counts_bin[0] = 0;
|
1124
|
+
cat_counts_bin[1] = workspace.end - workspace.st + 1;
|
1125
|
+
for (int catcat = 0; catcat <= cat; catcat++) {
|
1126
|
+
cat_counts_bin[0] += input_data.cat_counts[ catcat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1127
|
+
cat_counts_bin[1] -= input_data.cat_counts[ catcat + model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1128
|
+
}
|
1129
|
+
|
1130
|
+
}
|
1131
|
+
|
1132
|
+
if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
|
1133
|
+
workspace.target_categ_col = &workspace.buffer_bin_y[0];
|
1134
|
+
workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
|
1135
|
+
(*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
|
1136
|
+
workspace.tree->emplace_back(0, SubTrees);
|
1137
|
+
recursive_split_categ(workspace, input_data, model_params, 0, false);
|
1138
|
+
}
|
1139
|
+
|
1140
|
+
}
|
1141
|
+
(*workspace.tree)[0].binary_branches.shrink_to_fit();
|
1142
|
+
|
1143
|
+
}
|
1144
|
+
|
1145
|
+
}
|
1146
|
+
|
1147
|
+
|
1148
|
+
void recursive_split_categ(Workspace &workspace,
|
1149
|
+
InputData &input_data,
|
1150
|
+
ModelParams &model_params,
|
1151
|
+
size_t curr_depth, bool is_NA_branch)
|
1152
|
+
{
|
1153
|
+
/* the idea is the same as for the numeric counterpart, except that how to split by another
|
1154
|
+
categorical column is less clear-cut and there are several options for doing it */
|
1155
|
+
workspace.best_gain = -HUGE_VAL;
|
1156
|
+
workspace.column_type_best = NoType;
|
1157
|
+
workspace.lev_has_outliers = false;
|
1158
|
+
size_t ix1, ix2, ix3;
|
1159
|
+
SplitType spl1, spl2;
|
1160
|
+
size_t tree_from = workspace.tree->size() - 1;
|
1161
|
+
|
1162
|
+
/* when using 'follow_all' need to keep track of a lot more things */
|
1163
|
+
std::unique_ptr<RecursionState> state_backup;
|
1164
|
+
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
1165
|
+
|
1166
|
+
if (curr_depth > 0) {
|
1167
|
+
workspace.base_info_orig = total_info(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st, workspace.end,
|
1168
|
+
workspace.ncat_this, &workspace.buffer_cat_cnt[0]);
|
1169
|
+
|
1170
|
+
/* check that there's still more than 1 category */
|
1171
|
+
size_t ncat_present = 0;
|
1172
|
+
for (int cat = 0; cat < workspace.ncat_this; cat++) {
|
1173
|
+
ncat_present += (workspace.buffer_cat_cnt[cat])? 1 : 0;
|
1174
|
+
if (ncat_present >= 2) break;
|
1175
|
+
}
|
1176
|
+
if (ncat_present < 2) goto drop_if_not_needed;
|
1177
|
+
if (workspace.col_is_bin && workspace.ncat_this > 2) {
|
1178
|
+
workspace.base_info = total_info(&workspace.ix_arr[0], workspace.target_categ_col, workspace.st, workspace.end,
|
1179
|
+
2, &workspace.buffer_cat_cnt[0]);
|
1180
|
+
if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
|
1181
|
+
} else {
|
1182
|
+
workspace.base_info = workspace.base_info_orig;
|
1183
|
+
}
|
1184
|
+
}
|
1185
|
+
|
1186
|
+
/* split with each other column */
|
1187
|
+
|
1188
|
+
|
1189
|
+
/* first numeric */
|
1190
|
+
for (size_t col = 0; col < input_data.ncols_numeric; col++) {
|
1191
|
+
|
1192
|
+
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1193
|
+
if (input_data.skip_col[col]) continue;
|
1194
|
+
split_numericx_categy(&workspace.ix_arr[0], workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
1195
|
+
workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
|
1196
|
+
&workspace.buffer_cat_cnt[0], (bool)(input_data.has_NA[col]), model_params.min_size_categ,
|
1197
|
+
model_params.take_mid, &(workspace.this_gain), &(workspace.this_split_point),
|
1198
|
+
&(workspace.this_split_ix), &(workspace.this_split_NA));
|
1199
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1200
|
+
|
1201
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
1202
|
+
|
1203
|
+
/* NA branch */
|
1204
|
+
if (workspace.this_split_NA > workspace.st &&
|
1205
|
+
(workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1206
|
+
|
1207
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1208
|
+
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
1209
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1210
|
+
&workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
|
1211
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1212
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1213
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1214
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1215
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1216
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1217
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1218
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1219
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1220
|
+
if (workspace.drop_cluster) {
|
1221
|
+
workspace.clusters->pop_back();
|
1222
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1223
|
+
}
|
1224
|
+
|
1225
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1226
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1227
|
+
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
1228
|
+
backup_recursion_state(workspace, *state_backup);
|
1229
|
+
workspace.end = workspace.this_split_NA - 1;
|
1230
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1231
|
+
restore_recursion_state(workspace, *state_backup);
|
1232
|
+
}
|
1233
|
+
|
1234
|
+
}
|
1235
|
+
|
1236
|
+
/* left branch */
|
1237
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1238
|
+
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
1239
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1240
|
+
&workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix,
|
1241
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1242
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1243
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1244
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1245
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1246
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1247
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1248
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1249
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1250
|
+
if (workspace.drop_cluster) {
|
1251
|
+
workspace.clusters->pop_back();
|
1252
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1253
|
+
}
|
1254
|
+
|
1255
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1256
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1257
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, LessOrEqual);
|
1258
|
+
backup_recursion_state(workspace, *state_backup);
|
1259
|
+
workspace.st = workspace.this_split_NA;
|
1260
|
+
workspace.end = workspace.this_split_ix;
|
1261
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1262
|
+
restore_recursion_state(workspace, *state_backup);
|
1263
|
+
}
|
1264
|
+
|
1265
|
+
|
1266
|
+
/* right branch */
|
1267
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1268
|
+
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
1269
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1270
|
+
&workspace.ix_arr[0], workspace.this_split_ix + 1, workspace.end,
|
1271
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1272
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1273
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1274
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1275
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1276
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1277
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1278
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1279
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1280
|
+
if (workspace.drop_cluster) {
|
1281
|
+
workspace.clusters->pop_back();
|
1282
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1283
|
+
}
|
1284
|
+
|
1285
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1286
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1287
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_point, Greater);
|
1288
|
+
backup_recursion_state(workspace, *state_backup);
|
1289
|
+
workspace.st = workspace.this_split_ix + 1;
|
1290
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1291
|
+
restore_recursion_state(workspace, *state_backup);
|
1292
|
+
}
|
1293
|
+
|
1294
|
+
|
1295
|
+
/* if this is the best split, remember it for later */
|
1296
|
+
if (workspace.this_gain > workspace.best_gain) {
|
1297
|
+
workspace.best_gain = workspace.this_gain;
|
1298
|
+
workspace.column_type_best = Numeric;
|
1299
|
+
workspace.col_best = col;
|
1300
|
+
workspace.split_point_best = workspace.this_split_point;
|
1301
|
+
}
|
1302
|
+
|
1303
|
+
}
|
1304
|
+
|
1305
|
+
}
|
1306
|
+
|
1307
|
+
|
1308
|
+
/* then categorical */
|
1309
|
+
for (size_t col = 0; col < input_data.ncols_categ; col++) {
|
1310
|
+
|
1311
|
+
/* TODO: could make a pre-check that the splitting column up to this recursion still has
|
1312
|
+
more than 1 category, and skip for this and further recursions otherwise */
|
1313
|
+
|
1314
|
+
if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
|
1315
|
+
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
1316
|
+
|
1317
|
+
if (workspace.col_is_bin) {
|
1318
|
+
|
1319
|
+
split_categx_biny(&workspace.ix_arr[0], workspace.st, workspace.end,
|
1320
|
+
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1321
|
+
input_data.ncat[col], workspace.base_info, &workspace.buffer_cat_cnt[0],
|
1322
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
|
1323
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1324
|
+
&(workspace.this_gain), &workspace.buffer_subset_categ[0]);
|
1325
|
+
|
1326
|
+
/* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
|
1327
|
+
if (
|
1328
|
+
!isinf(workspace.this_gain) &&
|
1329
|
+
(
|
1330
|
+
(!workspace.target_col_is_ord && input_data.ncat[workspace.target_col_num] > 2) ||
|
1331
|
+
(workspace.target_col_is_ord && input_data.ncat_ord[workspace.target_col_num - input_data.ncols_categ] > 2)
|
1332
|
+
)
|
1333
|
+
)
|
1334
|
+
{
|
1335
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows,
|
1336
|
+
workspace.st, workspace.end, &workspace.buffer_subset_categ[0], input_data.ncat[col],
|
1337
|
+
(bool)input_data.has_NA[col + input_data.ncols_numeric],
|
1338
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1339
|
+
workspace.this_gain = categ_gain_from_split(&workspace.ix_arr[0], workspace.untransf_target_col, workspace.st,
|
1340
|
+
workspace.this_split_NA, workspace.this_split_ix, workspace.end,
|
1341
|
+
workspace.ncat_this, &workspace.buffer_cat_cnt[0], workspace.base_info_orig);
|
1342
|
+
}
|
1343
|
+
|
1344
|
+
} else {
|
1345
|
+
|
1346
|
+
if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
|
1347
|
+
split_categx_categy_subset(&workspace.ix_arr[0], workspace.st, workspace.end,
|
1348
|
+
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1349
|
+
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1350
|
+
&workspace.buffer_cat_sorted[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_cnt[0],
|
1351
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1352
|
+
&(workspace.this_gain), &workspace.buffer_subset_categ[0]);
|
1353
|
+
} else {
|
1354
|
+
split_categx_categy_separate(&workspace.ix_arr[0], workspace.st, workspace.end,
|
1355
|
+
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1356
|
+
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1357
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0],
|
1358
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric]),
|
1359
|
+
model_params.min_size_categ, &(workspace.this_gain));
|
1360
|
+
}
|
1361
|
+
|
1362
|
+
}
|
1363
|
+
|
1364
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1365
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
1366
|
+
|
1367
|
+
/* NA branch */
|
1368
|
+
workspace.this_split_NA = move_NAs_to_front(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
|
1369
|
+
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1370
|
+
|
1371
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1372
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
1373
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1374
|
+
&workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
|
1375
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1376
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1377
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1378
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1379
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1380
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1381
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1382
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1383
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1384
|
+
if (workspace.drop_cluster) {
|
1385
|
+
workspace.clusters->pop_back();
|
1386
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1387
|
+
}
|
1388
|
+
|
1389
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1390
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1391
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
1392
|
+
backup_recursion_state(workspace, *state_backup);
|
1393
|
+
workspace.end = workspace.this_split_NA - 1;
|
1394
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1395
|
+
restore_recursion_state(workspace, *state_backup);
|
1396
|
+
}
|
1397
|
+
|
1398
|
+
}
|
1399
|
+
|
1400
|
+
if (!model_params.cat_bruteforce_subset && !workspace.col_is_bin && input_data.ncat[col] > 2) {
|
1401
|
+
|
1402
|
+
/* sort by the splitting variable and iterate over to determine the split points */
|
1403
|
+
workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
|
1404
|
+
std::sort(&workspace.ix_arr[0] + workspace.this_split_NA, &workspace.ix_arr[0] + workspace.end + 1,
|
1405
|
+
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1406
|
+
workspace.this_split_ix = workspace.this_split_NA;
|
1407
|
+
|
1408
|
+
/* TODO: should instead use std::lower_bound to calculate the start and end indices of each category */
|
1409
|
+
for (size_t row = workspace.this_split_NA + 1; row <= workspace.end; row++) {
|
1410
|
+
|
1411
|
+
/* if the next observation is in a different category, then the split ends here */
|
1412
|
+
if (workspace.temp_ptr_x[workspace.ix_arr[row]] != workspace.temp_ptr_x[workspace.ix_arr[row-1]]) {
|
1413
|
+
|
1414
|
+
if ((row - workspace.this_split_ix) >= model_params.min_size_categ) {
|
1415
|
+
|
1416
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1417
|
+
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
|
1418
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1419
|
+
&workspace.ix_arr[0], workspace.this_split_ix, row - 1,
|
1420
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1421
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1422
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1423
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1424
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1425
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1426
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1427
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1428
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1429
|
+
if (workspace.drop_cluster) {
|
1430
|
+
workspace.clusters->pop_back();
|
1431
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1432
|
+
}
|
1433
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1434
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1435
|
+
workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.this_split_ix]]);
|
1436
|
+
backup_recursion_state(workspace, *state_backup);
|
1437
|
+
workspace.st = workspace.this_split_ix;
|
1438
|
+
workspace.end = row - 1;
|
1439
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1440
|
+
restore_recursion_state(workspace, *state_backup);
|
1441
|
+
}
|
1442
|
+
}
|
1443
|
+
workspace.this_split_ix = row;
|
1444
|
+
}
|
1445
|
+
}
|
1446
|
+
/* last category is given by the end indices */
|
1447
|
+
if ((workspace.end - workspace.this_split_ix) > model_params.min_size_categ) {
|
1448
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1449
|
+
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
|
1450
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1451
|
+
&workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
|
1452
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1453
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1454
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1455
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1456
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1457
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1458
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1459
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1460
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1461
|
+
if (workspace.drop_cluster) {
|
1462
|
+
workspace.clusters->pop_back();
|
1463
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1464
|
+
}
|
1465
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1466
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1467
|
+
workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
|
1468
|
+
backup_recursion_state(workspace, *state_backup);
|
1469
|
+
workspace.st = workspace.this_split_ix;
|
1470
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1471
|
+
restore_recursion_state(workspace, *state_backup);
|
1472
|
+
}
|
1473
|
+
|
1474
|
+
}
|
1475
|
+
|
1476
|
+
if (workspace.this_gain > workspace.best_gain) {
|
1477
|
+
workspace.best_gain = workspace.this_gain;
|
1478
|
+
workspace.column_type_best = Categorical;
|
1479
|
+
workspace.col_best = col;
|
1480
|
+
}
|
1481
|
+
|
1482
|
+
|
1483
|
+
} else {
|
1484
|
+
|
1485
|
+
/* split by subsets of categories */
|
1486
|
+
|
1487
|
+
if (input_data.ncat[col] == 2) {
|
1488
|
+
|
1489
|
+
workspace.buffer_subset_categ[0] = 1;
|
1490
|
+
workspace.buffer_subset_categ[1] = 0;
|
1491
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1492
|
+
(int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
|
1493
|
+
if (
|
1494
|
+
(workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
|
1495
|
+
(workspace.this_split_ix - workspace.this_split_NA) < model_params.min_size_categ
|
1496
|
+
) continue;
|
1497
|
+
|
1498
|
+
} else {
|
1499
|
+
|
1500
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1501
|
+
&workspace.buffer_subset_categ[0], input_data.ncat[col], false,
|
1502
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
1503
|
+
}
|
1504
|
+
|
1505
|
+
/* left branch */
|
1506
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1507
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
|
1508
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1509
|
+
&workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
|
1510
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1511
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1512
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1513
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1514
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1515
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1516
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1517
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1518
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1519
|
+
if (workspace.drop_cluster) {
|
1520
|
+
workspace.clusters->pop_back();
|
1521
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1522
|
+
}
|
1523
|
+
|
1524
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1525
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1526
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
|
1527
|
+
backup_recursion_state(workspace, *state_backup);
|
1528
|
+
workspace.st = workspace.this_split_NA;
|
1529
|
+
workspace.end = workspace.this_split_ix - 1;
|
1530
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1531
|
+
restore_recursion_state(workspace, *state_backup);
|
1532
|
+
}
|
1533
|
+
|
1534
|
+
/* right branch */
|
1535
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1536
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col], is_NA_branch);
|
1537
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1538
|
+
&workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
|
1539
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1540
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1541
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1542
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1543
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1544
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1545
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1546
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1547
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1548
|
+
if (workspace.drop_cluster) {
|
1549
|
+
workspace.clusters->pop_back();
|
1550
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1551
|
+
}
|
1552
|
+
|
1553
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1554
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1555
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, &workspace.buffer_subset_categ[0], input_data.ncat[col]);
|
1556
|
+
backup_recursion_state(workspace, *state_backup);
|
1557
|
+
workspace.st = workspace.this_split_ix;
|
1558
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1559
|
+
restore_recursion_state(workspace, *state_backup);
|
1560
|
+
}
|
1561
|
+
|
1562
|
+
if (workspace.this_gain > workspace.best_gain) {
|
1563
|
+
workspace.best_gain = workspace.this_gain;
|
1564
|
+
workspace.column_type_best = Categorical;
|
1565
|
+
workspace.col_best = col;
|
1566
|
+
memcpy(&workspace.buffer_subset_categ_best[0], &workspace.buffer_subset_categ[0], input_data.ncat[col] * sizeof(char));
|
1567
|
+
}
|
1568
|
+
|
1569
|
+
}
|
1570
|
+
|
1571
|
+
}
|
1572
|
+
|
1573
|
+
}
|
1574
|
+
|
1575
|
+
|
1576
|
+
/* then ordinal */
|
1577
|
+
for (size_t col = 0; col < input_data.ncols_ord; col++) {
|
1578
|
+
|
1579
|
+
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1580
|
+
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
1581
|
+
if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
|
1582
|
+
|
1583
|
+
split_ordx_categy(&workspace.ix_arr[0], workspace.st, workspace.end,
|
1584
|
+
input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
|
1585
|
+
input_data.ncat_ord[col], workspace.ncat_this,
|
1586
|
+
workspace.base_info_orig, &workspace.buffer_cat_cnt[0], &workspace.buffer_crosstab[0], &workspace.buffer_cat_sorted[0],
|
1587
|
+
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
|
1588
|
+
model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev));
|
1589
|
+
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1590
|
+
|
1591
|
+
if (workspace.this_gain >= model_params.min_gain) {
|
1592
|
+
|
1593
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
1594
|
+
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
1595
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1596
|
+
|
1597
|
+
/* NA branch */
|
1598
|
+
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1599
|
+
|
1600
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1601
|
+
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
1602
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1603
|
+
&workspace.ix_arr[0], workspace.st, workspace.this_split_NA - 1,
|
1604
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1605
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1606
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1607
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1608
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1609
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1610
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1611
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1612
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1613
|
+
if (workspace.drop_cluster) {
|
1614
|
+
workspace.clusters->pop_back();
|
1615
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1616
|
+
}
|
1617
|
+
|
1618
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1619
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1620
|
+
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
1621
|
+
backup_recursion_state(workspace, *state_backup);
|
1622
|
+
workspace.end = workspace.this_split_NA - 1;
|
1623
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1624
|
+
restore_recursion_state(workspace, *state_backup);
|
1625
|
+
}
|
1626
|
+
|
1627
|
+
}
|
1628
|
+
|
1629
|
+
/* left branch */
|
1630
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1631
|
+
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
1632
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1633
|
+
&workspace.ix_arr[0], workspace.this_split_NA, workspace.this_split_ix - 1,
|
1634
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1635
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1636
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1637
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1638
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1639
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1640
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1641
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1642
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1643
|
+
if (workspace.drop_cluster) {
|
1644
|
+
workspace.clusters->pop_back();
|
1645
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1646
|
+
}
|
1647
|
+
|
1648
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1649
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1650
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, LessOrEqual);
|
1651
|
+
backup_recursion_state(workspace, *state_backup);
|
1652
|
+
workspace.st = workspace.this_split_NA;
|
1653
|
+
workspace.end = workspace.this_split_ix - 1;
|
1654
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1655
|
+
restore_recursion_state(workspace, *state_backup);
|
1656
|
+
}
|
1657
|
+
|
1658
|
+
/* right branch */
|
1659
|
+
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1660
|
+
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
1661
|
+
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1662
|
+
&workspace.ix_arr[0], workspace.this_split_ix, workspace.end,
|
1663
|
+
workspace.ncat_this, model_params.categ_from_maj,
|
1664
|
+
&workspace.outlier_scores[0], &workspace.outlier_clusters[0], &workspace.outlier_trees[0],
|
1665
|
+
&workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
1666
|
+
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1667
|
+
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1668
|
+
workspace.prop_small_this, workspace.prior_prob,
|
1669
|
+
&workspace.buffer_cat_cnt[0], &workspace.buffer_cat_sum[0],
|
1670
|
+
&workspace.buffer_crosstab[0], &workspace.buffer_subset_outlier[0], &(workspace.drop_cluster));
|
1671
|
+
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1672
|
+
if (workspace.drop_cluster) {
|
1673
|
+
workspace.clusters->pop_back();
|
1674
|
+
(*workspace.tree)[tree_from].clusters.pop_back();
|
1675
|
+
}
|
1676
|
+
|
1677
|
+
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1678
|
+
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1679
|
+
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
1680
|
+
backup_recursion_state(workspace, *state_backup);
|
1681
|
+
workspace.st = workspace.this_split_ix;
|
1682
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1683
|
+
restore_recursion_state(workspace, *state_backup);
|
1684
|
+
}
|
1685
|
+
|
1686
|
+
|
1687
|
+
if (workspace.this_gain > workspace.best_gain) {
|
1688
|
+
workspace.best_gain = workspace.this_gain;
|
1689
|
+
workspace.column_type_best = Ordinal;
|
1690
|
+
workspace.col_best = col;
|
1691
|
+
workspace.split_lev_best = workspace.this_split_lev;
|
1692
|
+
}
|
1693
|
+
|
1694
|
+
}
|
1695
|
+
|
1696
|
+
}
|
1697
|
+
|
1698
|
+
|
1699
|
+
/* avoid unnecessary memory usage or repeats */
|
1700
|
+
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
1701
|
+
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
1702
|
+
if ((*workspace.tree)[tree_from].all_branches.size() > 0) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
1703
|
+
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
|
1704
|
+
workspace.already_split_main = true;
|
1705
|
+
|
1706
|
+
|
1707
|
+
/* if the best gain reaches the minimum-gain threshold (and not all branches are being followed), continue splitting from the branches of the best column */
|
1708
|
+
if (workspace.best_gain >= model_params.min_gain && !model_params.follow_all) {
|
1709
|
+
|
1710
|
+
curr_depth++;
|
1711
|
+
if (curr_depth >= model_params.max_depth) goto drop_if_not_needed;
|
1712
|
+
|
1713
|
+
/* discard outliers if any */
|
1714
|
+
if (workspace.lev_has_outliers)
|
1715
|
+
workspace.st = move_outliers_to_front(&workspace.ix_arr[0], &workspace.outlier_scores[0], workspace.st, workspace.end);
|
1716
|
+
|
1717
|
+
/* assign rows to their corresponding branch */
|
1718
|
+
switch(workspace.column_type_best) {
|
1719
|
+
case Numeric:
|
1720
|
+
{
|
1721
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.numeric_data + workspace.col_best * input_data.nrows,
|
1722
|
+
workspace.st, workspace.end, workspace.split_point_best,
|
1723
|
+
(bool)(input_data.has_NA[workspace.col_best]),
|
1724
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1725
|
+
spl1 = LessOrEqual; spl2 = Greater;
|
1726
|
+
set_tree_as_numeric(workspace.tree->back(), workspace.split_point_best, workspace.col_best);
|
1727
|
+
break;
|
1728
|
+
}
|
1729
|
+
|
1730
|
+
case Ordinal:
|
1731
|
+
{
|
1732
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
1733
|
+
workspace.st, workspace.end, workspace.split_lev_best,
|
1734
|
+
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
1735
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1736
|
+
spl1 = LessOrEqual; spl2 = Greater;
|
1737
|
+
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
1738
|
+
break;
|
1739
|
+
}
|
1740
|
+
|
1741
|
+
case Categorical:
|
1742
|
+
{
|
1743
|
+
|
1744
|
+
if (input_data.ncat[workspace.col_best] == 2) {
|
1745
|
+
|
1746
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1747
|
+
workspace.st, workspace.end, (int)0,
|
1748
|
+
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1749
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1750
|
+
spl1 = InSubset; spl2 = NotInSubset;
|
1751
|
+
set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
|
1752
|
+
|
1753
|
+
} else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
|
1754
|
+
|
1755
|
+
divide_subset_split(&workspace.ix_arr[0], input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1756
|
+
workspace.st, workspace.end, &workspace.buffer_subset_categ_best[0], input_data.ncat[workspace.col_best],
|
1757
|
+
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1758
|
+
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1759
|
+
spl1 = InSubset; spl2 = NotInSubset;
|
1760
|
+
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
1761
|
+
&workspace.buffer_subset_categ_best[0], workspace.col_best);
|
1762
|
+
|
1763
|
+
} else {
|
1764
|
+
spl1 = SingleCateg;
|
1765
|
+
workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
|
1766
|
+
std::sort(&workspace.ix_arr[0] + workspace.st, &workspace.ix_arr[0] + workspace.end + 1,
|
1767
|
+
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1768
|
+
set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
|
1769
|
+
|
1770
|
+
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
1771
|
+
if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] >= 0) {
|
1772
|
+
workspace.this_split_NA = row;
|
1773
|
+
break;
|
1774
|
+
}
|
1775
|
+
}
|
1776
|
+
}
|
1777
|
+
break;
|
1778
|
+
}
|
1779
|
+
|
1780
|
+
|
1781
|
+
}
|
1782
|
+
|
1783
|
+
|
1784
|
+
ix1 = workspace.this_split_NA;
|
1785
|
+
ix2 = workspace.this_split_ix;
|
1786
|
+
ix3 = workspace.end;
|
1787
|
+
|
1788
|
+
/* NA branch */
|
1789
|
+
if (workspace.st > workspace.this_split_NA &&
|
1790
|
+
(workspace.st - workspace.this_split_NA) >= 2 * model_params.min_size_categ) {
|
1791
|
+
|
1792
|
+
workspace.end = ix1 - 1;
|
1793
|
+
(*workspace.tree)[tree_from].tree_NA = workspace.tree->size();
|
1794
|
+
workspace.tree->emplace_back(tree_from, IsNa);
|
1795
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, true);
|
1796
|
+
}
|
1797
|
+
|
1798
|
+
if (spl1 == SingleCateg) {
|
1799
|
+
|
1800
|
+
/* TODO: this should be done instead in a loop per category looking for the start and end positions
|
1801
|
+
in ix_arr of each category using std::lower_bound */
|
1802
|
+
|
1803
|
+
/* TODO: it's not necessary to backup everything like when using 'follow_all', only need 'best_col' and 'temp_ptr_x' */
|
1804
|
+
state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
1805
|
+
for (int cat = 1; cat < input_data.ncat[workspace.col_best]; cat++) {
|
1806
|
+
|
1807
|
+
/* TODO: this is inefficient when some categories are not present, should instead at first do a pass over 'ix_arr'
|
1808
|
+
to calculate the start and end indices of each category, then loop over that array instead */
|
1809
|
+
for (size_t row = ix1 + 1; row < ix3; row++) {
|
1810
|
+
if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] == cat) {
|
1811
|
+
if ((row - ix1) >= 2 * model_params.min_size_categ) {
|
1812
|
+
(*workspace.tree)[tree_from].binary_branches[cat-1] = workspace.tree->size();
|
1813
|
+
workspace.tree->emplace_back(tree_from, spl1);
|
1814
|
+
backup_recursion_state(workspace, *state_backup);
|
1815
|
+
workspace.st = ix1;
|
1816
|
+
workspace.end = row - 1;
|
1817
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
1818
|
+
restore_recursion_state(workspace, *state_backup);
|
1819
|
+
}
|
1820
|
+
ix1 = row;
|
1821
|
+
break;
|
1822
|
+
}
|
1823
|
+
else if (workspace.temp_ptr_x[ workspace.ix_arr[row] ] > cat) {
|
1824
|
+
ix1 = row;
|
1825
|
+
break;
|
1826
|
+
}
|
1827
|
+
}
|
1828
|
+
|
1829
|
+
}
|
1830
|
+
/* last category is given by the end index */
|
1831
|
+
if ((ix3 - ix1) >= 2 * model_params.min_size_categ) {
|
1832
|
+
(*workspace.tree)[tree_from].binary_branches[input_data.ncat[workspace.col_best]-1] = workspace.tree->size();
|
1833
|
+
workspace.tree->emplace_back(tree_from, spl1);
|
1834
|
+
workspace.st = ix1;
|
1835
|
+
workspace.end = ix3;
|
1836
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
1837
|
+
} else {
|
1838
|
+
(*workspace.tree)[tree_from].binary_branches.push_back(0);
|
1839
|
+
}
|
1840
|
+
|
1841
|
+
} else {
|
1842
|
+
/* numeric, ordinal, and subset split */
|
1843
|
+
|
1844
|
+
/* left branch */
|
1845
|
+
if ((ix2 - ix1) >= 2 * model_params.min_size_categ) {
|
1846
|
+
workspace.st = ix1;
|
1847
|
+
workspace.end = ix2 - 1;
|
1848
|
+
(*workspace.tree)[tree_from].tree_left = workspace.tree->size();
|
1849
|
+
workspace.tree->emplace_back(tree_from, spl1);
|
1850
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
1851
|
+
}
|
1852
|
+
|
1853
|
+
/* right branch */
|
1854
|
+
if ((ix3 - ix2) > 2 * model_params.min_size_categ) {
|
1855
|
+
workspace.st = ix2;
|
1856
|
+
workspace.end = ix3;
|
1857
|
+
(*workspace.tree)[tree_from].tree_right = workspace.tree->size();
|
1858
|
+
workspace.tree->emplace_back(tree_from, spl2);
|
1859
|
+
recursive_split_categ(workspace, input_data, model_params, curr_depth, is_NA_branch);
|
1860
|
+
}
|
1861
|
+
|
1862
|
+
}
|
1863
|
+
|
1864
|
+
|
1865
|
+
}
|
1866
|
+
|
1867
|
+
|
1868
|
+
/* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
|
1869
|
+
drop_if_not_needed:
|
1870
|
+
if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
|
1871
|
+
|
1872
|
+
if (tree_from == 0) {
|
1873
|
+
workspace.tree->clear();
|
1874
|
+
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.size() > 0) {
|
1875
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
1876
|
+
workspace.tree->pop_back();
|
1877
|
+
} else {
|
1878
|
+
switch((*workspace.tree)[tree_from].parent_branch) {
|
1879
|
+
|
1880
|
+
case IsNa:
|
1881
|
+
{
|
1882
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_NA = 0;
|
1883
|
+
workspace.tree->pop_back();
|
1884
|
+
break;
|
1885
|
+
}
|
1886
|
+
|
1887
|
+
case LessOrEqual:
|
1888
|
+
{
|
1889
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
|
1890
|
+
workspace.tree->pop_back();
|
1891
|
+
break;
|
1892
|
+
}
|
1893
|
+
|
1894
|
+
case Greater:
|
1895
|
+
{
|
1896
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
1897
|
+
workspace.tree->pop_back();
|
1898
|
+
break;
|
1899
|
+
}
|
1900
|
+
|
1901
|
+
case InSubset:
|
1902
|
+
{
|
1903
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_left = 0;
|
1904
|
+
workspace.tree->pop_back();
|
1905
|
+
break;
|
1906
|
+
}
|
1907
|
+
|
1908
|
+
case NotInSubset:
|
1909
|
+
{
|
1910
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
1911
|
+
workspace.tree->pop_back();
|
1912
|
+
break;
|
1913
|
+
}
|
1914
|
+
|
1915
|
+
case SingleCateg:
|
1916
|
+
{
|
1917
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].binary_branches.back() = 0;
|
1918
|
+
workspace.tree->pop_back();
|
1919
|
+
break;
|
1920
|
+
}
|
1921
|
+
|
1922
|
+
case SubTrees:
|
1923
|
+
{
|
1924
|
+
(*workspace.tree)[(*workspace.tree)[tree_from].parent].binary_branches.pop_back();
|
1925
|
+
workspace.tree->pop_back();
|
1926
|
+
break;
|
1927
|
+
}
|
1928
|
+
}
|
1929
|
+
}
|
1930
|
+
}
|
1931
|
+
|
1932
|
+
}
|