outliertree 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +23 -0
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +83 -41
- data/vendor/outliertree/src/Makevars.in +3 -0
- data/vendor/outliertree/src/Makevars.win +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +17 -27
- data/vendor/outliertree/src/Rwrapper.cpp +354 -62
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +525 -331
- data/vendor/outliertree/src/misc.cpp +166 -17
- data/vendor/outliertree/src/outlier_tree.hpp +164 -56
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -134,6 +134,7 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
134
134
|
size_t max_depth, double max_perc_outliers, size_t min_size_numeric, size_t min_size_categ,
|
135
135
|
double min_gain, bool gain_as_pct, bool follow_all, double z_norm, double z_outlier)
|
136
136
|
{
|
137
|
+
SignalSwitcher ss = SignalSwitcher();
|
137
138
|
|
138
139
|
/* put parameters and data into structs to avoid passing too many function arguments each time */
|
139
140
|
double z_tail = z_outlier - z_norm;
|
@@ -188,10 +189,10 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
188
189
|
/* determine maximum number of categories in a column, allocate arrays for category counts and proportions */
|
189
190
|
model_outputs.start_ix_cat_counts[0] = 0;
|
190
191
|
if (tot_cols > ncols_numeric) {
|
191
|
-
input_data.max_categ = calculate_category_indices(
|
192
|
-
(
|
193
|
-
input_data.max_categ = calculate_category_indices(
|
194
|
-
(
|
192
|
+
input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data(), input_data.ncat, input_data.ncols_categ,
|
193
|
+
input_data.skip_col.data() + ncols_numeric);
|
194
|
+
input_data.max_categ = calculate_category_indices(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.ncat_ord, input_data.ncols_ord,
|
195
|
+
input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ, input_data.max_categ);
|
195
196
|
} else {
|
196
197
|
input_data.max_categ = 0;
|
197
198
|
}
|
@@ -201,44 +202,46 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
201
202
|
model_params.prop_small.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
|
202
203
|
model_outputs.prop_categ.resize(model_outputs.start_ix_cat_counts[ncols_categ + ncols_ord]);
|
203
204
|
|
205
|
+
check_interrupt_switch(ss);
|
206
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
207
|
+
if (interrupt_switch) return false;
|
208
|
+
#endif
|
209
|
+
|
204
210
|
/* calculate prior probabilities for categorical variables (in parallel), see if any is unsplittable */
|
205
211
|
if (tot_cols > ncols_numeric) {
|
206
|
-
#pragma omp parallel
|
212
|
+
#pragma omp parallel sections if(nthreads > 1)
|
207
213
|
{
|
208
|
-
#pragma omp sections
|
209
|
-
{
|
210
214
|
|
211
|
-
|
212
|
-
|
213
|
-
|
214
|
-
|
215
|
-
|
216
|
-
|
217
|
-
|
218
|
-
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
215
|
+
#pragma omp section
|
216
|
+
{
|
217
|
+
if (ncols_categ > 0) {
|
218
|
+
calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
|
219
|
+
input_data.categorical_data, input_data.ncols_categ, input_data.nrows,
|
220
|
+
input_data.has_NA.data() + ncols_numeric, input_data.skip_col.data() + input_data.ncols_numeric,
|
221
|
+
std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)) );
|
222
|
+
|
223
|
+
check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data(), input_data.cat_counts.data(), input_data.ncat,
|
224
|
+
input_data.ncols_categ, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
|
225
|
+
input_data.skip_col.data() + input_data.ncols_numeric,
|
226
|
+
std::min(input_data.ncols_categ, (size_t)std::max(1, nthreads - 1)));
|
227
|
+
}
|
224
228
|
|
225
229
|
|
226
|
-
|
230
|
+
}
|
227
231
|
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
|
234
|
-
|
235
|
-
|
236
|
-
|
237
|
-
|
238
|
-
|
239
|
-
|
240
|
-
|
241
|
-
}
|
232
|
+
#pragma omp section
|
233
|
+
{
|
234
|
+
if (ncols_ord > 0) {
|
235
|
+
calculate_all_cat_counts(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
|
236
|
+
input_data.ordinal_data, input_data.ncols_ord, input_data.nrows,
|
237
|
+
input_data.has_NA.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
238
|
+
input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
239
|
+
std::max((int)1, nthreads - (int)input_data.ncols_categ) );
|
240
|
+
|
241
|
+
check_cat_col_unsplittable(model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ, input_data.cat_counts.data(), input_data.ncat_ord,
|
242
|
+
ncols_ord, std::min(model_params.min_size_numeric, model_params.min_size_categ), input_data.nrows,
|
243
|
+
input_data.skip_col.data() + input_data.ncols_numeric + input_data.ncols_categ,
|
244
|
+
std::max((int)1, nthreads - (int)input_data.ncols_categ));
|
242
245
|
}
|
243
246
|
}
|
244
247
|
|
@@ -246,21 +249,21 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
246
249
|
|
247
250
|
|
248
251
|
/* calculate proportion limit and CI for each category of each column */
|
249
|
-
calculate_lowerlim_proportion(
|
250
|
-
|
251
|
-
calculate_lowerlim_proportion(
|
252
|
-
|
252
|
+
calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data(),
|
253
|
+
input_data.cat_counts.data(), input_data.ncols_categ, input_data.nrows, model_params.z_norm, model_params.z_tail);
|
254
|
+
calculate_lowerlim_proportion(model_params.prop_small.data(), model_outputs.prop_categ.data(), model_outputs.start_ix_cat_counts.data() + input_data.ncols_categ,
|
255
|
+
input_data.cat_counts.data(), input_data.ncols_ord, input_data.nrows, model_params.z_norm, model_params.z_tail);
|
253
256
|
}
|
254
257
|
|
255
258
|
/* for numerical columns, check if they have NAs or if total variance is too small */
|
256
259
|
check_missing_no_variance(input_data.numeric_data, input_data.ncols_numeric, input_data.nrows,
|
257
|
-
|
260
|
+
input_data.has_NA.data(), input_data.skip_col.data(),
|
258
261
|
model_outputs.min_decimals_col.data(), nthreads);
|
259
262
|
|
260
263
|
/* determine an approximate size for the output clusters, and reserve memory right away */
|
261
264
|
model_outputs.all_clusters.resize(tot_cols);
|
262
265
|
model_outputs.all_trees.resize(tot_cols);
|
263
|
-
#pragma omp parallel for shared(model_outputs, input_data, model_params, tot_cols)
|
266
|
+
#pragma omp parallel for num_threads(nthreads) shared(model_outputs, input_data, model_params, tot_cols)
|
264
267
|
for (size_t_for col = 0; col < tot_cols; col++) {
|
265
268
|
if (input_data.skip_col[col]) continue;
|
266
269
|
if (cols_ignore != NULL && cols_ignore[col]) continue;
|
@@ -269,133 +272,159 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
269
272
|
/* this is not exact as categoricals and ordinals can also be split multiple times */
|
270
273
|
}
|
271
274
|
|
275
|
+
check_interrupt_switch(ss);
|
276
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
277
|
+
if (interrupt_switch) return false;
|
278
|
+
#endif
|
272
279
|
|
273
280
|
/* now run the procedure on each column separately */
|
274
281
|
int tid;
|
282
|
+
bool threw_exception = false;
|
283
|
+
std::exception_ptr ex = NULL;
|
275
284
|
nthreads = std::min(nthreads, (int)(ncols_numeric + ncols_categ + ncols_ord));
|
276
285
|
#pragma omp parallel for num_threads(nthreads) schedule(dynamic, 1) private(tid) shared(workspace, model_outputs, input_data, model_params, tot_cols)
|
277
286
|
for (size_t_for col = 0; col < tot_cols; col++) {
|
278
287
|
|
288
|
+
if (interrupt_switch || threw_exception) continue;
|
289
|
+
|
279
290
|
if (cols_ignore != NULL && cols_ignore[col]) continue;
|
280
291
|
if (input_data.skip_col[col] && col < input_data.ncols_numeric) continue;
|
281
292
|
tid = omp_get_thread_num();
|
282
293
|
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
|
290
|
-
|
291
|
-
|
292
|
-
|
293
|
-
|
294
|
-
|
295
|
-
|
296
|
-
|
294
|
+
try {
|
295
|
+
/* re-use thread-private memory if possible */
|
296
|
+
if (!check_workspace_is_allocated(workspace[tid]))
|
297
|
+
allocate_thread_workspace(workspace[tid], input_data.nrows, input_data.max_categ);
|
298
|
+
|
299
|
+
/* numerical column */
|
300
|
+
if (col < input_data.ncols_numeric) {
|
301
|
+
process_numeric_col(model_outputs.all_clusters[col],
|
302
|
+
model_outputs.all_trees[col],
|
303
|
+
col,
|
304
|
+
workspace[tid],
|
305
|
+
input_data,
|
306
|
+
model_params, model_outputs);
|
307
|
+
calculate_cluster_minimums(model_outputs, col);
|
308
|
+
}
|
297
309
|
|
298
|
-
|
299
|
-
|
300
|
-
|
301
|
-
|
302
|
-
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
310
|
+
/* categorical column */
|
311
|
+
else if (col < (input_data.ncols_numeric + input_data.ncols_categ)) {
|
312
|
+
process_categ_col(model_outputs.all_clusters[col],
|
313
|
+
model_outputs.all_trees[col],
|
314
|
+
col, false,
|
315
|
+
workspace[tid],
|
316
|
+
input_data,
|
317
|
+
model_params, model_outputs);
|
318
|
+
calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
|
319
|
+
}
|
308
320
|
|
309
|
-
|
310
|
-
|
311
|
-
|
312
|
-
|
313
|
-
|
314
|
-
|
315
|
-
|
316
|
-
|
317
|
-
|
318
|
-
|
321
|
+
/* ordinal column */
|
322
|
+
else {
|
323
|
+
process_categ_col(model_outputs.all_clusters[col],
|
324
|
+
model_outputs.all_trees[col],
|
325
|
+
col, true,
|
326
|
+
workspace[tid],
|
327
|
+
input_data,
|
328
|
+
model_params, model_outputs);
|
329
|
+
calculate_cluster_poss_categs(model_outputs, col, col - input_data.ncols_numeric);
|
330
|
+
}
|
319
331
|
|
320
|
-
|
321
|
-
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
332
|
+
/* shrink the dynamic vectors to what ended up used only */
|
333
|
+
#ifdef TEST_MODE_DEFINE
|
334
|
+
prune_unused_trees(model_outputs.all_trees[col]);
|
335
|
+
#endif
|
336
|
+
if (
|
337
|
+
model_outputs.all_clusters[col].size() == 0 ||
|
338
|
+
model_outputs.all_trees[col].size() == 0 ||
|
339
|
+
check_tree_is_not_needed(model_outputs.all_trees[col][0])
|
340
|
+
)
|
341
|
+
{
|
342
|
+
model_outputs.all_trees[col].clear();
|
343
|
+
model_outputs.all_clusters[col].clear();
|
344
|
+
}
|
345
|
+
model_outputs.all_trees[col].shrink_to_fit();
|
346
|
+
model_outputs.all_clusters[col].shrink_to_fit();
|
347
|
+
|
348
|
+
/* simplify single-elements in subset to 'equals' or 'not equals' */
|
349
|
+
simplify_when_equal_cond(model_outputs.all_clusters[col], ncat_ord);
|
350
|
+
simplify_when_equal_cond(model_outputs.all_trees[col], ncat_ord);
|
339
351
|
|
340
|
-
|
341
|
-
|
342
|
-
|
352
|
+
/* remember only the best (rarest) value for each row */
|
353
|
+
#pragma omp critical
|
354
|
+
if (workspace[tid].col_has_outliers) {
|
343
355
|
|
344
|
-
|
345
|
-
|
356
|
+
found_outliers = true;
|
357
|
+
for (size_t row = 0; row < input_data.nrows; row++) {
|
346
358
|
|
347
|
-
|
359
|
+
if (workspace[tid].outlier_scores[row] < 1.0) {
|
348
360
|
|
349
|
-
|
350
|
-
|
351
|
-
(
|
352
|
-
workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
|
353
|
-
(
|
354
|
-
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
|
355
|
-
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
356
|
-
)
|
357
|
-
) ||
|
358
|
-
(
|
359
|
-
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
|
360
|
-
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
361
|
-
) ||
|
361
|
+
if (
|
362
|
+
model_outputs.outlier_scores_final[row] >= 1.0 ||
|
362
363
|
(
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
369
|
-
<
|
370
|
-
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
364
|
+
workspace[tid].outlier_depth[row] < model_outputs.outlier_depth_final[row] &&
|
365
|
+
(
|
366
|
+
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch ||
|
367
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
368
|
+
)
|
371
369
|
) ||
|
372
|
-
|
370
|
+
(
|
371
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch &&
|
372
|
+
!model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
373
|
+
) ||
|
374
|
+
(
|
373
375
|
workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
|
374
|
-
|
375
|
-
|
376
|
-
|
377
|
-
|
378
|
-
|
379
|
-
|
380
|
-
|
381
|
-
|
382
|
-
|
383
|
-
|
384
|
-
|
385
|
-
|
386
|
-
|
387
|
-
|
388
|
-
|
389
|
-
|
390
|
-
|
376
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
377
|
+
==
|
378
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
379
|
+
&&
|
380
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
381
|
+
<
|
382
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
383
|
+
) ||
|
384
|
+
(
|
385
|
+
workspace[tid].outlier_depth[row] == model_outputs.outlier_depth_final[row] &&
|
386
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].cluster_size
|
387
|
+
==
|
388
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].cluster_size
|
389
|
+
&&
|
390
|
+
model_outputs.all_clusters[col][workspace[tid].outlier_clusters[row]].has_NA_branch
|
391
|
+
==
|
392
|
+
model_outputs.all_clusters[model_outputs.outlier_columns_final[row]][model_outputs.outlier_clusters_final[row]].has_NA_branch
|
393
|
+
&&
|
394
|
+
workspace[tid].outlier_scores[row] < model_outputs.outlier_scores_final[row]
|
395
|
+
)
|
396
|
+
)
|
397
|
+
{
|
398
|
+
model_outputs.outlier_scores_final[row] = workspace[tid].outlier_scores[row];
|
399
|
+
model_outputs.outlier_clusters_final[row] = workspace[tid].outlier_clusters[row];
|
400
|
+
model_outputs.outlier_trees_final[row] = workspace[tid].outlier_trees[row];
|
401
|
+
model_outputs.outlier_depth_final[row] = workspace[tid].outlier_depth[row];
|
402
|
+
model_outputs.outlier_columns_final[row] = col;
|
403
|
+
}
|
391
404
|
}
|
405
|
+
|
392
406
|
}
|
407
|
+
}
|
408
|
+
}
|
393
409
|
|
410
|
+
catch(...) {
|
411
|
+
#pragma omp critical
|
412
|
+
{
|
413
|
+
if (!threw_exception) {
|
414
|
+
threw_exception = true;
|
415
|
+
ex = std::current_exception();
|
416
|
+
}
|
394
417
|
}
|
395
418
|
}
|
419
|
+
}
|
396
420
|
|
421
|
+
check_interrupt_switch(ss);
|
422
|
+
#if defined(DONT_THROW_ON_INTERRUPT)
|
423
|
+
if (interrupt_switch) return false;
|
424
|
+
#endif
|
397
425
|
|
398
|
-
|
426
|
+
if (threw_exception)
|
427
|
+
std::rethrow_exception(ex);
|
399
428
|
|
400
429
|
/* once finished, determine how many decimals to report for numerical outliers */
|
401
430
|
if (found_outliers)
|
@@ -443,21 +472,23 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
443
472
|
ModelParams &model_params,
|
444
473
|
ModelOutputs &model_outputs)
|
445
474
|
{
|
475
|
+
if (interrupt_switch) return;
|
476
|
+
|
446
477
|
/* discard NAs and infinites */
|
447
478
|
workspace.target_col_num = target_col_num;
|
448
479
|
workspace.target_numeric_col = input_data.numeric_data + target_col_num * input_data.nrows;
|
449
480
|
workspace.orig_target_col = workspace.target_numeric_col;
|
450
481
|
workspace.end = input_data.nrows - 1;
|
451
|
-
workspace.st = move_NAs_to_front(
|
482
|
+
workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_numeric_col, 0, workspace.end, true);
|
452
483
|
workspace.col_has_outliers = false;
|
453
484
|
|
454
485
|
/* check for problematic distributions - need to sort data first */
|
455
|
-
std::sort(
|
486
|
+
std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
|
456
487
|
[&workspace](const size_t a, const size_t b){return workspace.target_numeric_col[a] < workspace.target_numeric_col[b];});
|
457
488
|
|
458
489
|
long double running_mean = 0;
|
459
|
-
long double mean_prev = 0;
|
460
490
|
long double running_ssq = 0;
|
491
|
+
long double mean_prev = workspace.target_numeric_col[workspace.ix_arr[workspace.st]];
|
461
492
|
double xval;
|
462
493
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
463
494
|
xval = workspace.target_numeric_col[workspace.ix_arr[row]];
|
@@ -466,10 +497,10 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
466
497
|
mean_prev = running_mean;
|
467
498
|
}
|
468
499
|
|
469
|
-
check_for_tails(
|
500
|
+
check_for_tails(workspace.ix_arr.data(), workspace.st, workspace.end, workspace.target_numeric_col,
|
470
501
|
model_params.z_norm, model_params.max_perc_outliers,
|
471
|
-
|
472
|
-
(double)
|
502
|
+
workspace.buffer_transf_y.data(), (double)running_mean,
|
503
|
+
(double)std::sqrt(running_ssq / (long double)(workspace.end - workspace.st)),
|
473
504
|
&workspace.left_tail, &workspace.right_tail,
|
474
505
|
&workspace.exp_transf, &workspace.log_transf);
|
475
506
|
|
@@ -480,11 +511,11 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
480
511
|
if (workspace.exp_transf) {
|
481
512
|
|
482
513
|
workspace.orig_mean = (double) running_mean;
|
483
|
-
workspace.orig_sd = (double)
|
514
|
+
workspace.orig_sd = (double) std::sqrt(running_ssq / (long double)(workspace.end - workspace.st));
|
484
515
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
485
516
|
workspace.buffer_transf_y[workspace.ix_arr[row]] = exp(z_score(workspace.target_numeric_col[workspace.ix_arr[row]], workspace.orig_mean, workspace.orig_sd));
|
486
517
|
}
|
487
|
-
workspace.target_numeric_col =
|
518
|
+
workspace.target_numeric_col = workspace.buffer_transf_y.data();
|
488
519
|
model_outputs.col_transf[workspace.target_col_num] = Exp;
|
489
520
|
model_outputs.transf_offset[workspace.target_col_num] = workspace.orig_mean;
|
490
521
|
model_outputs.sd_div[workspace.target_col_num] = workspace.orig_sd;
|
@@ -501,7 +532,7 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
501
532
|
for (size_t row = workspace.st; row <= workspace.end; row++) {
|
502
533
|
workspace.buffer_transf_y[workspace.ix_arr[row]] = log(workspace.target_numeric_col[workspace.ix_arr[row]] - workspace.log_minval);
|
503
534
|
}
|
504
|
-
workspace.target_numeric_col =
|
535
|
+
workspace.target_numeric_col = workspace.buffer_transf_y.data();
|
505
536
|
model_outputs.col_transf[workspace.target_col_num] = Log;
|
506
537
|
model_outputs.transf_offset[workspace.target_col_num] = workspace.log_minval;
|
507
538
|
|
@@ -514,9 +545,9 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
514
545
|
workspace.tree->emplace_back(0, Root);
|
515
546
|
|
516
547
|
workspace.clusters->emplace_back(NoType, Root);
|
517
|
-
workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
518
|
-
workspace.end,
|
519
|
-
|
548
|
+
workspace.col_has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
549
|
+
workspace.end, workspace.outlier_scores.data(),
|
550
|
+
workspace.outlier_clusters.data(), workspace.outlier_trees.data(), workspace.outlier_depth.data(),
|
520
551
|
workspace.clusters->back(), *(workspace.clusters), 0, 0, 0,
|
521
552
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
522
553
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -526,16 +557,19 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
|
|
526
557
|
|
527
558
|
/* remove outliers if any were found */
|
528
559
|
if (workspace.has_outliers)
|
529
|
-
workspace.st = move_outliers_to_front(
|
560
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
530
561
|
|
531
562
|
/* update statistics if they've changed */
|
532
563
|
if (workspace.has_outliers || workspace.exp_transf || workspace.log_transf)
|
533
|
-
workspace.sd_y = calc_sd(
|
564
|
+
workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
|
534
565
|
workspace.st, workspace.end, &workspace.mean_y);
|
535
566
|
else
|
536
567
|
workspace.sd_y = sqrtl(running_ssq / (long double)(workspace.end - workspace.st));
|
537
568
|
|
538
|
-
if (model_params.max_depth > 0
|
569
|
+
if (model_params.max_depth > 0 && workspace.sd_y > 0) {
|
570
|
+
workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
|
571
|
+
recursive_split_numeric(workspace, input_data, model_params, 0, false);
|
572
|
+
}
|
539
573
|
}
|
540
574
|
|
541
575
|
void recursive_split_numeric(Workspace &workspace,
|
@@ -543,19 +577,27 @@ void recursive_split_numeric(Workspace &workspace,
|
|
543
577
|
ModelParams &model_params,
|
544
578
|
size_t curr_depth, bool is_NA_branch)
|
545
579
|
{
|
580
|
+
if (interrupt_switch) return;
|
581
|
+
|
546
582
|
workspace.best_gain = -HUGE_VAL;
|
547
583
|
workspace.column_type_best = NoType;
|
548
584
|
workspace.lev_has_outliers = false;
|
549
|
-
if (curr_depth > 0) workspace.sd_y = calc_sd(&workspace.ix_arr[0], workspace.target_numeric_col,
|
550
|
-
workspace.st, workspace.end, &workspace.mean_y);
|
551
585
|
|
552
586
|
/* these are used to keep track of where to continue after calling a further recursion */
|
553
587
|
size_t ix1, ix2, ix3;
|
554
588
|
SplitType spl1, spl2;
|
555
589
|
size_t tree_from = workspace.tree->size() - 1;
|
590
|
+
std::unique_ptr<RecursionState> state_backup;
|
591
|
+
ExhaustedColumnsLevel level_col_tracker;
|
592
|
+
|
593
|
+
if (curr_depth > 0) {
|
594
|
+
workspace.sd_y = calc_sd(workspace.ix_arr.data(), workspace.target_numeric_col,
|
595
|
+
workspace.st, workspace.end, &workspace.mean_y);
|
596
|
+
if (workspace.sd_y <= 0) goto abandon_column;
|
597
|
+
}
|
598
|
+
level_col_tracker.initialize(&workspace.exhausted_col_tracker);
|
556
599
|
|
557
600
|
/* when using 'follow_all' need to keep track of a lot more things */
|
558
|
-
std::unique_ptr<RecursionState> state_backup;
|
559
601
|
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
560
602
|
|
561
603
|
|
@@ -566,10 +608,16 @@ void recursive_split_numeric(Workspace &workspace,
|
|
566
608
|
|
567
609
|
if (col == workspace.target_col_num) continue;
|
568
610
|
if (input_data.skip_col[col]) continue;
|
569
|
-
|
611
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
|
612
|
+
split_numericx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
570
613
|
workspace.target_numeric_col, workspace.sd_y, (bool)(input_data.has_NA[col]), model_params.min_size_numeric,
|
571
|
-
model_params.take_mid,
|
572
|
-
&(workspace.this_split_ix), &(workspace.this_split_NA));
|
614
|
+
model_params.take_mid, workspace.buffer_sd.data(), &(workspace.this_gain), &(workspace.this_split_point),
|
615
|
+
&(workspace.this_split_ix), &(workspace.this_split_NA), &workspace.has_zero_variance);
|
616
|
+
if (workspace.has_zero_variance) {
|
617
|
+
workspace.has_zero_variance = false;
|
618
|
+
workspace.exhausted_col_tracker.push_col(col);
|
619
|
+
continue;
|
620
|
+
}
|
573
621
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
574
622
|
|
575
623
|
/* if the gain is not insignificant, check clusters created by this split */
|
@@ -581,9 +629,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
581
629
|
|
582
630
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
583
631
|
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
584
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
585
|
-
workspace.this_split_NA - 1,
|
586
|
-
|
632
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
633
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
634
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
587
635
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
588
636
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
589
637
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -596,6 +644,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
596
644
|
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
597
645
|
backup_recursion_state(workspace, *state_backup);
|
598
646
|
workspace.end = workspace.this_split_NA - 1;
|
647
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
648
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
649
|
+
workspace.exhausted_col_tracker.push_col(col);
|
599
650
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
600
651
|
restore_recursion_state(workspace, *state_backup);
|
601
652
|
}
|
@@ -605,9 +656,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
605
656
|
/* left branch */
|
606
657
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
607
658
|
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
608
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
609
|
-
workspace.this_split_ix,
|
610
|
-
|
659
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
660
|
+
workspace.this_split_ix, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
661
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
611
662
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
612
663
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
613
664
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -629,9 +680,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
629
680
|
/* right branch */
|
630
681
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
631
682
|
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
632
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
633
|
-
workspace.end,
|
634
|
-
|
683
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix + 1,
|
684
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
685
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
635
686
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
636
687
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
637
688
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -665,29 +716,36 @@ void recursive_split_numeric(Workspace &workspace,
|
|
665
716
|
for (size_t col = 0; col < input_data.ncols_categ; col++) {
|
666
717
|
|
667
718
|
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
719
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
|
668
720
|
|
669
|
-
split_categx_numericy(
|
670
|
-
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col],
|
671
|
-
|
721
|
+
split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.categorical_data + col * input_data.nrows,
|
722
|
+
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, false, input_data.ncat[col], workspace.buffer_cat_cnt.data(),
|
723
|
+
workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
|
672
724
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_numeric,
|
673
|
-
&(workspace.this_gain),
|
725
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(), NULL,
|
726
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
727
|
+
if (workspace.has_zero_variance) {
|
728
|
+
workspace.has_zero_variance = false;
|
729
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
730
|
+
continue;
|
731
|
+
}
|
674
732
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
675
733
|
|
676
734
|
if (workspace.this_gain >= model_params.min_gain) {
|
677
735
|
|
678
736
|
/* data is not arranged inside the splitting function, need to now assign to the branches as determined */
|
679
|
-
divide_subset_split(
|
680
|
-
|
737
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end,
|
738
|
+
workspace.buffer_subset_categ.data(), input_data.ncat[col], (bool)(workspace.buffer_cat_cnt[input_data.ncat[col]] > 0),
|
681
739
|
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
682
740
|
|
683
741
|
/* NA branch */
|
684
742
|
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_numeric) {
|
685
743
|
|
686
744
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
687
|
-
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
688
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
689
|
-
workspace.this_split_NA - 1,
|
690
|
-
|
745
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
|
746
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
747
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
748
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
691
749
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
692
750
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
693
751
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -697,9 +755,12 @@ void recursive_split_numeric(Workspace &workspace,
|
|
697
755
|
|
698
756
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
699
757
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
700
|
-
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
758
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
|
701
759
|
backup_recursion_state(workspace, *state_backup);
|
702
760
|
workspace.end = workspace.this_split_NA - 1;
|
761
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
762
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
763
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
703
764
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
704
765
|
restore_recursion_state(workspace, *state_backup);
|
705
766
|
}
|
@@ -708,10 +769,10 @@ void recursive_split_numeric(Workspace &workspace,
|
|
708
769
|
|
709
770
|
/* left branch */
|
710
771
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
711
|
-
workspace.clusters->emplace_back(Categorical, col, InSubset,
|
712
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
713
|
-
workspace.this_split_ix - 1,
|
714
|
-
|
772
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
773
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
774
|
+
workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
775
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
715
776
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
716
777
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
717
778
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -721,20 +782,25 @@ void recursive_split_numeric(Workspace &workspace,
|
|
721
782
|
|
722
783
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
723
784
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
724
|
-
workspace.tree->emplace_back(tree_from, col, InSubset,
|
785
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
725
786
|
backup_recursion_state(workspace, *state_backup);
|
726
787
|
workspace.st = workspace.this_split_NA;
|
727
788
|
workspace.end = workspace.this_split_ix - 1;
|
789
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
790
|
+
if (workspace.is_binary_split) {
|
791
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
792
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
793
|
+
}
|
728
794
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
729
795
|
restore_recursion_state(workspace, *state_backup);
|
730
796
|
}
|
731
797
|
|
732
798
|
/* right branch */
|
733
799
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
734
|
-
workspace.clusters->emplace_back(Categorical, col, NotInSubset,
|
735
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
736
|
-
workspace.end,
|
737
|
-
|
800
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
801
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
|
802
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
803
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
738
804
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
739
805
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
740
806
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -744,9 +810,14 @@ void recursive_split_numeric(Workspace &workspace,
|
|
744
810
|
|
745
811
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
746
812
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
747
|
-
workspace.tree->emplace_back(tree_from, col, NotInSubset,
|
813
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
748
814
|
backup_recursion_state(workspace, *state_backup);
|
749
815
|
workspace.st = workspace.this_split_ix;
|
816
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
817
|
+
if (workspace.is_binary_split) {
|
818
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
819
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
820
|
+
}
|
750
821
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
751
822
|
restore_recursion_state(workspace, *state_backup);
|
752
823
|
}
|
@@ -755,7 +826,8 @@ void recursive_split_numeric(Workspace &workspace,
|
|
755
826
|
workspace.best_gain = workspace.this_gain;
|
756
827
|
workspace.column_type_best = Categorical;
|
757
828
|
workspace.col_best = col;
|
758
|
-
|
829
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
830
|
+
memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
|
759
831
|
}
|
760
832
|
|
761
833
|
}
|
@@ -766,18 +838,25 @@ void recursive_split_numeric(Workspace &workspace,
|
|
766
838
|
for (size_t col = 0; col < input_data.ncols_ord; col++) {
|
767
839
|
|
768
840
|
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
841
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
769
842
|
|
770
843
|
/* same code as for categorical, but this time with split level as int instead of boolean array as subset */
|
771
|
-
split_categx_numericy(
|
772
|
-
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col],
|
773
|
-
|
844
|
+
split_categx_numericy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.ordinal_data + col * input_data.nrows,
|
845
|
+
workspace.target_numeric_col, workspace.sd_y, workspace.mean_y, true, input_data.ncat_ord[col], workspace.buffer_cat_cnt.data(),
|
846
|
+
workspace.buffer_cat_sum.data(), workspace.buffer_cat_sum_sq.data(), workspace.buffer_cat_sorted.data(),
|
774
847
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]), model_params.min_size_numeric,
|
775
|
-
&(workspace.this_gain),
|
848
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(), &(workspace.this_split_lev),
|
849
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
850
|
+
if (workspace.has_zero_variance) {
|
851
|
+
workspace.has_zero_variance = false;
|
852
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
853
|
+
continue;
|
854
|
+
}
|
776
855
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.sd_y;
|
777
856
|
|
778
857
|
if (workspace.this_gain >= model_params.min_gain) {
|
779
858
|
|
780
|
-
divide_subset_split(
|
859
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
781
860
|
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
782
861
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
783
862
|
|
@@ -785,9 +864,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
785
864
|
|
786
865
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
787
866
|
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
788
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
789
|
-
workspace.this_split_NA - 1,
|
790
|
-
|
867
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.st,
|
868
|
+
workspace.this_split_NA - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
869
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
791
870
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
792
871
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
793
872
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -800,6 +879,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
800
879
|
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
801
880
|
backup_recursion_state(workspace, *state_backup);
|
802
881
|
workspace.end = workspace.this_split_NA - 1;
|
882
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
883
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
884
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
803
885
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, true);
|
804
886
|
restore_recursion_state(workspace, *state_backup);
|
805
887
|
}
|
@@ -809,9 +891,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
809
891
|
/* left branch */
|
810
892
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
811
893
|
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
812
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
813
|
-
workspace.this_split_ix - 1,
|
814
|
-
|
894
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_NA,
|
895
|
+
workspace.this_split_ix - 1, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
896
|
+
workspace.outlier_trees.data(), &workspace.outlier_depth[0], workspace.clusters->back(), *(workspace.clusters),
|
815
897
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
816
898
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
817
899
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -825,6 +907,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
825
907
|
backup_recursion_state(workspace, *state_backup);
|
826
908
|
workspace.st = workspace.this_split_NA;
|
827
909
|
workspace.end = workspace.this_split_ix - 1;
|
910
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
911
|
+
if (workspace.is_binary_split) {
|
912
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
913
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
914
|
+
}
|
828
915
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
829
916
|
restore_recursion_state(workspace, *state_backup);
|
830
917
|
}
|
@@ -834,9 +921,9 @@ void recursive_split_numeric(Workspace &workspace,
|
|
834
921
|
/* right branch */
|
835
922
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
836
923
|
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
837
|
-
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col,
|
838
|
-
workspace.end,
|
839
|
-
|
924
|
+
workspace.has_outliers = define_numerical_cluster(workspace.target_numeric_col, workspace.ix_arr.data(), workspace.this_split_ix,
|
925
|
+
workspace.end, workspace.outlier_scores.data(), workspace.outlier_clusters.data(),
|
926
|
+
workspace.outlier_trees.data(), workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
840
927
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
841
928
|
workspace.log_transf, workspace.log_minval, workspace.exp_transf,
|
842
929
|
workspace.orig_mean, workspace.orig_sd,
|
@@ -849,6 +936,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
849
936
|
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
850
937
|
backup_recursion_state(workspace, *state_backup);
|
851
938
|
workspace.st = workspace.this_split_ix;
|
939
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
940
|
+
if (workspace.is_binary_split) {
|
941
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
942
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_ord);
|
943
|
+
}
|
852
944
|
recursive_split_numeric(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
853
945
|
restore_recursion_state(workspace, *state_backup);
|
854
946
|
}
|
@@ -858,6 +950,7 @@ void recursive_split_numeric(Workspace &workspace,
|
|
858
950
|
workspace.column_type_best = Ordinal;
|
859
951
|
workspace.col_best = col;
|
860
952
|
workspace.split_lev_best = workspace.this_split_lev;
|
953
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
861
954
|
}
|
862
955
|
|
863
956
|
}
|
@@ -867,7 +960,7 @@ void recursive_split_numeric(Workspace &workspace,
|
|
867
960
|
/* avoid unnecessary memory usage */
|
868
961
|
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
869
962
|
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
870
|
-
if ((*workspace.tree)[tree_from].all_branches.
|
963
|
+
if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
871
964
|
|
872
965
|
|
873
966
|
/* continue splitting further if meeting threshold criteria */
|
@@ -879,13 +972,13 @@ void recursive_split_numeric(Workspace &workspace,
|
|
879
972
|
|
880
973
|
/* discard outliers if any */
|
881
974
|
if (workspace.lev_has_outliers)
|
882
|
-
workspace.st = move_outliers_to_front(
|
975
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
883
976
|
|
884
977
|
/* assign rows to their corresponding branch */
|
885
978
|
switch(workspace.column_type_best) {
|
886
979
|
case Numeric:
|
887
980
|
{
|
888
|
-
divide_subset_split(
|
981
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
|
889
982
|
workspace.st, workspace.end, workspace.split_point_best,
|
890
983
|
(bool)(input_data.has_NA[workspace.col_best]),
|
891
984
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
@@ -896,28 +989,35 @@ void recursive_split_numeric(Workspace &workspace,
|
|
896
989
|
|
897
990
|
case Categorical:
|
898
991
|
{
|
899
|
-
divide_subset_split(
|
900
|
-
workspace.st, workspace.end,
|
992
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
993
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
|
901
994
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
902
995
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
903
996
|
spl1 = InSubset; spl2 = NotInSubset;
|
904
997
|
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
905
|
-
|
998
|
+
workspace.buffer_subset_categ_best.data(), workspace.col_best);
|
999
|
+
if (input_data.ncat[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1000
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
906
1001
|
break;
|
907
1002
|
}
|
908
1003
|
|
909
1004
|
case Ordinal:
|
910
1005
|
{
|
911
|
-
divide_subset_split(
|
1006
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
912
1007
|
workspace.st, workspace.end, workspace.split_lev_best,
|
913
1008
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
914
1009
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
915
1010
|
spl1 = LessOrEqual; spl2 = Greater;
|
916
1011
|
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
1012
|
+
if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1013
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
|
917
1014
|
break;
|
918
1015
|
}
|
919
1016
|
|
920
|
-
|
1017
|
+
default:
|
1018
|
+
{
|
1019
|
+
unexpected_error();
|
1020
|
+
}
|
921
1021
|
}
|
922
1022
|
|
923
1023
|
/* continue splitting recursively - need to remember from where */
|
@@ -956,11 +1056,12 @@ void recursive_split_numeric(Workspace &workspace,
|
|
956
1056
|
}
|
957
1057
|
|
958
1058
|
/* if tree has no clusters and no subtrees, disconnect it from parent and then drop */
|
1059
|
+
abandon_column:
|
959
1060
|
if (check_tree_is_not_needed((*workspace.tree)[tree_from])) {
|
960
1061
|
|
961
1062
|
if (tree_from == 0) {
|
962
1063
|
workspace.tree->clear();
|
963
|
-
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.
|
1064
|
+
} else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
|
964
1065
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
965
1066
|
workspace.tree->pop_back();
|
966
1067
|
} else {
|
@@ -995,6 +1096,11 @@ void recursive_split_numeric(Workspace &workspace,
|
|
995
1096
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].tree_right = 0;
|
996
1097
|
break;
|
997
1098
|
}
|
1099
|
+
|
1100
|
+
default:
|
1101
|
+
{
|
1102
|
+
unexpected_error();
|
1103
|
+
}
|
998
1104
|
}
|
999
1105
|
workspace.tree->pop_back();
|
1000
1106
|
}
|
@@ -1010,6 +1116,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1010
1116
|
ModelParams &model_params,
|
1011
1117
|
ModelOutputs &model_outputs)
|
1012
1118
|
{
|
1119
|
+
if (interrupt_switch) return;
|
1120
|
+
|
1013
1121
|
if (model_params.max_depth <= 0) return;
|
1014
1122
|
|
1015
1123
|
/* extract necesary info from column and discard NAs */
|
@@ -1024,10 +1132,10 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1024
1132
|
}
|
1025
1133
|
workspace.untransf_target_col = workspace.target_categ_col;
|
1026
1134
|
workspace.end = input_data.nrows - 1;
|
1027
|
-
workspace.st = move_NAs_to_front(
|
1135
|
+
workspace.st = move_NAs_to_front(workspace.ix_arr.data(), workspace.target_categ_col, 0, workspace.end);
|
1028
1136
|
workspace.col_has_outliers = false;
|
1029
1137
|
workspace.col_is_bin = workspace.ncat_this <= 2;
|
1030
|
-
workspace.prop_small_this =
|
1138
|
+
workspace.prop_small_this = model_params.prop_small.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num];
|
1031
1139
|
workspace.prior_prob = &model_outputs.prop_categ[ model_outputs.start_ix_cat_counts[workspace.target_col_num] ];
|
1032
1140
|
|
1033
1141
|
/* create cluster root and reset outlier scores for this column */
|
@@ -1038,20 +1146,20 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1038
1146
|
|
1039
1147
|
|
1040
1148
|
/* at first, see if there's a category with 1-2 observations among only categories with large counts */
|
1041
|
-
workspace.col_has_outliers = find_outlier_categories_no_cond(
|
1149
|
+
workspace.col_has_outliers = find_outlier_categories_no_cond(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1042
1150
|
workspace.ncat_this, workspace.end - workspace.st + 1,
|
1043
|
-
|
1151
|
+
workspace.buffer_subset_categ.data(), &(workspace.orig_mean));
|
1044
1152
|
|
1045
1153
|
/* if there is any such case, create a cluster for them */
|
1046
1154
|
if (workspace.col_has_outliers) {
|
1047
1155
|
workspace.tree->back().clusters.push_back(0);
|
1048
1156
|
workspace.clusters->emplace_back(NoType, Root);
|
1049
|
-
define_categ_cluster_no_cond(workspace.untransf_target_col,
|
1050
|
-
|
1051
|
-
|
1052
|
-
|
1053
|
-
|
1054
|
-
workspace.st = move_outliers_to_front(
|
1157
|
+
define_categ_cluster_no_cond(workspace.untransf_target_col, workspace.ix_arr.data(), workspace.st, workspace.end, workspace.ncat_this,
|
1158
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1159
|
+
workspace.outlier_depth.data(), workspace.clusters->back(),
|
1160
|
+
input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1161
|
+
workspace.buffer_subset_categ.data(), workspace.orig_mean);
|
1162
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
1055
1163
|
}
|
1056
1164
|
|
1057
1165
|
/* if no conditional outliers are required, stop there */
|
@@ -1069,6 +1177,8 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1069
1177
|
}
|
1070
1178
|
if (should_skip) return;
|
1071
1179
|
|
1180
|
+
workspace.exhausted_col_tracker.initialize(input_data.tot_cols, model_params.max_depth);
|
1181
|
+
|
1072
1182
|
|
1073
1183
|
/* if the column is already binary, or if using multiple categories, or if there are no more categorical columns, split the data as-is */
|
1074
1184
|
if (
|
@@ -1080,7 +1190,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1080
1190
|
{
|
1081
1191
|
|
1082
1192
|
/* calculate base information */
|
1083
|
-
workspace.base_info = total_info(
|
1193
|
+
workspace.base_info = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1084
1194
|
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1085
1195
|
workspace.base_info_orig = workspace.base_info;
|
1086
1196
|
|
@@ -1096,7 +1206,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1096
1206
|
size_t cat_counts_bin[2];
|
1097
1207
|
workspace.col_is_bin = true;
|
1098
1208
|
workspace.already_split_main = false;
|
1099
|
-
workspace.base_info_orig = total_info(
|
1209
|
+
workspace.base_info_orig = total_info(input_data.cat_counts.data() + model_outputs.start_ix_cat_counts[workspace.target_col_num],
|
1100
1210
|
workspace.ncat_this, workspace.end - workspace.st + 1);
|
1101
1211
|
workspace.tree->back().column_type = NoType;
|
1102
1212
|
|
@@ -1130,7 +1240,7 @@ void process_categ_col(std::vector<Cluster> &cluster_root,
|
|
1130
1240
|
}
|
1131
1241
|
|
1132
1242
|
if (cat_counts_bin[0] > 0 && cat_counts_bin[1] > 0) {
|
1133
|
-
workspace.target_categ_col =
|
1243
|
+
workspace.target_categ_col = workspace.buffer_bin_y.data();
|
1134
1244
|
workspace.base_info = total_info(cat_counts_bin, 2, workspace.end - workspace.st + 1);
|
1135
1245
|
(*workspace.tree)[0].binary_branches.push_back(workspace.tree->size());
|
1136
1246
|
workspace.tree->emplace_back(0, SubTrees);
|
@@ -1150,6 +1260,8 @@ void recursive_split_categ(Workspace &workspace,
|
|
1150
1260
|
ModelParams &model_params,
|
1151
1261
|
size_t curr_depth, bool is_NA_branch)
|
1152
1262
|
{
|
1263
|
+
if (interrupt_switch) return;
|
1264
|
+
|
1153
1265
|
/* idea is the same as its numeric counterpart, only splitting by another categorical
|
1154
1266
|
is less clear how to do and offers different options */
|
1155
1267
|
workspace.best_gain = -HUGE_VAL;
|
@@ -1158,14 +1270,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1158
1270
|
size_t ix1, ix2, ix3;
|
1159
1271
|
SplitType spl1, spl2;
|
1160
1272
|
size_t tree_from = workspace.tree->size() - 1;
|
1273
|
+
ExhaustedColumnsLevel level_col_tracker;
|
1161
1274
|
|
1162
1275
|
/* when using 'follow_all' need to keep track of a lot more things */
|
1163
1276
|
std::unique_ptr<RecursionState> state_backup;
|
1164
1277
|
if (model_params.follow_all) state_backup = std::unique_ptr<RecursionState>(new RecursionState);
|
1165
1278
|
|
1166
1279
|
if (curr_depth > 0) {
|
1167
|
-
workspace.base_info_orig = total_info(
|
1168
|
-
workspace.ncat_this,
|
1280
|
+
workspace.base_info_orig = total_info(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st, workspace.end,
|
1281
|
+
workspace.ncat_this, workspace.buffer_cat_cnt.data());
|
1169
1282
|
|
1170
1283
|
/* check that there's still more than 1 category */
|
1171
1284
|
size_t ncat_present = 0;
|
@@ -1175,14 +1288,16 @@ void recursive_split_categ(Workspace &workspace,
|
|
1175
1288
|
}
|
1176
1289
|
if (ncat_present < 2) goto drop_if_not_needed;
|
1177
1290
|
if (workspace.col_is_bin && workspace.ncat_this > 2) {
|
1178
|
-
workspace.base_info = total_info(
|
1179
|
-
2,
|
1291
|
+
workspace.base_info = total_info(workspace.ix_arr.data(), workspace.target_categ_col, workspace.st, workspace.end,
|
1292
|
+
2, workspace.buffer_cat_cnt.data());
|
1180
1293
|
if (workspace.buffer_cat_cnt[0] < model_params.min_size_categ || workspace.buffer_cat_cnt[1] == model_params.min_size_categ) goto drop_if_not_needed;
|
1181
1294
|
} else {
|
1182
1295
|
workspace.base_info = workspace.base_info_orig;
|
1183
1296
|
}
|
1184
1297
|
}
|
1185
1298
|
|
1299
|
+
level_col_tracker.initialize(&workspace.exhausted_col_tracker);
|
1300
|
+
|
1186
1301
|
/* split with each other column */
|
1187
1302
|
|
1188
1303
|
|
@@ -1191,11 +1306,17 @@ void recursive_split_categ(Workspace &workspace,
|
|
1191
1306
|
|
1192
1307
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1193
1308
|
if (input_data.skip_col[col]) continue;
|
1194
|
-
|
1309
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col]) continue;
|
1310
|
+
split_numericx_categy(workspace.ix_arr.data(), workspace.st, workspace.end, input_data.numeric_data + col * input_data.nrows,
|
1195
1311
|
workspace.untransf_target_col, workspace.ncat_this, workspace.base_info_orig,
|
1196
|
-
|
1197
|
-
model_params.take_mid, &
|
1198
|
-
&
|
1312
|
+
workspace.buffer_cat_cnt.data(), (bool)(input_data.has_NA[col]), model_params.min_size_categ,
|
1313
|
+
model_params.take_mid, &workspace.this_gain, &workspace.this_split_point,
|
1314
|
+
&workspace.this_split_ix, &workspace.this_split_NA, &workspace.has_zero_variance);
|
1315
|
+
if (workspace.has_zero_variance) {
|
1316
|
+
workspace.has_zero_variance = false;
|
1317
|
+
workspace.exhausted_col_tracker.push_col(col);
|
1318
|
+
continue;
|
1319
|
+
}
|
1199
1320
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1200
1321
|
|
1201
1322
|
if (workspace.this_gain >= model_params.min_gain) {
|
@@ -1207,15 +1328,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1207
1328
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1208
1329
|
workspace.clusters->emplace_back(Numeric, col, IsNa, -HUGE_VAL, true);
|
1209
1330
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1210
|
-
|
1331
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1211
1332
|
workspace.ncat_this, model_params.categ_from_maj,
|
1212
|
-
|
1213
|
-
|
1333
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1334
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1214
1335
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1215
1336
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1216
1337
|
workspace.prop_small_this, workspace.prior_prob,
|
1217
|
-
|
1218
|
-
|
1338
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1339
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1219
1340
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1220
1341
|
if (workspace.drop_cluster) {
|
1221
1342
|
workspace.clusters->pop_back();
|
@@ -1227,6 +1348,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1227
1348
|
workspace.tree->emplace_back(tree_from, col, HUGE_VAL, IsNa);
|
1228
1349
|
backup_recursion_state(workspace, *state_backup);
|
1229
1350
|
workspace.end = workspace.this_split_NA - 1;
|
1351
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1352
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1353
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1230
1354
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1231
1355
|
restore_recursion_state(workspace, *state_backup);
|
1232
1356
|
}
|
@@ -1237,15 +1361,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1237
1361
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1238
1362
|
workspace.clusters->emplace_back(Numeric, col, LessOrEqual, workspace.this_split_point, is_NA_branch);
|
1239
1363
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1240
|
-
|
1364
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix,
|
1241
1365
|
workspace.ncat_this, model_params.categ_from_maj,
|
1242
|
-
|
1243
|
-
|
1366
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1367
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1244
1368
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1245
1369
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1246
1370
|
workspace.prop_small_this, workspace.prior_prob,
|
1247
|
-
|
1248
|
-
|
1371
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1372
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1249
1373
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1250
1374
|
if (workspace.drop_cluster) {
|
1251
1375
|
workspace.clusters->pop_back();
|
@@ -1267,15 +1391,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1267
1391
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1268
1392
|
workspace.clusters->emplace_back(Numeric, col, Greater, workspace.this_split_point, is_NA_branch);
|
1269
1393
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1270
|
-
|
1394
|
+
workspace.ix_arr.data(), workspace.this_split_ix + 1, workspace.end,
|
1271
1395
|
workspace.ncat_this, model_params.categ_from_maj,
|
1272
|
-
|
1273
|
-
|
1396
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1397
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1274
1398
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1275
1399
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1276
1400
|
workspace.prop_small_this, workspace.prior_prob,
|
1277
|
-
|
1278
|
-
|
1401
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1402
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1279
1403
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1280
1404
|
if (workspace.drop_cluster) {
|
1281
1405
|
workspace.clusters->pop_back();
|
@@ -1313,15 +1437,23 @@ void recursive_split_categ(Workspace &workspace,
|
|
1313
1437
|
|
1314
1438
|
if (col == workspace.target_col_num && !workspace.target_col_is_ord) continue;
|
1315
1439
|
if (input_data.skip_col[col + input_data.ncols_numeric]) continue;
|
1440
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric]) continue;
|
1441
|
+
workspace.is_binary_split = false;
|
1316
1442
|
|
1317
1443
|
if (workspace.col_is_bin) {
|
1318
1444
|
|
1319
|
-
split_categx_biny(
|
1445
|
+
split_categx_biny(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1320
1446
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1321
|
-
input_data.ncat[col], workspace.base_info,
|
1322
|
-
|
1447
|
+
input_data.ncat[col], workspace.base_info, workspace.buffer_cat_cnt.data(),
|
1448
|
+
workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
|
1323
1449
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1324
|
-
&(workspace.this_gain),
|
1450
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(),
|
1451
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1452
|
+
if (workspace.has_zero_variance) {
|
1453
|
+
workspace.has_zero_variance = false;
|
1454
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1455
|
+
continue;
|
1456
|
+
}
|
1325
1457
|
|
1326
1458
|
/* If it was forcibly binarized, need to calculate the gain on the original categories to make it comparable */
|
1327
1459
|
if (
|
@@ -1332,29 +1464,35 @@ void recursive_split_categ(Workspace &workspace,
|
|
1332
1464
|
)
|
1333
1465
|
)
|
1334
1466
|
{
|
1335
|
-
divide_subset_split(
|
1336
|
-
workspace.st, workspace.end,
|
1467
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows,
|
1468
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ.data(), input_data.ncat[col],
|
1337
1469
|
(bool)input_data.has_NA[col + input_data.ncols_numeric],
|
1338
1470
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1339
|
-
workspace.this_gain = categ_gain_from_split(
|
1471
|
+
workspace.this_gain = categ_gain_from_split(workspace.ix_arr.data(), workspace.untransf_target_col, workspace.st,
|
1340
1472
|
workspace.this_split_NA, workspace.this_split_ix, workspace.end,
|
1341
|
-
workspace.ncat_this,
|
1473
|
+
workspace.ncat_this, workspace.buffer_cat_cnt.data(), workspace.base_info_orig);
|
1342
1474
|
}
|
1343
1475
|
|
1344
1476
|
} else {
|
1345
1477
|
|
1346
1478
|
if (model_params.cat_bruteforce_subset && input_data.ncat[col] > 2) {
|
1347
|
-
split_categx_categy_subset(
|
1479
|
+
split_categx_categy_subset(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1348
1480
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1349
1481
|
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1350
|
-
|
1482
|
+
workspace.buffer_cat_sorted.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_cnt.data(),
|
1351
1483
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]), model_params.min_size_categ,
|
1352
|
-
&(workspace.this_gain),
|
1484
|
+
&(workspace.this_gain), workspace.buffer_subset_categ.data(),
|
1485
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1486
|
+
if (workspace.has_zero_variance) {
|
1487
|
+
workspace.has_zero_variance = false;
|
1488
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1489
|
+
continue;
|
1490
|
+
}
|
1353
1491
|
} else {
|
1354
|
-
split_categx_categy_separate(
|
1492
|
+
split_categx_categy_separate(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1355
1493
|
input_data.categorical_data + col * input_data.nrows, workspace.target_categ_col,
|
1356
1494
|
input_data.ncat[col], workspace.ncat_this, workspace.base_info_orig,
|
1357
|
-
|
1495
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(),
|
1358
1496
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric]),
|
1359
1497
|
model_params.min_size_categ, &(workspace.this_gain));
|
1360
1498
|
}
|
@@ -1365,21 +1503,21 @@ void recursive_split_categ(Workspace &workspace,
|
|
1365
1503
|
if (workspace.this_gain >= model_params.min_gain) {
|
1366
1504
|
|
1367
1505
|
/* NA branch */
|
1368
|
-
workspace.this_split_NA = move_NAs_to_front(
|
1506
|
+
workspace.this_split_NA = move_NAs_to_front(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.st, workspace.end);
|
1369
1507
|
if ((workspace.this_split_NA - workspace.st) > model_params.min_size_categ) {
|
1370
1508
|
|
1371
1509
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1372
|
-
workspace.clusters->emplace_back(Categorical, col, IsNa, (char*)NULL, (int)0, true);
|
1510
|
+
workspace.clusters->emplace_back(Categorical, col, IsNa, (signed char*)NULL, (int)0, true);
|
1373
1511
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1374
|
-
|
1512
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1375
1513
|
workspace.ncat_this, model_params.categ_from_maj,
|
1376
|
-
|
1377
|
-
|
1514
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1515
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1378
1516
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1379
1517
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1380
1518
|
workspace.prop_small_this, workspace.prior_prob,
|
1381
|
-
|
1382
|
-
|
1519
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1520
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1383
1521
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1384
1522
|
if (workspace.drop_cluster) {
|
1385
1523
|
workspace.clusters->pop_back();
|
@@ -1388,9 +1526,12 @@ void recursive_split_categ(Workspace &workspace,
|
|
1388
1526
|
|
1389
1527
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1390
1528
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1391
|
-
workspace.tree->emplace_back(tree_from, col, IsNa, (char*)NULL, 0);
|
1529
|
+
workspace.tree->emplace_back(tree_from, col, IsNa, (signed char*)NULL, 0);
|
1392
1530
|
backup_recursion_state(workspace, *state_backup);
|
1393
1531
|
workspace.end = workspace.this_split_NA - 1;
|
1532
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1533
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1534
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1394
1535
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1395
1536
|
restore_recursion_state(workspace, *state_backup);
|
1396
1537
|
}
|
@@ -1401,7 +1542,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1401
1542
|
|
1402
1543
|
/* sort by the splitting variable and iterate over to determine the split points */
|
1403
1544
|
workspace.temp_ptr_x = input_data.categorical_data + col * input_data.nrows;
|
1404
|
-
std::sort(
|
1545
|
+
std::sort(workspace.ix_arr.data() + workspace.this_split_NA, workspace.ix_arr.data() + workspace.end + 1,
|
1405
1546
|
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1406
1547
|
workspace.this_split_ix = workspace.this_split_NA;
|
1407
1548
|
|
@@ -1416,15 +1557,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1416
1557
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1417
1558
|
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[row-1]], input_data.ncat[col], is_NA_branch);
|
1418
1559
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1419
|
-
|
1560
|
+
workspace.ix_arr.data(), workspace.this_split_ix, row - 1,
|
1420
1561
|
workspace.ncat_this, model_params.categ_from_maj,
|
1421
|
-
|
1422
|
-
|
1562
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1563
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1423
1564
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1424
1565
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1425
1566
|
workspace.prop_small_this, workspace.prior_prob,
|
1426
|
-
|
1427
|
-
|
1567
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1568
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1428
1569
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1429
1570
|
if (workspace.drop_cluster) {
|
1430
1571
|
workspace.clusters->pop_back();
|
@@ -1436,6 +1577,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1436
1577
|
backup_recursion_state(workspace, *state_backup);
|
1437
1578
|
workspace.st = workspace.this_split_ix;
|
1438
1579
|
workspace.end = row - 1;
|
1580
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1581
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1582
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1439
1583
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1440
1584
|
restore_recursion_state(workspace, *state_backup);
|
1441
1585
|
}
|
@@ -1448,15 +1592,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1448
1592
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1449
1593
|
workspace.clusters->emplace_back(col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]], input_data.ncat[col], is_NA_branch);
|
1450
1594
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1451
|
-
|
1595
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1452
1596
|
workspace.ncat_this, model_params.categ_from_maj,
|
1453
|
-
|
1454
|
-
|
1597
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1598
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1455
1599
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1456
1600
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1457
1601
|
workspace.prop_small_this, workspace.prior_prob,
|
1458
|
-
|
1459
|
-
|
1602
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1603
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1460
1604
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1461
1605
|
if (workspace.drop_cluster) {
|
1462
1606
|
workspace.clusters->pop_back();
|
@@ -1467,6 +1611,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1467
1611
|
workspace.tree->emplace_back(tree_from, col, workspace.temp_ptr_x[workspace.ix_arr[workspace.end]]);
|
1468
1612
|
backup_recursion_state(workspace, *state_backup);
|
1469
1613
|
workspace.st = workspace.this_split_ix;
|
1614
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1615
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1616
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1470
1617
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1471
1618
|
restore_recursion_state(workspace, *state_backup);
|
1472
1619
|
}
|
@@ -1477,6 +1624,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1477
1624
|
workspace.best_gain = workspace.this_gain;
|
1478
1625
|
workspace.column_type_best = Categorical;
|
1479
1626
|
workspace.col_best = col;
|
1627
|
+
workspace.best_cat_split_is_binary = false;
|
1480
1628
|
}
|
1481
1629
|
|
1482
1630
|
|
@@ -1488,7 +1636,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1488
1636
|
|
1489
1637
|
workspace.buffer_subset_categ[0] = 1;
|
1490
1638
|
workspace.buffer_subset_categ[1] = 0;
|
1491
|
-
divide_subset_split(
|
1639
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1492
1640
|
(int)0, false, &(workspace.this_split_NA), &(workspace.this_split_ix));
|
1493
1641
|
if (
|
1494
1642
|
(workspace.end - workspace.this_split_ix) < model_params.min_size_categ ||
|
@@ -1497,24 +1645,24 @@ void recursive_split_categ(Workspace &workspace,
|
|
1497
1645
|
|
1498
1646
|
} else {
|
1499
1647
|
|
1500
|
-
divide_subset_split(
|
1501
|
-
|
1648
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + col * input_data.nrows, workspace.this_split_NA, workspace.end,
|
1649
|
+
workspace.buffer_subset_categ.data(), input_data.ncat[col], false,
|
1502
1650
|
&(workspace.this_split_NA), &(workspace.this_split_ix));
|
1503
1651
|
}
|
1504
1652
|
|
1505
1653
|
/* left branch */
|
1506
1654
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1507
|
-
workspace.clusters->emplace_back(Categorical, col, InSubset,
|
1655
|
+
workspace.clusters->emplace_back(Categorical, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
1508
1656
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1509
|
-
|
1657
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
|
1510
1658
|
workspace.ncat_this, model_params.categ_from_maj,
|
1511
|
-
|
1512
|
-
|
1659
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1660
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1513
1661
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1514
1662
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1515
1663
|
workspace.prop_small_this, workspace.prior_prob,
|
1516
|
-
|
1517
|
-
|
1664
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1665
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1518
1666
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1519
1667
|
if (workspace.drop_cluster) {
|
1520
1668
|
workspace.clusters->pop_back();
|
@@ -1523,27 +1671,32 @@ void recursive_split_categ(Workspace &workspace,
|
|
1523
1671
|
|
1524
1672
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1525
1673
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1526
|
-
workspace.tree->emplace_back(tree_from, col, InSubset,
|
1674
|
+
workspace.tree->emplace_back(tree_from, col, InSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
1527
1675
|
backup_recursion_state(workspace, *state_backup);
|
1528
1676
|
workspace.st = workspace.this_split_NA;
|
1529
1677
|
workspace.end = workspace.this_split_ix - 1;
|
1678
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1679
|
+
if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
|
1680
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1681
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1682
|
+
}
|
1530
1683
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1531
1684
|
restore_recursion_state(workspace, *state_backup);
|
1532
1685
|
}
|
1533
1686
|
|
1534
1687
|
/* right branch */
|
1535
1688
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1536
|
-
workspace.clusters->emplace_back(Categorical, col, NotInSubset,
|
1689
|
+
workspace.clusters->emplace_back(Categorical, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col], is_NA_branch);
|
1537
1690
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1538
|
-
|
1691
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1539
1692
|
workspace.ncat_this, model_params.categ_from_maj,
|
1540
|
-
|
1541
|
-
|
1693
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1694
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1542
1695
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1543
1696
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1544
1697
|
workspace.prop_small_this, workspace.prior_prob,
|
1545
|
-
|
1546
|
-
|
1698
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1699
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1547
1700
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1548
1701
|
if (workspace.drop_cluster) {
|
1549
1702
|
workspace.clusters->pop_back();
|
@@ -1552,9 +1705,14 @@ void recursive_split_categ(Workspace &workspace,
|
|
1552
1705
|
|
1553
1706
|
if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
|
1554
1707
|
(*workspace.tree)[tree_from].all_branches.push_back(workspace.tree->size());
|
1555
|
-
workspace.tree->emplace_back(tree_from, col, NotInSubset,
|
1708
|
+
workspace.tree->emplace_back(tree_from, col, NotInSubset, workspace.buffer_subset_categ.data(), input_data.ncat[col]);
|
1556
1709
|
backup_recursion_state(workspace, *state_backup);
|
1557
1710
|
workspace.st = workspace.this_split_ix;
|
1711
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1712
|
+
if (input_data.ncat[col] == 2 || workspace.is_binary_split) {
|
1713
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1714
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric);
|
1715
|
+
}
|
1558
1716
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1559
1717
|
restore_recursion_state(workspace, *state_backup);
|
1560
1718
|
}
|
@@ -1563,7 +1721,8 @@ void recursive_split_categ(Workspace &workspace,
|
|
1563
1721
|
workspace.best_gain = workspace.this_gain;
|
1564
1722
|
workspace.column_type_best = Categorical;
|
1565
1723
|
workspace.col_best = col;
|
1566
|
-
|
1724
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
1725
|
+
memcpy(workspace.buffer_subset_categ_best.data(), workspace.buffer_subset_categ.data(), input_data.ncat[col] * sizeof(signed char));
|
1567
1726
|
}
|
1568
1727
|
|
1569
1728
|
}
|
@@ -1579,18 +1738,25 @@ void recursive_split_categ(Workspace &workspace,
|
|
1579
1738
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && workspace.already_split_main) break;
|
1580
1739
|
if (input_data.skip_col[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
1581
1740
|
if (workspace.target_col_is_ord && col == (workspace.target_col_num - input_data.ncols_categ)) continue;
|
1741
|
+
if (workspace.exhausted_col_tracker.is_exhausted[col + input_data.ncols_numeric + input_data.ncols_categ]) continue;
|
1582
1742
|
|
1583
|
-
split_ordx_categy(
|
1743
|
+
split_ordx_categy(workspace.ix_arr.data(), workspace.st, workspace.end,
|
1584
1744
|
input_data.ordinal_data + col * input_data.nrows, workspace.untransf_target_col,
|
1585
1745
|
input_data.ncat_ord[col], workspace.ncat_this,
|
1586
|
-
workspace.base_info_orig,
|
1746
|
+
workspace.base_info_orig, workspace.buffer_cat_cnt.data(), workspace.buffer_crosstab.data(), workspace.buffer_cat_sorted.data(),
|
1587
1747
|
(bool)(input_data.has_NA[col + input_data.ncols_numeric + input_data.ncols_categ]),
|
1588
|
-
model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev)
|
1748
|
+
model_params.min_size_categ, &(workspace.this_gain), &(workspace.this_split_lev),
|
1749
|
+
&workspace.has_zero_variance, &workspace.is_binary_split);
|
1750
|
+
if (workspace.has_zero_variance) {
|
1751
|
+
workspace.has_zero_variance = false;
|
1752
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1753
|
+
continue;
|
1754
|
+
}
|
1589
1755
|
if (model_params.gain_as_pct) workspace.this_gain /= workspace.base_info_orig;
|
1590
1756
|
|
1591
1757
|
if (workspace.this_gain >= model_params.min_gain) {
|
1592
1758
|
|
1593
|
-
divide_subset_split(
|
1759
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + col * input_data.nrows, workspace.st, workspace.end,
|
1594
1760
|
workspace.this_split_lev, (bool)(workspace.buffer_cat_cnt[ input_data.ncat_ord[col] ] > 0),
|
1595
1761
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1596
1762
|
|
@@ -1600,15 +1766,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1600
1766
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1601
1767
|
workspace.clusters->emplace_back(Ordinal, col, IsNa, (int)0, true);
|
1602
1768
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1603
|
-
|
1769
|
+
workspace.ix_arr.data(), workspace.st, workspace.this_split_NA - 1,
|
1604
1770
|
workspace.ncat_this, model_params.categ_from_maj,
|
1605
|
-
|
1606
|
-
|
1771
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1772
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1607
1773
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1608
1774
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1609
1775
|
workspace.prop_small_this, workspace.prior_prob,
|
1610
|
-
|
1611
|
-
|
1776
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1777
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1612
1778
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1613
1779
|
if (workspace.drop_cluster) {
|
1614
1780
|
workspace.clusters->pop_back();
|
@@ -1620,6 +1786,9 @@ void recursive_split_categ(Workspace &workspace,
|
|
1620
1786
|
workspace.tree->emplace_back(tree_from, col, (int)-1, IsNa);
|
1621
1787
|
backup_recursion_state(workspace, *state_backup);
|
1622
1788
|
workspace.end = workspace.this_split_NA - 1;
|
1789
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1790
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1791
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1623
1792
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, true);
|
1624
1793
|
restore_recursion_state(workspace, *state_backup);
|
1625
1794
|
}
|
@@ -1630,15 +1799,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1630
1799
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1631
1800
|
workspace.clusters->emplace_back(Ordinal, col, LessOrEqual, workspace.this_split_lev, is_NA_branch);
|
1632
1801
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1633
|
-
|
1802
|
+
workspace.ix_arr.data(), workspace.this_split_NA, workspace.this_split_ix - 1,
|
1634
1803
|
workspace.ncat_this, model_params.categ_from_maj,
|
1635
|
-
|
1636
|
-
|
1804
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1805
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1637
1806
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1638
1807
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1639
1808
|
workspace.prop_small_this, workspace.prior_prob,
|
1640
|
-
|
1641
|
-
|
1809
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1810
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1642
1811
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1643
1812
|
if (workspace.drop_cluster) {
|
1644
1813
|
workspace.clusters->pop_back();
|
@@ -1651,6 +1820,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1651
1820
|
backup_recursion_state(workspace, *state_backup);
|
1652
1821
|
workspace.st = workspace.this_split_NA;
|
1653
1822
|
workspace.end = workspace.this_split_ix - 1;
|
1823
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1824
|
+
if (workspace.is_binary_split) {
|
1825
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1826
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1827
|
+
}
|
1654
1828
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1655
1829
|
restore_recursion_state(workspace, *state_backup);
|
1656
1830
|
}
|
@@ -1659,15 +1833,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1659
1833
|
(*workspace.tree)[tree_from].clusters.push_back(workspace.clusters->size());
|
1660
1834
|
workspace.clusters->emplace_back(Ordinal, col, Greater, workspace.this_split_lev, is_NA_branch);
|
1661
1835
|
workspace.has_outliers = define_categ_cluster(workspace.untransf_target_col,
|
1662
|
-
|
1836
|
+
workspace.ix_arr.data(), workspace.this_split_ix, workspace.end,
|
1663
1837
|
workspace.ncat_this, model_params.categ_from_maj,
|
1664
|
-
|
1665
|
-
|
1838
|
+
workspace.outlier_scores.data(), workspace.outlier_clusters.data(), workspace.outlier_trees.data(),
|
1839
|
+
workspace.outlier_depth.data(), workspace.clusters->back(), *(workspace.clusters),
|
1666
1840
|
workspace.clusters->size() - 1, tree_from, curr_depth + 1,
|
1667
1841
|
model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
|
1668
1842
|
workspace.prop_small_this, workspace.prior_prob,
|
1669
|
-
|
1670
|
-
|
1843
|
+
workspace.buffer_cat_cnt.data(), workspace.buffer_cat_sum.data(),
|
1844
|
+
workspace.buffer_crosstab.data(), workspace.buffer_subset_outlier.data(), &(workspace.drop_cluster));
|
1671
1845
|
workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
|
1672
1846
|
if (workspace.drop_cluster) {
|
1673
1847
|
workspace.clusters->pop_back();
|
@@ -1679,6 +1853,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1679
1853
|
workspace.tree->emplace_back(tree_from, col, workspace.this_split_lev, Greater);
|
1680
1854
|
backup_recursion_state(workspace, *state_backup);
|
1681
1855
|
workspace.st = workspace.this_split_ix;
|
1856
|
+
ExhaustedColumnsLevel level_col_tracker2;
|
1857
|
+
if (workspace.is_binary_split) {
|
1858
|
+
level_col_tracker2.initialize(&workspace.exhausted_col_tracker);
|
1859
|
+
workspace.exhausted_col_tracker.push_col(col + input_data.ncols_numeric + input_data.ncols_categ);
|
1860
|
+
}
|
1682
1861
|
recursive_split_categ(workspace, input_data, model_params, curr_depth + 1, is_NA_branch);
|
1683
1862
|
restore_recursion_state(workspace, *state_backup);
|
1684
1863
|
}
|
@@ -1689,6 +1868,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1689
1868
|
workspace.column_type_best = Ordinal;
|
1690
1869
|
workspace.col_best = col;
|
1691
1870
|
workspace.split_lev_best = workspace.this_split_lev;
|
1871
|
+
workspace.best_cat_split_is_binary = workspace.is_binary_split;
|
1692
1872
|
}
|
1693
1873
|
|
1694
1874
|
}
|
@@ -1699,7 +1879,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1699
1879
|
/* avoid unnecessary memory usage or repeats */
|
1700
1880
|
workspace.col_has_outliers = workspace.lev_has_outliers? true : workspace.col_has_outliers;
|
1701
1881
|
(*workspace.tree)[tree_from].clusters.shrink_to_fit();
|
1702
|
-
if ((*workspace.tree)[tree_from].all_branches.
|
1882
|
+
if (!(*workspace.tree)[tree_from].all_branches.empty()) (*workspace.tree)[tree_from].all_branches.shrink_to_fit();
|
1703
1883
|
if (curr_depth == 0 && workspace.col_is_bin && workspace.ncat_this > 2 && !workspace.already_split_main)
|
1704
1884
|
workspace.already_split_main = true;
|
1705
1885
|
|
@@ -1712,13 +1892,13 @@ void recursive_split_categ(Workspace &workspace,
|
|
1712
1892
|
|
1713
1893
|
/* discard outliers if any */
|
1714
1894
|
if (workspace.lev_has_outliers)
|
1715
|
-
workspace.st = move_outliers_to_front(
|
1895
|
+
workspace.st = move_outliers_to_front(workspace.ix_arr.data(), workspace.outlier_scores.data(), workspace.st, workspace.end);
|
1716
1896
|
|
1717
1897
|
/* assign rows to their corresponding branch */
|
1718
1898
|
switch(workspace.column_type_best) {
|
1719
1899
|
case Numeric:
|
1720
1900
|
{
|
1721
|
-
divide_subset_split(
|
1901
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.numeric_data + workspace.col_best * input_data.nrows,
|
1722
1902
|
workspace.st, workspace.end, workspace.split_point_best,
|
1723
1903
|
(bool)(input_data.has_NA[workspace.col_best]),
|
1724
1904
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
@@ -1729,12 +1909,14 @@ void recursive_split_categ(Workspace &workspace,
|
|
1729
1909
|
|
1730
1910
|
case Ordinal:
|
1731
1911
|
{
|
1732
|
-
divide_subset_split(
|
1912
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.ordinal_data + workspace.col_best * input_data.nrows,
|
1733
1913
|
workspace.st, workspace.end, workspace.split_lev_best,
|
1734
1914
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ]),
|
1735
1915
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1736
1916
|
spl1 = LessOrEqual; spl2 = Greater;
|
1737
1917
|
set_tree_as_ordinal(workspace.tree->back(), workspace.split_lev_best, workspace.col_best);
|
1918
|
+
if (input_data.ncat_ord[workspace.col_best] == 2 || workspace.best_cat_split_is_binary)
|
1919
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric + input_data.ncols_categ);
|
1738
1920
|
break;
|
1739
1921
|
}
|
1740
1922
|
|
@@ -1743,27 +1925,30 @@ void recursive_split_categ(Workspace &workspace,
|
|
1743
1925
|
|
1744
1926
|
if (input_data.ncat[workspace.col_best] == 2) {
|
1745
1927
|
|
1746
|
-
divide_subset_split(
|
1928
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1747
1929
|
workspace.st, workspace.end, (int)0,
|
1748
1930
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1749
1931
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1750
1932
|
spl1 = InSubset; spl2 = NotInSubset;
|
1751
1933
|
set_tree_as_categorical(workspace.tree->back(), workspace.col_best);
|
1934
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1752
1935
|
|
1753
1936
|
} else if (workspace.col_is_bin || model_params.cat_bruteforce_subset) {
|
1754
1937
|
|
1755
|
-
divide_subset_split(
|
1756
|
-
workspace.st, workspace.end,
|
1938
|
+
divide_subset_split(workspace.ix_arr.data(), input_data.categorical_data + workspace.col_best * input_data.nrows,
|
1939
|
+
workspace.st, workspace.end, workspace.buffer_subset_categ_best.data(), input_data.ncat[workspace.col_best],
|
1757
1940
|
(bool)(input_data.has_NA[workspace.col_best + input_data.ncols_numeric]),
|
1758
1941
|
&(workspace.this_split_NA), &(workspace.this_split_ix) );
|
1759
1942
|
spl1 = InSubset; spl2 = NotInSubset;
|
1760
1943
|
set_tree_as_categorical(workspace.tree->back(), input_data.ncat[workspace.col_best],
|
1761
|
-
|
1944
|
+
workspace.buffer_subset_categ_best.data(), workspace.col_best);
|
1945
|
+
if (workspace.best_cat_split_is_binary)
|
1946
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1762
1947
|
|
1763
1948
|
} else {
|
1764
1949
|
spl1 = SingleCateg;
|
1765
1950
|
workspace.temp_ptr_x = input_data.categorical_data + workspace.col_best * input_data.nrows;
|
1766
|
-
std::sort(
|
1951
|
+
std::sort(workspace.ix_arr.data() + workspace.st, workspace.ix_arr.data() + workspace.end + 1,
|
1767
1952
|
[&workspace](const size_t a, const size_t b){return workspace.temp_ptr_x[a] < workspace.temp_ptr_x[b];});
|
1768
1953
|
set_tree_as_categorical(workspace.tree->back(), workspace.col_best, input_data.ncat[workspace.col_best]);
|
1769
1954
|
|
@@ -1773,11 +1958,15 @@ void recursive_split_categ(Workspace &workspace,
|
|
1773
1958
|
break;
|
1774
1959
|
}
|
1775
1960
|
}
|
1961
|
+
workspace.exhausted_col_tracker.push_col(workspace.col_best + input_data.ncols_numeric);
|
1776
1962
|
}
|
1777
1963
|
break;
|
1778
1964
|
}
|
1779
1965
|
|
1780
|
-
|
1966
|
+
default:
|
1967
|
+
{
|
1968
|
+
unexpected_error();
|
1969
|
+
}
|
1781
1970
|
}
|
1782
1971
|
|
1783
1972
|
|
@@ -1871,7 +2060,7 @@ void recursive_split_categ(Workspace &workspace,
|
|
1871
2060
|
|
1872
2061
|
if (tree_from == 0) {
|
1873
2062
|
workspace.tree->clear();
|
1874
|
-
} else if ((*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.
|
2063
|
+
} else if (!(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.empty()) {
|
1875
2064
|
(*workspace.tree)[(*workspace.tree)[tree_from].parent].all_branches.pop_back();
|
1876
2065
|
workspace.tree->pop_back();
|
1877
2066
|
} else {
|
@@ -1925,6 +2114,11 @@ void recursive_split_categ(Workspace &workspace,
|
|
1925
2114
|
workspace.tree->pop_back();
|
1926
2115
|
break;
|
1927
2116
|
}
|
2117
|
+
|
2118
|
+
default:
|
2119
|
+
{
|
2120
|
+
unexpected_error();
|
2121
|
+
}
|
1928
2122
|
}
|
1929
2123
|
}
|
1930
2124
|
}
|