outliertree 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +23 -0
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +83 -41
- data/vendor/outliertree/src/Makevars.in +3 -0
- data/vendor/outliertree/src/Makevars.win +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +17 -27
- data/vendor/outliertree/src/Rwrapper.cpp +354 -62
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +525 -331
- data/vendor/outliertree/src/misc.cpp +166 -17
- data/vendor/outliertree/src/outlier_tree.hpp +164 -56
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -41,23 +41,34 @@
|
|
41
41
|
#include <algorithm>
|
42
42
|
#include <numeric>
|
43
43
|
#include <unordered_set>
|
44
|
+
#include <exception>
|
45
|
+
#include <stdexcept>
|
46
|
+
#include <cassert>
|
44
47
|
#include <math.h>
|
45
48
|
#include <cmath>
|
46
49
|
#include <stddef.h>
|
47
50
|
#include <limits.h>
|
51
|
+
#include <limits>
|
48
52
|
#include <stdlib.h>
|
49
53
|
#include <stddef.h>
|
50
54
|
#include <string.h>
|
55
|
+
#include <stdint.h>
|
51
56
|
#ifdef _OPENMP
|
52
57
|
#include <omp.h>
|
53
58
|
#endif
|
59
|
+
#ifdef _FOR_R
|
60
|
+
#include <Rcpp.h>
|
61
|
+
#endif
|
62
|
+
#include <signal.h>
|
63
|
+
typedef void (*sig_t_)(int);
|
64
|
+
|
54
65
|
|
55
66
|
/************************
|
56
67
|
Short Functions
|
57
68
|
*************************/
|
58
69
|
/* Value (0 or 1) of the bit at position 'bit' in 'number', where position 0 is the least-significant bit. */
#define extract_bit(number, bit) (((number) >> (bit)) & 1) /* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
|
59
70
|
/* 2^n as a 'size_t', obtained by left-shifting 1. 'n' must be smaller than the bit width of 'size_t'. */
#define pow2(n) ( ((size_t) 1) << (n) ) /* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
|
60
|
-
#define avg_between(a, b) ((
|
71
|
+
/* Midpoint between 'a' and 'b', expressed as an offset from 'a'. Note that 'a' is evaluated twice. */
#define avg_between(a, b) ((a) + 0.5*((b) - (a)))
|
61
72
|
/* x*x. The argument is evaluated twice, so do not pass expressions with side effects. */
#define square(x) ((x) * (x))
|
62
73
|
#ifndef isinf
|
63
74
|
#define isinf std::isinf
|
@@ -68,7 +79,7 @@
|
|
68
79
|
/* True when 'x' is NaN or infinite. Uses whichever 'isnan'/'isinf' are in scope; 'x' is evaluated up to two times. */
#define is_na_or_inf(x) (isnan(x) || isinf(x))
|
69
80
|
|
70
81
|
/* Aliasing for compiler optimizations */
|
71
|
-
#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
|
82
|
+
#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
|
72
83
|
#define restrict __restrict
|
73
84
|
#else
|
74
85
|
#define restrict
|
@@ -89,6 +100,8 @@
|
|
89
100
|
#define omp_get_thread_num() 0
|
90
101
|
#endif
|
91
102
|
|
103
|
+
/* Expands to a throw-expression raising 'std::runtime_error' with a fixed message; use as a statement: 'unexpected_error();' */
#define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
|
104
|
+
|
92
105
|
|
93
106
|
/****************************************************************
|
94
107
|
Data types and structs that are returned from this module
|
@@ -103,6 +116,8 @@ typedef enum SplitType {
|
|
103
116
|
} SplitType;
|
104
117
|
typedef enum ColTransf {NoTransf, Log, Exp} ColTransf; /* transformation to apply to numeric column */
|
105
118
|
|
119
|
+
/* TODO: should write serializers for the model objects, but need to somehow deal with long double types */
|
120
|
+
|
106
121
|
/*
|
107
122
|
* 1-d clusters that define homogeneous groups in which observations can be outliers.
|
108
123
|
* Note that these are associated to a tree and define one extra condition from what
|
@@ -114,7 +129,7 @@ typedef struct Cluster {
|
|
114
129
|
size_t col_num = 0; /* numer of the column by which its being split, the target column is given by index of the cluster vector */
|
115
130
|
SplitType split_type = Root;
|
116
131
|
double split_point = HUGE_VAL; /* numerical */
|
117
|
-
std::vector<char> split_subset
|
132
|
+
std::vector<signed char> split_subset; /* categorical */
|
118
133
|
int split_lev = INT_MAX; /* ordinal */
|
119
134
|
bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
|
120
135
|
|
@@ -127,7 +142,7 @@ typedef struct Cluster {
|
|
127
142
|
double display_lim_high = -HUGE_VAL; /* numerical target column */
|
128
143
|
double display_mean = -HUGE_VAL; /* numerical target column */
|
129
144
|
double display_sd = -HUGE_VAL; /* numerical target column */
|
130
|
-
std::vector<char> subset_common
|
145
|
+
std::vector<signed char> subset_common; /* categorical or ordinal target column (=0 is common) */
|
131
146
|
double perc_in_subset = HUGE_VAL; /* categorical or ordinal target column */
|
132
147
|
double perc_next_most_comm = -HUGE_VAL; /* categorical or ordinal target column */ /* TODO */
|
133
148
|
int categ_maj = -1; /* when using majority-criterion for categorical outliers */
|
@@ -156,7 +171,7 @@ typedef struct Cluster {
|
|
156
171
|
}
|
157
172
|
|
158
173
|
/* categorical split */
|
159
|
-
Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
|
174
|
+
Cluster(ColType column_type, size_t col_num, SplitType split_type, signed char *split_subset, int ncat, bool has_NA_branch = false)
|
160
175
|
{
|
161
176
|
this->column_type = column_type;
|
162
177
|
this->col_num = col_num;
|
@@ -245,21 +260,21 @@ typedef struct Cluster {
|
|
245
260
|
typedef struct ClusterTree {
|
246
261
|
size_t parent = 0; /* index in a vector */
|
247
262
|
SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
|
248
|
-
std::vector<size_t> clusters
|
263
|
+
std::vector<size_t> clusters; /* these clusters define additional splits */
|
249
264
|
|
250
265
|
SplitType split_this_branch = Root; /* when using 'follow_all' */
|
251
|
-
std::vector<size_t> all_branches
|
266
|
+
std::vector<size_t> all_branches; /* when using 'follow_all' */
|
252
267
|
|
253
268
|
ColType column_type = NoType;
|
254
269
|
size_t col_num = 0;
|
255
270
|
double split_point = HUGE_VAL;
|
256
|
-
std::vector<char> split_subset
|
271
|
+
std::vector<signed char> split_subset;
|
257
272
|
int split_lev = INT_MAX;
|
258
273
|
|
259
274
|
size_t tree_NA = 0; /* binary splits */
|
260
275
|
size_t tree_left = 0; /* binary splits */
|
261
276
|
size_t tree_right = 0; /* binary splits */
|
262
|
-
std::vector<size_t> binary_branches
|
277
|
+
std::vector<size_t> binary_branches; /* multiple splits (single category or binarized categories) */
|
263
278
|
|
264
279
|
ClusterTree(size_t parent, SplitType parent_branch)
|
265
280
|
{
|
@@ -286,7 +301,7 @@ typedef struct ClusterTree {
|
|
286
301
|
this->split_lev = split_lev;
|
287
302
|
}
|
288
303
|
|
289
|
-
ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
|
304
|
+
ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, signed char *split_subset, int ncat)
|
290
305
|
{
|
291
306
|
this->parent = parent;
|
292
307
|
this->col_num = col_num;
|
@@ -336,6 +351,8 @@ typedef struct ClusterTree {
|
|
336
351
|
|
337
352
|
} ClusterTree;
|
338
353
|
|
354
|
+
/* TODO: should separate the results from the actual model object */
|
355
|
+
|
339
356
|
/* these are needed for prediction time, and are thus returned from the function that fits the model */
|
340
357
|
typedef struct ModelOutputs {
|
341
358
|
std::vector< std::vector<ClusterTree> > all_trees; /* clusters in which observations can be outliers, required for prediction time */
|
@@ -370,11 +387,6 @@ typedef struct ModelOutputs {
|
|
370
387
|
archive(
|
371
388
|
this->all_trees,
|
372
389
|
this->all_clusters,
|
373
|
-
this->outlier_scores_final,
|
374
|
-
this->outlier_clusters_final,
|
375
|
-
this->outlier_columns_final,
|
376
|
-
this->outlier_trees_final,
|
377
|
-
this->outlier_depth_final,
|
378
390
|
this->start_ix_cat_counts,
|
379
391
|
this->prop_categ,
|
380
392
|
this->col_transf,
|
@@ -421,7 +433,69 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
421
433
|
size_t max_depth = 3, double max_perc_outliers = 0.01, size_t min_size_numeric = 25, size_t min_size_categ = 50,
|
422
434
|
double min_gain = 1e-2, bool gain_as_pct = false, bool follow_all = false, double z_norm = 2.67, double z_outlier = 8.0);
|
423
435
|
|
424
|
-
|
436
|
+
/*
 * Stack-like bookkeeping of columns flagged as "exhausted" while branches are open.
 *
 * - 'push_branch' opens a new level.
 * - 'push_col' flags a column and charges it to the most recently opened level.
 * - 'pop_branch' closes the most recent level and un-flags every column that was
 *   charged to it, leaving the flags set by outer levels untouched.
 *
 * 'initialize' must be called before use in order to size the flag array.
 */
class ExhaustedColumnTracker
{
public:
    std::vector<bool> is_exhausted;   /* is_exhausted[col] is true while 'col' is held by some open level */
    std::vector<size_t> col_indices;  /* columns currently flagged, in the order in which they were pushed */
    std::vector<size_t> n_held;       /* one counter per open level: how many trailing entries of 'col_indices' it owns */

    /* Resets all state: 'ncols' flags set to false, no open levels, no held columns.
       'max_depth' is only used to pre-reserve space for the per-level counters. */
    void initialize(size_t ncols, size_t max_depth)
    {
        this->is_exhausted.assign(ncols, false);
        this->n_held.clear();
        this->n_held.reserve(max_depth+1);
        this->col_indices.clear();
        this->col_indices.reserve(ncols);
    }

    /* Opens a new level which initially holds no columns. */
    void push_branch()
    {
        this->n_held.push_back(0);
    }

    /* Flags 'col' as exhausted and charges it to the innermost open level.
       Throws 'std::logic_error' if no level is open and 'std::out_of_range' if
       'col' is not a valid column index (both were undefined behavior before).
       Nothing is modified when an exception is thrown. */
    void push_col(size_t col)
    {
        if (this->n_held.empty())
            throw std::logic_error("ExhaustedColumnTracker: 'push_col' called without an open branch.\n");
        if (col >= this->is_exhausted.size())
            throw std::out_of_range("ExhaustedColumnTracker: column index out of range.\n");

        /* the only step that can fail (allocation) goes first, so that the
           flag and the counter never get out of sync with 'col_indices' */
        this->col_indices.push_back(col);
        this->is_exhausted[col] = true;
        this->n_held.back() += 1;
    }

    /* Closes the innermost open level, un-flagging the columns charged to it.
       It is a no-op when no level is open: this function is invoked from a
       destructor, so it must neither throw nor touch an empty vector. */
    void pop_branch()
    {
        if (this->n_held.empty())
            return;

        size_t col;
        while (this->n_held.back() > 0 && !this->col_indices.empty())
        {
            col = this->col_indices.back();
            this->is_exhausted[col] = false;
            this->col_indices.pop_back();
            this->n_held.back() -= 1;
        }

        this->n_held.pop_back();
    }
};
|
478
|
+
|
479
|
+
class ExhaustedColumnsLevel
|
480
|
+
{
|
481
|
+
public:
|
482
|
+
bool pop = false;
|
483
|
+
ExhaustedColumnTracker* tracker = nullptr;
|
484
|
+
ExhaustedColumnsLevel() = default;
|
485
|
+
void initialize(ExhaustedColumnTracker* tracker) {
|
486
|
+
this->pop = true;
|
487
|
+
this->tracker = tracker;
|
488
|
+
this->tracker->push_branch();
|
489
|
+
}
|
490
|
+
~ExhaustedColumnsLevel() {
|
491
|
+
if (this->pop) {
|
492
|
+
this->tracker->pop_branch();
|
493
|
+
this->pop = false;
|
494
|
+
}
|
495
|
+
}
|
496
|
+
};
|
497
|
+
|
498
|
+
struct Workspace {
|
425
499
|
|
426
500
|
std::vector<size_t> ix_arr; /* indices from the target column */
|
427
501
|
size_t st; /* chunk of the indices to take for current function calls */
|
@@ -460,7 +534,7 @@ typedef struct {
|
|
460
534
|
int *untransf_target_col; /* column as it was before forcibly binarizing (dynamic pointer) */
|
461
535
|
int *temp_ptr_x; /* dynamic pointer */
|
462
536
|
|
463
|
-
std::vector<char> buffer_subset_categ_best; /* categorical split that gave the best gain */
|
537
|
+
std::vector<signed char> buffer_subset_categ_best; /* categorical split that gave the best gain */
|
464
538
|
long double this_gain; /* buffer where to store gain */
|
465
539
|
double this_split_point; /* numeric split threshold */
|
466
540
|
int this_split_lev; /* ordinal split threshold */
|
@@ -477,8 +551,8 @@ typedef struct {
|
|
477
551
|
std::vector<size_t> buffer_crosstab; /* buffer arrays where to allocate values required by functions and not used outside them */
|
478
552
|
std::vector<size_t> buffer_cat_cnt; /* buffer arrays where to allocate values required by functions and not used outside them */
|
479
553
|
std::vector<size_t> buffer_cat_sorted; /* buffer arrays where to allocate values required by functions and not used outside them */
|
480
|
-
std::vector<char>
|
481
|
-
std::vector<char>
|
554
|
+
std::vector<signed char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
|
555
|
+
std::vector<signed char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
|
482
556
|
std::vector<long double> buffer_sd; /* used for a more numerically-stable two-pass gain calculation */
|
483
557
|
|
484
558
|
bool drop_cluster; /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
|
@@ -486,10 +560,14 @@ typedef struct {
|
|
486
560
|
bool target_col_is_ord; /* whether the target column is ordinal (rest is the same as in categoricals) */
|
487
561
|
int ncat_this; /* number of categories in the target column */
|
488
562
|
|
489
|
-
|
563
|
+
ExhaustedColumnTracker exhausted_col_tracker;
|
564
|
+
bool has_zero_variance;
|
565
|
+
bool is_binary_split;
|
566
|
+
bool best_cat_split_is_binary;
|
567
|
+
};
|
490
568
|
|
491
569
|
/* info holders to shorten function call arguments */
|
492
|
-
|
570
|
+
struct ModelParams {
|
493
571
|
bool categ_as_bin;
|
494
572
|
bool ord_as_bin;
|
495
573
|
bool cat_bruteforce_subset;
|
@@ -506,16 +584,16 @@ typedef struct {
|
|
506
584
|
double z_outlier;
|
507
585
|
double z_tail;
|
508
586
|
std::vector<long double> prop_small; /* this is not a parameter, but a shared array determined from the parameters and data */
|
509
|
-
}
|
587
|
+
};
|
510
588
|
|
511
589
|
/* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
|
512
|
-
|
590
|
+
struct InputData {
|
513
591
|
double *restrict numeric_data; size_t ncols_numeric;
|
514
592
|
int *restrict categorical_data; size_t ncols_categ; int *restrict ncat;
|
515
593
|
int *restrict ordinal_data; size_t ncols_ord; int *restrict ncat_ord;
|
516
594
|
size_t nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ;
|
517
595
|
std::vector<size_t> cat_counts;
|
518
|
-
}
|
596
|
+
};
|
519
597
|
|
520
598
|
|
521
599
|
void process_numeric_col(std::vector<Cluster> &cluster_root,
|
@@ -547,12 +625,12 @@ void recursive_split_categ(Workspace &workspace,
|
|
547
625
|
(This is the module from which
|
548
626
|
new data can be flagged as outliers)
|
549
627
|
********************************************/
|
550
|
-
|
628
|
+
struct PredictionData {
|
551
629
|
double *restrict numeric_data;
|
552
630
|
int *restrict categorical_data;
|
553
631
|
int *restrict ordinal_data;
|
554
632
|
size_t nrows;
|
555
|
-
}
|
633
|
+
};
|
556
634
|
|
557
635
|
bool find_new_outliers(double *restrict numeric_data,
|
558
636
|
int *restrict categorical_data,
|
@@ -570,19 +648,21 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
570
648
|
*********************************/
|
571
649
|
#define SD_REG 1e-5 /* Regularization for standard deviation estimation */
|
572
650
|
|
573
|
-
|
651
|
+
/* TODO: should make long doubles optional */
|
652
|
+
|
653
|
+
struct NumericBranch {
|
574
654
|
size_t cnt;
|
575
655
|
long double sum;
|
576
656
|
long double sum_sq;
|
577
|
-
}
|
657
|
+
};
|
578
658
|
|
579
|
-
|
659
|
+
struct NumericSplit {
|
580
660
|
NumericBranch NA_branch = {0, 0, 0};
|
581
661
|
NumericBranch left_branch = {0, 0, 0};
|
582
662
|
NumericBranch right_branch = {0, 0, 0};
|
583
|
-
}
|
663
|
+
};
|
584
664
|
|
585
|
-
|
665
|
+
struct CategSplit {
|
586
666
|
size_t *restrict NA_branch; /* array of counts of the target variable's categories */
|
587
667
|
size_t *restrict left_branch; /* array of counts of the target variable's categories */
|
588
668
|
size_t *restrict right_branch; /* array of counts of the target variable's categories */
|
@@ -591,11 +671,11 @@ typedef struct {
|
|
591
671
|
size_t size_NA = 0;
|
592
672
|
size_t size_left = 0;
|
593
673
|
size_t size_right = 0;
|
594
|
-
}
|
674
|
+
};
|
595
675
|
|
596
|
-
void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot,
|
676
|
+
void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, signed char onehot[]);
|
597
677
|
size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x);
|
598
|
-
void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
|
678
|
+
void flag_zero_counts(signed char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
|
599
679
|
long double calc_sd(size_t cnt, long double sum, long double sum_sq);
|
600
680
|
long double calc_sd(NumericBranch &branch);
|
601
681
|
long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean);
|
@@ -610,23 +690,25 @@ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size
|
|
610
690
|
size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
|
611
691
|
void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
|
612
692
|
long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
|
613
|
-
long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
|
693
|
+
long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
|
614
694
|
void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
|
615
695
|
bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
|
616
696
|
long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
|
617
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point);
|
697
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset, int *restrict split_point, bool *restrict has_zero_variance, bool *restrict binary_split);
|
618
698
|
void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
|
619
699
|
size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
|
620
700
|
bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
|
621
|
-
size_t *restrict split_left, size_t *restrict split_NA);
|
701
|
+
size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
|
622
702
|
void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
623
703
|
size_t ncat_y, size_t ncat_x, long double base_info,
|
624
704
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
|
625
|
-
bool has_na, size_t min_size, long double *gain, int *split_point
|
705
|
+
bool has_na, size_t min_size, long double *gain, int *split_point,
|
706
|
+
bool *restrict has_zero_variance, bool *restrict binary_split);
|
626
707
|
void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
627
708
|
size_t ncat_x, long double base_info,
|
628
709
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
|
629
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset
|
710
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
|
711
|
+
bool *restrict has_zero_variance, bool *restrict binary_split);
|
630
712
|
void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
631
713
|
size_t ncat_x, size_t ncat_y, long double base_info,
|
632
714
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
|
@@ -634,7 +716,8 @@ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end
|
|
634
716
|
void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
635
717
|
size_t ncat_x, size_t ncat_y, long double base_info,
|
636
718
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
|
637
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset
|
719
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
|
720
|
+
bool *restrict has_zero_variance, bool *restrict binary_split);
|
638
721
|
|
639
722
|
|
640
723
|
|
@@ -642,8 +725,8 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
|
|
642
725
|
Prototypes from clusters.cpp
|
643
726
|
************************************/
|
644
727
|
/* n*perc + 2*sqrt(n*perc*(1-perc)) + 1, evaluated in 'long double'.
   Every use of an argument is parenthesized so that expression arguments
   (e.g. 'a + b' passed as 'perc') expand correctly; 'n' and 'perc' are evaluated more than once. */
#define calculate_max_outliers(n, perc) ( (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - (perc)) ) + (long double)1 )
|
645
|
-
#define z_score(x, mu, sd) ( ((x) - (mu)) / (sd) )
|
646
|
-
#define chebyshyov_bound(
|
728
|
+
/* (x - mu) / sd, with the denominator floored at 1e-12 so that a zero 'sd' does not cause a division by zero.
   'std::max' is called with a 'double' literal, so 'sd' needs to be an expression of type 'double'. */
#define z_score(x, mu, sd) ( ((x) - (mu)) / std::max((sd), 1e-12) )
|
729
|
+
/* 1 / zval^2 with the denominator floored at 1, so the result never exceeds 1
   (presumably the Chebyshev inequality bound for a z-value). 'zval' is evaluated twice through 'square'. */
#define chebyshyov_bound(zval) (1.0 / std::max(square(zval), 1.))
|
647
730
|
|
648
731
|
bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
649
732
|
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
@@ -654,7 +737,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
654
737
|
void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
|
655
738
|
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
656
739
|
size_t *restrict outlier_depth, Cluster &cluster,
|
657
|
-
size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
|
740
|
+
size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm);
|
658
741
|
bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
|
659
742
|
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
660
743
|
size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
|
@@ -662,7 +745,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
|
|
662
745
|
double max_perc_outliers, double z_norm, double z_outlier,
|
663
746
|
long double *restrict perc_threshold, long double *restrict prop_prior,
|
664
747
|
size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
|
665
|
-
size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
|
748
|
+
size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
|
666
749
|
bool *restrict drop_cluster);
|
667
750
|
void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]);
|
668
751
|
void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]);
|
@@ -680,12 +763,12 @@ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size
|
|
680
763
|
/* 1 + n*perc/z_norm, evaluated in 'long double'. 'z_norm' is parenthesized so that
   an expression passed as divisor (e.g. 'a + b') is divided by as a whole. */
#define calculate_max_cat_outliers(n, perc, z_norm) ((long double)1 + ((n) * (perc) / (z_norm))) /* Note: this is not anyhow probabilistic, nor based on provable bounds */
|
681
764
|
void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
682
765
|
long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
|
683
|
-
double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
|
766
|
+
double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
|
684
767
|
void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
685
|
-
long double prior_prob[], double z_outlier, char is_outlier[],
|
768
|
+
long double prior_prob[], double z_outlier, signed char is_outlier[],
|
686
769
|
bool *found_outliers, bool *new_is_outlier, int *categ_maj);
|
687
770
|
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
|
688
|
-
char is_outlier[], double *next_most_comm);
|
771
|
+
signed char is_outlier[], double *next_most_comm);
|
689
772
|
|
690
773
|
|
691
774
|
|
@@ -699,7 +782,7 @@ typedef struct {
|
|
699
782
|
double gain_best_restore;
|
700
783
|
double split_point_restore;
|
701
784
|
int split_lev_restore;
|
702
|
-
std::vector<char> split_subset_restore;
|
785
|
+
std::vector<signed char> split_subset_restore;
|
703
786
|
size_t ix1_restore;
|
704
787
|
size_t ix2_restore;
|
705
788
|
size_t ix3_restore;
|
@@ -709,26 +792,27 @@ typedef struct {
|
|
709
792
|
ColType col_type_best_rememer;
|
710
793
|
double split_point_best_restore;
|
711
794
|
int split_lev_best_restore;
|
712
|
-
std::vector<char> split_subset_best_restore;
|
795
|
+
std::vector<signed char> split_subset_best_restore;
|
713
796
|
long double base_info_restore;
|
714
797
|
long double base_info_orig_restore;
|
715
798
|
double sd_y_restore;
|
716
799
|
bool has_outliers_restore;
|
717
800
|
bool lev_has_outliers_restore;
|
801
|
+
bool is_binary_split_restore;
|
718
802
|
} RecursionState;
|
719
803
|
|
720
804
|
|
721
|
-
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols,
|
805
|
+
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ = 0);
|
722
806
|
void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
723
807
|
int categorical_data[], size_t ncols, size_t nrows,
|
724
|
-
|
808
|
+
char has_NA[], char skip_col[], int nthreads);
|
725
809
|
void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
726
|
-
size_t ncols, size_t min_conditioned_size, size_t nrows,
|
810
|
+
size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads);
|
727
811
|
void calculate_lowerlim_proportion(long double *restrict prop_small, long double *restrict prop,
|
728
812
|
size_t start_ix_cat_counts[], size_t cat_counts[],
|
729
813
|
size_t ncols, size_t nrows, double z_norm, double z_tail);
|
730
|
-
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows,
|
731
|
-
|
814
|
+
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
|
815
|
+
char skip_col[], int min_decimals[], int nthreads);
|
732
816
|
void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central);
|
733
817
|
void check_for_tails(size_t ix_arr[], size_t st, size_t end, double *restrict x,
|
734
818
|
double z_norm, double max_perc_outliers,
|
@@ -739,14 +823,14 @@ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t s
|
|
739
823
|
size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA);
|
740
824
|
size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end);
|
741
825
|
void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right);
|
742
|
-
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
|
826
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
|
743
827
|
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right);
|
744
828
|
bool check_workspace_is_allocated(Workspace &workspace);
|
745
829
|
void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
|
746
830
|
void backup_recursion_state(Workspace &workspace, RecursionState &state_backup);
|
747
831
|
void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
|
748
832
|
void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
|
749
|
-
void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col);
|
833
|
+
void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col);
|
750
834
|
void set_tree_as_categorical(ClusterTree &tree, size_t col);
|
751
835
|
void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat);
|
752
836
|
void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
|
@@ -756,3 +840,27 @@ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nth
|
|
756
840
|
void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
|
757
841
|
int decimals_diff(double val1, double val2);
|
758
842
|
void dealloc_ModelOutputs(ModelOutputs &model_outputs);
|
843
|
+
ModelOutputs get_empty_ModelOutputs();
|
844
|
+
bool get_has_openmp();
|
845
|
+
|
846
|
+
extern bool interrupt_switch;
|
847
|
+
extern bool handle_is_locked;
|
848
|
+
void set_interrup_global_variable(int s);
|
849
|
+
class SignalSwitcher
|
850
|
+
{
|
851
|
+
public:
|
852
|
+
sig_t_ old_sig;
|
853
|
+
bool is_active;
|
854
|
+
SignalSwitcher();
|
855
|
+
~SignalSwitcher();
|
856
|
+
void restore_handle();
|
857
|
+
};
|
858
|
+
void check_interrupt_switch(SignalSwitcher &ss);
|
859
|
+
#ifdef _FOR_PYTHON
|
860
|
+
bool cy_check_interrupt_switch();
|
861
|
+
void cy_tick_off_interrupt_switch();
|
862
|
+
#endif
|
863
|
+
size_t log2ceil(size_t v);
|
864
|
+
#ifdef _FOR_PYTHON
|
865
|
+
ModelOutputs deepcopy(const ModelOutputs &inp);
|
866
|
+
#endif
|
@@ -226,6 +226,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
|
|
226
226
|
true : found_outliers;
|
227
227
|
break;
|
228
228
|
}
|
229
|
+
|
230
|
+
default:
|
231
|
+
{
|
232
|
+
assert(0);
|
233
|
+
}
|
229
234
|
}
|
230
235
|
break;
|
231
236
|
}
|
@@ -279,6 +284,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
|
|
279
284
|
true : found_outliers;
|
280
285
|
break;
|
281
286
|
}
|
287
|
+
|
288
|
+
default:
|
289
|
+
{
|
290
|
+
assert(0);
|
291
|
+
}
|
282
292
|
}
|
283
293
|
break;
|
284
294
|
}
|
@@ -332,10 +342,16 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
|
|
332
342
|
true : found_outliers;
|
333
343
|
break;
|
334
344
|
}
|
345
|
+
|
346
|
+
default:
|
347
|
+
{
|
348
|
+
assert(0);
|
349
|
+
}
|
335
350
|
}
|
336
351
|
break;
|
337
352
|
}
|
338
353
|
|
354
|
+
default: {}
|
339
355
|
}
|
340
356
|
}
|
341
357
|
}
|
@@ -548,6 +564,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
548
564
|
if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
|
549
565
|
break;
|
550
566
|
}
|
567
|
+
|
568
|
+
default:
|
569
|
+
{
|
570
|
+
assert(0);
|
571
|
+
}
|
551
572
|
}
|
552
573
|
break;
|
553
574
|
}
|
@@ -587,6 +608,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
587
608
|
break;
|
588
609
|
}
|
589
610
|
|
611
|
+
default:
|
612
|
+
{
|
613
|
+
assert(0);
|
614
|
+
}
|
615
|
+
|
590
616
|
/* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
|
591
617
|
}
|
592
618
|
break;
|
@@ -626,6 +652,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
626
652
|
if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
|
627
653
|
break;
|
628
654
|
}
|
655
|
+
|
656
|
+
default:
|
657
|
+
{
|
658
|
+
assert(0);
|
659
|
+
}
|
629
660
|
}
|
630
661
|
break;
|
631
662
|
}
|
@@ -645,6 +676,8 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
645
676
|
model_outputs.all_clusters[col][cl].cluster_sd
|
646
677
|
)
|
647
678
|
);
|
679
|
+
if (is_na_or_inf(outlier_score))
|
680
|
+
outlier_score = 1. - 1e-15;
|
648
681
|
} else {
|
649
682
|
outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
|
650
683
|
}
|