outliertree 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -41,23 +41,34 @@
|
|
41
41
|
#include <algorithm>
|
42
42
|
#include <numeric>
|
43
43
|
#include <unordered_set>
|
44
|
+
#include <exception>
|
45
|
+
#include <stdexcept>
|
46
|
+
#include <cassert>
|
44
47
|
#include <math.h>
|
45
48
|
#include <cmath>
|
46
49
|
#include <stddef.h>
|
47
50
|
#include <limits.h>
|
51
|
+
#include <limits>
|
48
52
|
#include <stdlib.h>
|
49
53
|
#include <stddef.h>
|
50
54
|
#include <string.h>
|
55
|
+
#include <stdint.h>
|
51
56
|
#ifdef _OPENMP
|
52
57
|
#include <omp.h>
|
53
58
|
#endif
|
59
|
+
#ifdef _FOR_R
|
60
|
+
#include <Rcpp.h>
|
61
|
+
#endif
|
62
|
+
#include <signal.h>
|
63
|
+
typedef void (*sig_t_)(int);
|
64
|
+
|
54
65
|
|
55
66
|
/************************
|
56
67
|
Short Functions
|
57
68
|
*************************/
|
58
69
|
#define extract_bit(number, bit) (((number) >> (bit)) & 1) /* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
|
59
70
|
#define pow2(n) ( ((size_t) 1) << (n) ) /* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
|
60
|
-
#define avg_between(a, b) ((
|
71
|
+
#define avg_between(a, b) ((a) + 0.5*((b) - (a)))
|
61
72
|
#define square(x) ((x) * (x))
|
62
73
|
#ifndef isinf
|
63
74
|
#define isinf std::isinf
|
@@ -68,7 +79,7 @@
|
|
68
79
|
#define is_na_or_inf(x) (isnan(x) || isinf(x))
|
69
80
|
|
70
81
|
/* Aliasing for compiler optimizations */
|
71
|
-
#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
|
82
|
+
#if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
|
72
83
|
#define restrict __restrict
|
73
84
|
#else
|
74
85
|
#define restrict
|
@@ -89,6 +100,8 @@
|
|
89
100
|
#define omp_get_thread_num() 0
|
90
101
|
#endif
|
91
102
|
|
103
|
+
#define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
|
104
|
+
|
92
105
|
|
93
106
|
/****************************************************************
|
94
107
|
Data types and structs that are returned from this module
|
@@ -103,6 +116,8 @@ typedef enum SplitType {
|
|
103
116
|
} SplitType;
|
104
117
|
typedef enum ColTransf {NoTransf, Log, Exp} ColTransf; /* transformation to apply to numeric column */
|
105
118
|
|
119
|
+
/* TODO: should write serializers for the model objects, but need to somehow deal with long double types */
|
120
|
+
|
106
121
|
/*
|
107
122
|
* 1-d clusters that define homogeneous groups in which observations can be outliers.
|
108
123
|
* Note that these are associated to a tree and define one extra condition from what
|
@@ -114,7 +129,7 @@ typedef struct Cluster {
|
|
114
129
|
size_t col_num = 0; /* numer of the column by which its being split, the target column is given by index of the cluster vector */
|
115
130
|
SplitType split_type = Root;
|
116
131
|
double split_point = HUGE_VAL; /* numerical */
|
117
|
-
std::vector<char> split_subset
|
132
|
+
std::vector<signed char> split_subset; /* categorical */
|
118
133
|
int split_lev = INT_MAX; /* ordinal */
|
119
134
|
bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
|
120
135
|
|
@@ -127,7 +142,7 @@ typedef struct Cluster {
|
|
127
142
|
double display_lim_high = -HUGE_VAL; /* numerical target column */
|
128
143
|
double display_mean = -HUGE_VAL; /* numerical target column */
|
129
144
|
double display_sd = -HUGE_VAL; /* numerical target column */
|
130
|
-
std::vector<char> subset_common
|
145
|
+
std::vector<signed char> subset_common; /* categorical or ordinal target column (=0 is common) */
|
131
146
|
double perc_in_subset = HUGE_VAL; /* categorical or ordinal target column */
|
132
147
|
double perc_next_most_comm = -HUGE_VAL; /* categorical or ordinal target column */ /* TODO */
|
133
148
|
int categ_maj = -1; /* when using majority-criterion for categorical outliers */
|
@@ -156,7 +171,7 @@ typedef struct Cluster {
|
|
156
171
|
}
|
157
172
|
|
158
173
|
/* categorical split */
|
159
|
-
Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
|
174
|
+
Cluster(ColType column_type, size_t col_num, SplitType split_type, signed char *split_subset, int ncat, bool has_NA_branch = false)
|
160
175
|
{
|
161
176
|
this->column_type = column_type;
|
162
177
|
this->col_num = col_num;
|
@@ -245,21 +260,21 @@ typedef struct Cluster {
|
|
245
260
|
typedef struct ClusterTree {
|
246
261
|
size_t parent = 0; /* index in a vector */
|
247
262
|
SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
|
248
|
-
std::vector<size_t> clusters
|
263
|
+
std::vector<size_t> clusters; /* these clusters define additional splits */
|
249
264
|
|
250
265
|
SplitType split_this_branch = Root; /* when using 'follow_all' */
|
251
|
-
std::vector<size_t> all_branches
|
266
|
+
std::vector<size_t> all_branches; /* when using 'follow_all' */
|
252
267
|
|
253
268
|
ColType column_type = NoType;
|
254
269
|
size_t col_num = 0;
|
255
270
|
double split_point = HUGE_VAL;
|
256
|
-
std::vector<char> split_subset
|
271
|
+
std::vector<signed char> split_subset;
|
257
272
|
int split_lev = INT_MAX;
|
258
273
|
|
259
274
|
size_t tree_NA = 0; /* binary splits */
|
260
275
|
size_t tree_left = 0; /* binary splits */
|
261
276
|
size_t tree_right = 0; /* binary splits */
|
262
|
-
std::vector<size_t> binary_branches
|
277
|
+
std::vector<size_t> binary_branches; /* multiple splits (single category or binarized categories) */
|
263
278
|
|
264
279
|
ClusterTree(size_t parent, SplitType parent_branch)
|
265
280
|
{
|
@@ -286,7 +301,7 @@ typedef struct ClusterTree {
|
|
286
301
|
this->split_lev = split_lev;
|
287
302
|
}
|
288
303
|
|
289
|
-
ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
|
304
|
+
ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, signed char *split_subset, int ncat)
|
290
305
|
{
|
291
306
|
this->parent = parent;
|
292
307
|
this->col_num = col_num;
|
@@ -336,6 +351,8 @@ typedef struct ClusterTree {
|
|
336
351
|
|
337
352
|
} ClusterTree;
|
338
353
|
|
354
|
+
/* TODO: should separate the results from the actual model object */
|
355
|
+
|
339
356
|
/* these are needed for prediction time, and are thus returned from the function that fits the model */
|
340
357
|
typedef struct ModelOutputs {
|
341
358
|
std::vector< std::vector<ClusterTree> > all_trees; /* clusters in which observations can be outliers, required for prediction time */
|
@@ -370,11 +387,6 @@ typedef struct ModelOutputs {
|
|
370
387
|
archive(
|
371
388
|
this->all_trees,
|
372
389
|
this->all_clusters,
|
373
|
-
this->outlier_scores_final,
|
374
|
-
this->outlier_clusters_final,
|
375
|
-
this->outlier_columns_final,
|
376
|
-
this->outlier_trees_final,
|
377
|
-
this->outlier_depth_final,
|
378
390
|
this->start_ix_cat_counts,
|
379
391
|
this->prop_categ,
|
380
392
|
this->col_transf,
|
@@ -421,7 +433,69 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
|
|
421
433
|
size_t max_depth = 3, double max_perc_outliers = 0.01, size_t min_size_numeric = 25, size_t min_size_categ = 50,
|
422
434
|
double min_gain = 1e-2, bool gain_as_pct = false, bool follow_all = false, double z_norm = 2.67, double z_outlier = 8.0);
|
423
435
|
|
424
|
-
|
436
|
+
/*
 * Stack-like bookkeeping of columns that are flagged as 'exhausted'.
 *
 * Columns are flagged inside nested levels: 'push_branch' opens a level, 'push_col'
 * flags a column and charges it to the innermost open level, and 'pop_branch' closes
 * the innermost level, un-flagging every column that was charged to it.
 * NOTE(review): presumably a level corresponds to one tree branch during fitting and an
 * exhausted column is one that can no longer produce a split there - confirm against callers.
 */
class ExhaustedColumnTracker
{
public:
    std::vector<bool> is_exhausted;   /* is_exhausted[col] is true while 'col' is charged to some open level */
    std::vector<size_t> col_indices;  /* flagged columns, in the order in which they were pushed */
    std::vector<size_t> n_held;       /* one entry per open level: number of columns charged to it */

    /* Resets the tracker for 'ncols' columns and pre-allocates room for 'max_depth'+1 levels. */
    void initialize(size_t ncols, size_t max_depth)
    {
        this->is_exhausted.assign(ncols, false);
        this->n_held.clear();
        this->n_held.reserve(max_depth+1);
        this->col_indices.clear();
        this->col_indices.reserve(ncols);
    }

    /* Opens a new (innermost) level which holds no columns yet. */
    void push_branch()
    {
        this->n_held.push_back(0);
    }

    /*
     * Flags 'col' as exhausted and charges it to the innermost open level.
     * Preconditions: 'col' is within the size passed to 'initialize' and at least
     * one level is open (otherwise there is nothing to charge the column to).
     */
    void push_col(size_t col)
    {
        assert(col < this->is_exhausted.size());
        assert(!this->n_held.empty());
        this->is_exhausted[col] = true;
        this->col_indices.push_back(col);
        this->n_held.back() += 1;
    }

    /*
     * Closes the innermost level and un-flags the columns charged to it.
     * Does nothing when no level is open: this can be reached from a destructor,
     * so it must neither throw nor call 'back()' on an empty vector.
     */
    void pop_branch()
    {
        if (this->n_held.empty())
            return;

        /* the extra emptiness check keeps the two stacks from getting out of step */
        while (this->n_held.back() > 0 && !this->col_indices.empty())
        {
            this->is_exhausted[this->col_indices.back()] = false;
            this->col_indices.pop_back();
            this->n_held.back() -= 1;
        }

        this->n_held.pop_back();
    }
};
|
478
|
+
|
479
|
+
class ExhaustedColumnsLevel
|
480
|
+
{
|
481
|
+
public:
|
482
|
+
bool pop = false;
|
483
|
+
ExhaustedColumnTracker* tracker = nullptr;
|
484
|
+
ExhaustedColumnsLevel() = default;
|
485
|
+
void initialize(ExhaustedColumnTracker* tracker) {
|
486
|
+
this->pop = true;
|
487
|
+
this->tracker = tracker;
|
488
|
+
this->tracker->push_branch();
|
489
|
+
}
|
490
|
+
~ExhaustedColumnsLevel() {
|
491
|
+
if (this->pop) {
|
492
|
+
this->tracker->pop_branch();
|
493
|
+
this->pop = false;
|
494
|
+
}
|
495
|
+
}
|
496
|
+
};
|
497
|
+
|
498
|
+
struct Workspace {
|
425
499
|
|
426
500
|
std::vector<size_t> ix_arr; /* indices from the target column */
|
427
501
|
size_t st; /* chunk of the indices to take for current function calls */
|
@@ -460,7 +534,7 @@ typedef struct {
|
|
460
534
|
int *untransf_target_col; /* column as it was before forcibly binarizing (dynamic pointer) */
|
461
535
|
int *temp_ptr_x; /* dynamic pointer */
|
462
536
|
|
463
|
-
std::vector<char> buffer_subset_categ_best; /* categorical split that gave the best gain */
|
537
|
+
std::vector<signed char> buffer_subset_categ_best; /* categorical split that gave the best gain */
|
464
538
|
long double this_gain; /* buffer where to store gain */
|
465
539
|
double this_split_point; /* numeric split threshold */
|
466
540
|
int this_split_lev; /* ordinal split threshold */
|
@@ -477,8 +551,8 @@ typedef struct {
|
|
477
551
|
std::vector<size_t> buffer_crosstab; /* buffer arrays where to allocate values required by functions and not used outside them */
|
478
552
|
std::vector<size_t> buffer_cat_cnt; /* buffer arrays where to allocate values required by functions and not used outside them */
|
479
553
|
std::vector<size_t> buffer_cat_sorted; /* buffer arrays where to allocate values required by functions and not used outside them */
|
480
|
-
std::vector<char>
|
481
|
-
std::vector<char>
|
554
|
+
std::vector<signed char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
|
555
|
+
std::vector<signed char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
|
482
556
|
std::vector<long double> buffer_sd; /* used for a more numerically-stable two-pass gain calculation */
|
483
557
|
|
484
558
|
bool drop_cluster; /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
|
@@ -486,10 +560,14 @@ typedef struct {
|
|
486
560
|
bool target_col_is_ord; /* whether the target column is ordinal (rest is the same as in categoricals) */
|
487
561
|
int ncat_this; /* number of categories in the target column */
|
488
562
|
|
489
|
-
|
563
|
+
ExhaustedColumnTracker exhausted_col_tracker;
|
564
|
+
bool has_zero_variance;
|
565
|
+
bool is_binary_split;
|
566
|
+
bool best_cat_split_is_binary;
|
567
|
+
};
|
490
568
|
|
491
569
|
/* info holders to shorten function call arguments */
|
492
|
-
|
570
|
+
struct ModelParams {
|
493
571
|
bool categ_as_bin;
|
494
572
|
bool ord_as_bin;
|
495
573
|
bool cat_bruteforce_subset;
|
@@ -506,16 +584,16 @@ typedef struct {
|
|
506
584
|
double z_outlier;
|
507
585
|
double z_tail;
|
508
586
|
std::vector<long double> prop_small; /* this is not a parameter, but a shared array determined from the parameters and data */
|
509
|
-
}
|
587
|
+
};
|
510
588
|
|
511
589
|
/* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
|
512
|
-
|
590
|
+
struct InputData {
|
513
591
|
double *restrict numeric_data; size_t ncols_numeric;
|
514
592
|
int *restrict categorical_data; size_t ncols_categ; int *restrict ncat;
|
515
593
|
int *restrict ordinal_data; size_t ncols_ord; int *restrict ncat_ord;
|
516
594
|
size_t nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ;
|
517
595
|
std::vector<size_t> cat_counts;
|
518
|
-
}
|
596
|
+
};
|
519
597
|
|
520
598
|
|
521
599
|
void process_numeric_col(std::vector<Cluster> &cluster_root,
|
@@ -547,12 +625,12 @@ void recursive_split_categ(Workspace &workspace,
|
|
547
625
|
(This is the module from which
|
548
626
|
new data can be flagged as outliers)
|
549
627
|
********************************************/
|
550
|
-
|
628
|
+
struct PredictionData {
|
551
629
|
double *restrict numeric_data;
|
552
630
|
int *restrict categorical_data;
|
553
631
|
int *restrict ordinal_data;
|
554
632
|
size_t nrows;
|
555
|
-
}
|
633
|
+
};
|
556
634
|
|
557
635
|
bool find_new_outliers(double *restrict numeric_data,
|
558
636
|
int *restrict categorical_data,
|
@@ -570,19 +648,21 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
570
648
|
*********************************/
|
571
649
|
#define SD_REG 1e-5 /* Regularization for standard deviation estimation */
|
572
650
|
|
573
|
-
|
651
|
+
/* TODO: should make long doubles optional */
|
652
|
+
|
653
|
+
struct NumericBranch {
|
574
654
|
size_t cnt;
|
575
655
|
long double sum;
|
576
656
|
long double sum_sq;
|
577
|
-
}
|
657
|
+
};
|
578
658
|
|
579
|
-
|
659
|
+
struct NumericSplit {
|
580
660
|
NumericBranch NA_branch = {0, 0, 0};
|
581
661
|
NumericBranch left_branch = {0, 0, 0};
|
582
662
|
NumericBranch right_branch = {0, 0, 0};
|
583
|
-
}
|
663
|
+
};
|
584
664
|
|
585
|
-
|
665
|
+
struct CategSplit {
|
586
666
|
size_t *restrict NA_branch; /* array of counts of the target variable's categories */
|
587
667
|
size_t *restrict left_branch; /* array of counts of the target variable's categories */
|
588
668
|
size_t *restrict right_branch; /* array of counts of the target variable's categories */
|
@@ -591,11 +671,11 @@ typedef struct {
|
|
591
671
|
size_t size_NA = 0;
|
592
672
|
size_t size_left = 0;
|
593
673
|
size_t size_right = 0;
|
594
|
-
}
|
674
|
+
};
|
595
675
|
|
596
|
-
void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot,
|
676
|
+
void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, signed char onehot[]);
|
597
677
|
size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x);
|
598
|
-
void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
|
678
|
+
void flag_zero_counts(signed char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
|
599
679
|
long double calc_sd(size_t cnt, long double sum, long double sum_sq);
|
600
680
|
long double calc_sd(NumericBranch &branch);
|
601
681
|
long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean);
|
@@ -610,23 +690,25 @@ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size
|
|
610
690
|
size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
|
611
691
|
void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
|
612
692
|
long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
|
613
|
-
long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
|
693
|
+
long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
|
614
694
|
void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
|
615
695
|
bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
|
616
696
|
long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
|
617
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point);
|
697
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset, int *restrict split_point, bool *restrict has_zero_variance, bool *restrict binary_split);
|
618
698
|
void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
|
619
699
|
size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
|
620
700
|
bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
|
621
|
-
size_t *restrict split_left, size_t *restrict split_NA);
|
701
|
+
size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
|
622
702
|
void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
623
703
|
size_t ncat_y, size_t ncat_x, long double base_info,
|
624
704
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
|
625
|
-
bool has_na, size_t min_size, long double *gain, int *split_point
|
705
|
+
bool has_na, size_t min_size, long double *gain, int *split_point,
|
706
|
+
bool *restrict has_zero_variance, bool *restrict binary_split);
|
626
707
|
void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
627
708
|
size_t ncat_x, long double base_info,
|
628
709
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
|
629
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset
|
710
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
|
711
|
+
bool *restrict has_zero_variance, bool *restrict binary_split);
|
630
712
|
void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
631
713
|
size_t ncat_x, size_t ncat_y, long double base_info,
|
632
714
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
|
@@ -634,7 +716,8 @@ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end
|
|
634
716
|
void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
635
717
|
size_t ncat_x, size_t ncat_y, long double base_info,
|
636
718
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
|
637
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset
|
719
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
|
720
|
+
bool *restrict has_zero_variance, bool *restrict binary_split);
|
638
721
|
|
639
722
|
|
640
723
|
|
@@ -642,8 +725,8 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
|
|
642
725
|
Prototypes from clusters.cpp
|
643
726
|
************************************/
|
644
727
|
/* n*perc + 2*sqrt(n*perc*(1-perc)) + 1, in long double precision.
   Every macro argument is parenthesized so that expression arguments (e.g. 'a + b') expand correctly. */
#define calculate_max_outliers(n, perc) ( (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - (perc)) ) + (long double)1 )
|
645
|
-
#define z_score(x, mu, sd) ( ((x) - (mu)) / (sd) )
|
646
|
-
#define chebyshyov_bound(
|
728
|
+
#define z_score(x, mu, sd) ( ((x) - (mu)) / std::max((sd), 1e-12) )
|
729
|
+
#define chebyshyov_bound(zval) (1.0 / std::max(square(zval), 1.))
|
647
730
|
|
648
731
|
bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
|
649
732
|
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
@@ -654,7 +737,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
654
737
|
void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
|
655
738
|
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
656
739
|
size_t *restrict outlier_depth, Cluster &cluster,
|
657
|
-
size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
|
740
|
+
size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm);
|
658
741
|
bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
|
659
742
|
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
660
743
|
size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
|
@@ -662,7 +745,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
|
|
662
745
|
double max_perc_outliers, double z_norm, double z_outlier,
|
663
746
|
long double *restrict perc_threshold, long double *restrict prop_prior,
|
664
747
|
size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
|
665
|
-
size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
|
748
|
+
size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
|
666
749
|
bool *restrict drop_cluster);
|
667
750
|
void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]);
|
668
751
|
void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]);
|
@@ -680,12 +763,12 @@ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size
|
|
680
763
|
/* 1 + n*perc/z_norm, in long double precision; 'z_norm' is parenthesized so that an
   expression argument divides as a whole. */
#define calculate_max_cat_outliers(n, perc, z_norm) ((long double)1 + ((n) * (perc) / (z_norm))) /* Note: this is not anyhow probabilistic, nor based on provable bounds */
|
681
764
|
void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
682
765
|
long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
|
683
|
-
double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
|
766
|
+
double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
|
684
767
|
void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
|
685
|
-
long double prior_prob[], double z_outlier, char is_outlier[],
|
768
|
+
long double prior_prob[], double z_outlier, signed char is_outlier[],
|
686
769
|
bool *found_outliers, bool *new_is_outlier, int *categ_maj);
|
687
770
|
bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
|
688
|
-
char is_outlier[], double *next_most_comm);
|
771
|
+
signed char is_outlier[], double *next_most_comm);
|
689
772
|
|
690
773
|
|
691
774
|
|
@@ -699,7 +782,7 @@ typedef struct {
|
|
699
782
|
double gain_best_restore;
|
700
783
|
double split_point_restore;
|
701
784
|
int split_lev_restore;
|
702
|
-
std::vector<char> split_subset_restore;
|
785
|
+
std::vector<signed char> split_subset_restore;
|
703
786
|
size_t ix1_restore;
|
704
787
|
size_t ix2_restore;
|
705
788
|
size_t ix3_restore;
|
@@ -709,12 +792,13 @@ typedef struct {
|
|
709
792
|
ColType col_type_best_rememer;
|
710
793
|
double split_point_best_restore;
|
711
794
|
int split_lev_best_restore;
|
712
|
-
std::vector<char> split_subset_best_restore;
|
795
|
+
std::vector<signed char> split_subset_best_restore;
|
713
796
|
long double base_info_restore;
|
714
797
|
long double base_info_orig_restore;
|
715
798
|
double sd_y_restore;
|
716
799
|
bool has_outliers_restore;
|
717
800
|
bool lev_has_outliers_restore;
|
801
|
+
bool is_binary_split_restore;
|
718
802
|
} RecursionState;
|
719
803
|
|
720
804
|
|
@@ -739,14 +823,14 @@ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t s
|
|
739
823
|
size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA);
|
740
824
|
size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end);
|
741
825
|
void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right);
|
742
|
-
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
|
826
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
|
743
827
|
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right);
|
744
828
|
bool check_workspace_is_allocated(Workspace &workspace);
|
745
829
|
void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
|
746
830
|
void backup_recursion_state(Workspace &workspace, RecursionState &state_backup);
|
747
831
|
void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
|
748
832
|
void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
|
749
|
-
void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col);
|
833
|
+
void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col);
|
750
834
|
void set_tree_as_categorical(ClusterTree &tree, size_t col);
|
751
835
|
void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat);
|
752
836
|
void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
|
@@ -756,3 +840,27 @@ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nth
|
|
756
840
|
void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
|
757
841
|
int decimals_diff(double val1, double val2);
|
758
842
|
void dealloc_ModelOutputs(ModelOutputs &model_outputs);
|
843
|
+
ModelOutputs get_empty_ModelOutputs();
|
844
|
+
bool get_has_openmp();
|
845
|
+
|
846
|
+
extern bool interrupt_switch;
|
847
|
+
extern bool handle_is_locked;
|
848
|
+
void set_interrup_global_variable(int s);
|
849
|
+
class SignalSwitcher
|
850
|
+
{
|
851
|
+
public:
|
852
|
+
sig_t_ old_sig;
|
853
|
+
bool is_active;
|
854
|
+
SignalSwitcher();
|
855
|
+
~SignalSwitcher();
|
856
|
+
void restore_handle();
|
857
|
+
};
|
858
|
+
void check_interrupt_switch(SignalSwitcher &ss);
|
859
|
+
#ifdef _FOR_PYTHON
|
860
|
+
bool cy_check_interrupt_switch();
|
861
|
+
void cy_tick_off_interrupt_switch();
|
862
|
+
#endif
|
863
|
+
size_t log2ceil(size_t v);
|
864
|
+
#ifdef _FOR_PYTHON
|
865
|
+
ModelOutputs deepcopy(const ModelOutputs &inp);
|
866
|
+
#endif
|
@@ -226,6 +226,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
|
|
226
226
|
true : found_outliers;
|
227
227
|
break;
|
228
228
|
}
|
229
|
+
|
230
|
+
default:
|
231
|
+
{
|
232
|
+
assert(0);
|
233
|
+
}
|
229
234
|
}
|
230
235
|
break;
|
231
236
|
}
|
@@ -279,6 +284,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
|
|
279
284
|
true : found_outliers;
|
280
285
|
break;
|
281
286
|
}
|
287
|
+
|
288
|
+
default:
|
289
|
+
{
|
290
|
+
assert(0);
|
291
|
+
}
|
282
292
|
}
|
283
293
|
break;
|
284
294
|
}
|
@@ -332,10 +342,16 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
|
|
332
342
|
true : found_outliers;
|
333
343
|
break;
|
334
344
|
}
|
345
|
+
|
346
|
+
default:
|
347
|
+
{
|
348
|
+
assert(0);
|
349
|
+
}
|
335
350
|
}
|
336
351
|
break;
|
337
352
|
}
|
338
353
|
|
354
|
+
default: {}
|
339
355
|
}
|
340
356
|
}
|
341
357
|
}
|
@@ -548,6 +564,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
548
564
|
if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
|
549
565
|
break;
|
550
566
|
}
|
567
|
+
|
568
|
+
default:
|
569
|
+
{
|
570
|
+
assert(0);
|
571
|
+
}
|
551
572
|
}
|
552
573
|
break;
|
553
574
|
}
|
@@ -587,6 +608,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
587
608
|
break;
|
588
609
|
}
|
589
610
|
|
611
|
+
default:
|
612
|
+
{
|
613
|
+
assert(0);
|
614
|
+
}
|
615
|
+
|
590
616
|
/* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
|
591
617
|
}
|
592
618
|
break;
|
@@ -626,6 +652,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
626
652
|
if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
|
627
653
|
break;
|
628
654
|
}
|
655
|
+
|
656
|
+
default:
|
657
|
+
{
|
658
|
+
assert(0);
|
659
|
+
}
|
629
660
|
}
|
630
661
|
break;
|
631
662
|
}
|
@@ -645,6 +676,8 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
|
|
645
676
|
model_outputs.all_clusters[col][cl].cluster_sd
|
646
677
|
)
|
647
678
|
);
|
679
|
+
if (is_na_or_inf(outlier_score))
|
680
|
+
outlier_score = 1. - 1e-15;
|
648
681
|
} else {
|
649
682
|
outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
|
650
683
|
}
|