outliertree 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -41,23 +41,34 @@
41
41
  #include <algorithm>
42
42
  #include <numeric>
43
43
  #include <unordered_set>
44
+ #include <exception>
45
+ #include <stdexcept>
46
+ #include <cassert>
44
47
  #include <math.h>
45
48
  #include <cmath>
46
49
  #include <stddef.h>
47
50
  #include <limits.h>
51
+ #include <limits>
48
52
  #include <stdlib.h>
49
53
  #include <stddef.h>
50
54
  #include <string.h>
55
+ #include <stdint.h>
51
56
  #ifdef _OPENMP
52
57
  #include <omp.h>
53
58
  #endif
59
+ #ifdef _FOR_R
60
+ #include <Rcpp.h>
61
+ #endif
62
+ #include <signal.h>
63
+ typedef void (*sig_t_)(int);
64
+
54
65
 
55
66
  /************************
56
67
  Short Functions
57
68
  *************************/
58
69
  #define extract_bit(number, bit) (((number) >> (bit)) & 1) /* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
59
70
  #define pow2(n) ( ((size_t) 1) << (n) ) /* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
60
- #define avg_between(a, b) (((a) + (b)) * 0.5)
71
+ #define avg_between(a, b) ((a) + 0.5*((b) - (a)))
61
72
  #define square(x) ((x) * (x))
62
73
  #ifndef isinf
63
74
  #define isinf std::isinf
@@ -68,7 +79,7 @@
68
79
  #define is_na_or_inf(x) (isnan(x) || isinf(x))
69
80
 
70
81
  /* Aliasing for compiler optimizations */
71
- #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
82
+ #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
72
83
  #define restrict __restrict
73
84
  #else
74
85
  #define restrict
@@ -89,6 +100,8 @@
89
100
  #define omp_get_thread_num() 0
90
101
  #endif
91
102
 
103
+ #define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
104
+
92
105
 
93
106
  /****************************************************************
94
107
  Data types and structs that are returned from this module
@@ -103,6 +116,8 @@ typedef enum SplitType {
103
116
  } SplitType;
104
117
  typedef enum ColTransf {NoTransf, Log, Exp} ColTransf; /* transformation to apply to numeric column */
105
118
 
119
+ /* TODO: should write serializers for the model objects, but need to somehow deal with long double types */
120
+
106
121
  /*
107
122
  * 1-d clusters that define homogeneous groups in which observations can be outliers.
108
123
  * Note that these are associated to a tree and define one extra condition from what
@@ -114,7 +129,7 @@ typedef struct Cluster {
114
129
  size_t col_num = 0; /* numer of the column by which its being split, the target column is given by index of the cluster vector */
115
130
  SplitType split_type = Root;
116
131
  double split_point = HUGE_VAL; /* numerical */
117
- std::vector<char> split_subset = std::vector<char>(); /* categorical */
132
+ std::vector<signed char> split_subset; /* categorical */
118
133
  int split_lev = INT_MAX; /* ordinal */
119
134
  bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
120
135
 
@@ -127,7 +142,7 @@ typedef struct Cluster {
127
142
  double display_lim_high = -HUGE_VAL; /* numerical target column */
128
143
  double display_mean = -HUGE_VAL; /* numerical target column */
129
144
  double display_sd = -HUGE_VAL; /* numerical target column */
130
- std::vector<char> subset_common = std::vector<char>(); /* categorical or ordinal target column (=0 is common) */
145
+ std::vector<signed char> subset_common; /* categorical or ordinal target column (=0 is common) */
131
146
  double perc_in_subset = HUGE_VAL; /* categorical or ordinal target column */
132
147
  double perc_next_most_comm = -HUGE_VAL; /* categorical or ordinal target column */ /* TODO */
133
148
  int categ_maj = -1; /* when using majority-criterion for categorical outliers */
@@ -156,7 +171,7 @@ typedef struct Cluster {
156
171
  }
157
172
 
158
173
  /* categorical split */
159
- Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
174
+ Cluster(ColType column_type, size_t col_num, SplitType split_type, signed char *split_subset, int ncat, bool has_NA_branch = false)
160
175
  {
161
176
  this->column_type = column_type;
162
177
  this->col_num = col_num;
@@ -245,21 +260,21 @@ typedef struct Cluster {
245
260
  typedef struct ClusterTree {
246
261
  size_t parent = 0; /* index in a vector */
247
262
  SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
248
- std::vector<size_t> clusters = std::vector<size_t>(); /* these clusters define additional splits */
263
+ std::vector<size_t> clusters; /* these clusters define additional splits */
249
264
 
250
265
  SplitType split_this_branch = Root; /* when using 'follow_all' */
251
- std::vector<size_t> all_branches = std::vector<size_t>(); /* when using 'follow_all' */
266
+ std::vector<size_t> all_branches; /* when using 'follow_all' */
252
267
 
253
268
  ColType column_type = NoType;
254
269
  size_t col_num = 0;
255
270
  double split_point = HUGE_VAL;
256
- std::vector<char> split_subset = std::vector<char>();
271
+ std::vector<signed char> split_subset;
257
272
  int split_lev = INT_MAX;
258
273
 
259
274
  size_t tree_NA = 0; /* binary splits */
260
275
  size_t tree_left = 0; /* binary splits */
261
276
  size_t tree_right = 0; /* binary splits */
262
- std::vector<size_t> binary_branches = std::vector<size_t>(); /* multiple splits (single category or binarized categories) */
277
+ std::vector<size_t> binary_branches; /* multiple splits (single category or binarized categories) */
263
278
 
264
279
  ClusterTree(size_t parent, SplitType parent_branch)
265
280
  {
@@ -286,7 +301,7 @@ typedef struct ClusterTree {
286
301
  this->split_lev = split_lev;
287
302
  }
288
303
 
289
- ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
304
+ ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, signed char *split_subset, int ncat)
290
305
  {
291
306
  this->parent = parent;
292
307
  this->col_num = col_num;
@@ -336,6 +351,8 @@ typedef struct ClusterTree {
336
351
 
337
352
  } ClusterTree;
338
353
 
354
+ /* TODO: should separate the results from the actual model object */
355
+
339
356
  /* these are needed for prediction time, and are thus returned from the function that fits the model */
340
357
  typedef struct ModelOutputs {
341
358
  std::vector< std::vector<ClusterTree> > all_trees; /* clusters in which observations can be outliers, required for prediction time */
@@ -370,11 +387,6 @@ typedef struct ModelOutputs {
370
387
  archive(
371
388
  this->all_trees,
372
389
  this->all_clusters,
373
- this->outlier_scores_final,
374
- this->outlier_clusters_final,
375
- this->outlier_columns_final,
376
- this->outlier_trees_final,
377
- this->outlier_depth_final,
378
390
  this->start_ix_cat_counts,
379
391
  this->prop_categ,
380
392
  this->col_transf,
@@ -421,7 +433,69 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
421
433
  size_t max_depth = 3, double max_perc_outliers = 0.01, size_t min_size_numeric = 25, size_t min_size_categ = 50,
422
434
  double min_gain = 1e-2, bool gain_as_pct = false, bool follow_all = false, double z_norm = 2.67, double z_outlier = 8.0);
423
435
 
424
- typedef struct {
436
/*
* Keeps track of columns that have been flagged as "exhausted", grouped by
* nested branches so that everything flagged inside a branch can be un-flagged
* in one go when that branch is closed.
*
* - 'push_branch' opens a new (innermost) branch.
* - 'push_col' flags a column and charges it to the innermost open branch.
* - 'pop_branch' un-flags every column charged to the innermost open branch
*   and closes it.
*
* Calling 'push_col' or 'pop_branch' while no branch is open used to read
* 'n_held.back()' on an empty vector (undefined behaviour); both are now
* well-defined (see the notes on each method).
*/
class ExhaustedColumnTracker
{
public:
    std::vector<bool> is_exhausted;  /* is_exhausted[col] is true while 'col' is flagged */
    std::vector<size_t> col_indices; /* flagged columns owned by open branches, in push order */
    std::vector<size_t> n_held;      /* one counter per open branch: how many trailing entries of 'col_indices' it owns */

    /* Resets the tracker for 'ncols' columns (none flagged, no open branch)
       and reserves space for 'max_depth'+1 nested branches. */
    void initialize(size_t ncols, size_t max_depth)
    {
        this->is_exhausted.assign(ncols, false);
        this->n_held.clear();
        this->n_held.reserve(max_depth+1);
        this->col_indices.clear();
        this->col_indices.reserve(ncols);
    }

    /* Opens a new innermost branch that owns no columns yet. */
    void push_branch()
    {
        this->n_held.push_back(0);
    }

    /* Flags 'col' (must be < number of columns passed to 'initialize').
       If no branch is open there is nothing that could release the column
       later, so it is only flagged and stays flagged until 'initialize'. */
    void push_col(size_t col)
    {
        this->is_exhausted[col] = true;
        if (this->n_held.empty())
            return;
        this->col_indices.push_back(col);
        this->n_held.back() += 1;
    }

    /* Un-flags every column owned by the innermost open branch and closes it.
       Does nothing when no branch is open. */
    void pop_branch()
    {
        if (this->n_held.empty())
            return;

        for (size_t remaining = this->n_held.back(); remaining > 0; remaining--)
        {
            this->is_exhausted[this->col_indices.back()] = false;
            this->col_indices.pop_back();
        }

        this->n_held.pop_back();
    }
};
478
+
479
+ class ExhaustedColumnsLevel
480
+ {
481
+ public:
482
+ bool pop = false;
483
+ ExhaustedColumnTracker* tracker = nullptr;
484
+ ExhaustedColumnsLevel() = default;
485
+ void initialize(ExhaustedColumnTracker* tracker) {
486
+ this->pop = true;
487
+ this->tracker = tracker;
488
+ this->tracker->push_branch();
489
+ }
490
+ ~ExhaustedColumnsLevel() {
491
+ if (this->pop) {
492
+ this->tracker->pop_branch();
493
+ this->pop = false;
494
+ }
495
+ }
496
+ };
497
+
498
+ struct Workspace {
425
499
 
426
500
  std::vector<size_t> ix_arr; /* indices from the target column */
427
501
  size_t st; /* chunk of the indices to take for current function calls */
@@ -460,7 +534,7 @@ typedef struct {
460
534
  int *untransf_target_col; /* column as it was before forcibly binarizing (dynamic pointer) */
461
535
  int *temp_ptr_x; /* dynamic pointer */
462
536
 
463
- std::vector<char> buffer_subset_categ_best; /* categorical split that gave the best gain */
537
+ std::vector<signed char> buffer_subset_categ_best; /* categorical split that gave the best gain */
464
538
  long double this_gain; /* buffer where to store gain */
465
539
  double this_split_point; /* numeric split threshold */
466
540
  int this_split_lev; /* ordinal split threshold */
@@ -477,8 +551,8 @@ typedef struct {
477
551
  std::vector<size_t> buffer_crosstab; /* buffer arrays where to allocate values required by functions and not used outside them */
478
552
  std::vector<size_t> buffer_cat_cnt; /* buffer arrays where to allocate values required by functions and not used outside them */
479
553
  std::vector<size_t> buffer_cat_sorted; /* buffer arrays where to allocate values required by functions and not used outside them */
480
- std::vector<char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
481
- std::vector<char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
554
+ std::vector<signed char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
555
+ std::vector<signed char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
482
556
  std::vector<long double> buffer_sd; /* used for a more numerically-stable two-pass gain calculation */
483
557
 
484
558
  bool drop_cluster; /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
@@ -486,10 +560,14 @@ typedef struct {
486
560
  bool target_col_is_ord; /* whether the target column is ordinal (rest is the same as in categoricals) */
487
561
  int ncat_this; /* number of categories in the target column */
488
562
 
489
- } Workspace;
563
+ ExhaustedColumnTracker exhausted_col_tracker;
564
+ bool has_zero_variance;
565
+ bool is_binary_split;
566
+ bool best_cat_split_is_binary;
567
+ };
490
568
 
491
569
  /* info holders to shorten function call arguments */
492
- typedef struct {
570
+ struct ModelParams {
493
571
  bool categ_as_bin;
494
572
  bool ord_as_bin;
495
573
  bool cat_bruteforce_subset;
@@ -506,16 +584,16 @@ typedef struct {
506
584
  double z_outlier;
507
585
  double z_tail;
508
586
  std::vector<long double> prop_small; /* this is not a parameter, but a shared array determined from the parameters and data */
509
- } ModelParams;
587
+ };
510
588
 
511
589
  /* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
512
- typedef struct {
590
+ struct InputData { /* inputs shared by the fitting routines: the pointers are supplied by the caller, the vectors are filled inside the function that fits the model */
513
591
  double *restrict numeric_data; size_t ncols_numeric; /* numeric values and number of numeric columns; NOTE(review): memory layout is not visible here - confirm */
514
592
  int *restrict categorical_data; size_t ncols_categ; int *restrict ncat; /* NOTE(review): 'ncat' presumably holds the number of categories of each categorical column - confirm */
515
593
  int *restrict ordinal_data; size_t ncols_ord; int *restrict ncat_ord; /* NOTE(review): 'ncat_ord' presumably plays the role of 'ncat' for ordinal columns - confirm */
516
594
  size_t nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ; /* NOTE(review): 'has_NA' / 'skip_col' look like per-column flags and 'tot_cols' like the sum of the three column counts - confirm */
517
595
  std::vector<size_t> cat_counts; /* NOTE(review): presumably category counts for the categorical/ordinal columns - confirm against the code that fills it */
518
- } InputData;
596
+ };
519
597
 
520
598
 
521
599
  void process_numeric_col(std::vector<Cluster> &cluster_root,
@@ -547,12 +625,12 @@ void recursive_split_categ(Workspace &workspace,
547
625
  (This is the module from which
548
626
  new data can be flagged as outliers)
549
627
  ********************************************/
550
- typedef struct {
628
+ struct PredictionData { /* non-owning bundle of the data pointers and row count used by the prediction module (the part that flags new data as outliers) */
551
629
  double *restrict numeric_data; /* 'restrict' expands to '__restrict' on supported compilers and to nothing otherwise */
552
630
  int *restrict categorical_data;
553
631
  int *restrict ordinal_data;
554
632
  size_t nrows; /* NOTE(review): presumably the number of rows in each of the arrays above - confirm */
555
- } PredictionData;
633
+ };
556
634
 
557
635
  bool find_new_outliers(double *restrict numeric_data,
558
636
  int *restrict categorical_data,
@@ -570,19 +648,21 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
570
648
  *********************************/
571
649
  #define SD_REG 1e-5 /* Regularization for standard deviation estimation */
572
650
 
573
- typedef struct {
651
+ /* TODO: should make long doubles optional */
652
+
653
+ struct NumericBranch { /* count / sum / sum-of-squares triple; an overload 'calc_sd(NumericBranch&)' is declared for it next to 'calc_sd(cnt, sum, sum_sq)' */
574
654
  size_t cnt; /* number of observations accumulated */
575
655
  long double sum; /* NOTE(review): presumably the running sum of the values - confirm in the splitting code */
576
656
  long double sum_sq; /* NOTE(review): presumably the running sum of squared values - confirm in the splitting code */
577
- } NumericBranch;
657
+ };
578
658
 
579
- typedef struct {
659
+ struct NumericSplit { /* one 'NumericBranch' accumulator per branch of a split (NA / left / right), each starting at zero */
580
660
  NumericBranch NA_branch = {0, 0, 0}; /* aggregate-initialized: cnt, sum and sum_sq all zero */
581
661
  NumericBranch left_branch = {0, 0, 0};
582
662
  NumericBranch right_branch = {0, 0, 0};
583
- } NumericSplit;
663
+ };
584
664
 
585
- typedef struct {
665
+ struct CategSplit {
586
666
  size_t *restrict NA_branch; /* array of counts of the target variable's categories */
587
667
  size_t *restrict left_branch; /* array of counts of the target variable's categories */
588
668
  size_t *restrict right_branch; /* array of counts of the target variable's categories */
@@ -591,11 +671,11 @@ typedef struct {
591
671
  size_t size_NA = 0;
592
672
  size_t size_left = 0;
593
673
  size_t size_right = 0;
594
- } CategSplit;
674
+ };
595
675
 
596
- void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, bool onehot[]);
676
+ void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, signed char onehot[]);
597
677
  size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x);
598
- void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
678
+ void flag_zero_counts(signed char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
599
679
  long double calc_sd(size_t cnt, long double sum, long double sum_sq);
600
680
  long double calc_sd(NumericBranch &branch);
601
681
  long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean);
@@ -610,23 +690,25 @@ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size
610
690
  size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
611
691
  void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
612
692
  long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
613
- long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
693
+ long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
614
694
  void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
615
695
  bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
616
696
  long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
617
- bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point);
697
+ bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset, int *restrict split_point, bool *restrict has_zero_variance, bool *restrict binary_split);
618
698
  void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
619
699
  size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
620
700
  bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
621
- size_t *restrict split_left, size_t *restrict split_NA);
701
+ size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
622
702
  void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
623
703
  size_t ncat_y, size_t ncat_x, long double base_info,
624
704
  size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
625
- bool has_na, size_t min_size, long double *gain, int *split_point);
705
+ bool has_na, size_t min_size, long double *gain, int *split_point,
706
+ bool *restrict has_zero_variance, bool *restrict binary_split);
626
707
  void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
627
708
  size_t ncat_x, long double base_info,
628
709
  size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
629
- bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
710
+ bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
711
+ bool *restrict has_zero_variance, bool *restrict binary_split);
630
712
  void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
631
713
  size_t ncat_x, size_t ncat_y, long double base_info,
632
714
  size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
@@ -634,7 +716,8 @@ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end
634
716
  void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
635
717
  size_t ncat_x, size_t ncat_y, long double base_info,
636
718
  size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
637
- bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
719
+ bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
720
+ bool *restrict has_zero_variance, bool *restrict binary_split);
638
721
 
639
722
 
640
723
 
@@ -642,8 +725,8 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
642
725
  Prototypes from clusters.cpp
643
726
  ************************************/
644
727
  #define calculate_max_outliers(n, perc) ( (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - perc) ) + (long double)1 )
645
- #define z_score(x, mu, sd) ( ((x) - (mu)) / (sd) )
646
- #define chebyshyov_bound(sd) (1.0 / square(sd))
728
+ #define z_score(x, mu, sd) ( ((x) - (mu)) / std::max((sd), 1e-12) )
729
+ #define chebyshyov_bound(zval) (1.0 / std::max(square(zval), 1.))
647
730
 
648
731
  bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
649
732
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
@@ -654,7 +737,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
654
737
  void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
655
738
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
656
739
  size_t *restrict outlier_depth, Cluster &cluster,
657
- size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
740
+ size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm);
658
741
  bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
659
742
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
660
743
  size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
@@ -662,7 +745,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
662
745
  double max_perc_outliers, double z_norm, double z_outlier,
663
746
  long double *restrict perc_threshold, long double *restrict prop_prior,
664
747
  size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
665
- size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
748
+ size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
666
749
  bool *restrict drop_cluster);
667
750
  void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]);
668
751
  void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]);
@@ -680,12 +763,12 @@ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size
680
763
  #define calculate_max_cat_outliers(n, perc, z_norm) ((long double)1 + ((n) * (perc) / z_norm)) /* Note: this is not anyhow probabilistic, nor based on provable bounds */
681
764
  void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
682
765
  long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
683
- double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
766
+ double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
684
767
  void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
685
- long double prior_prob[], double z_outlier, char is_outlier[],
768
+ long double prior_prob[], double z_outlier, signed char is_outlier[],
686
769
  bool *found_outliers, bool *new_is_outlier, int *categ_maj);
687
770
  bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
688
- char is_outlier[], double *next_most_comm);
771
+ signed char is_outlier[], double *next_most_comm);
689
772
 
690
773
 
691
774
 
@@ -699,7 +782,7 @@ typedef struct {
699
782
  double gain_best_restore;
700
783
  double split_point_restore;
701
784
  int split_lev_restore;
702
- std::vector<char> split_subset_restore;
785
+ std::vector<signed char> split_subset_restore;
703
786
  size_t ix1_restore;
704
787
  size_t ix2_restore;
705
788
  size_t ix3_restore;
@@ -709,12 +792,13 @@ typedef struct {
709
792
  ColType col_type_best_rememer;
710
793
  double split_point_best_restore;
711
794
  int split_lev_best_restore;
712
- std::vector<char> split_subset_best_restore;
795
+ std::vector<signed char> split_subset_best_restore;
713
796
  long double base_info_restore;
714
797
  long double base_info_orig_restore;
715
798
  double sd_y_restore;
716
799
  bool has_outliers_restore;
717
800
  bool lev_has_outliers_restore;
801
+ bool is_binary_split_restore;
718
802
  } RecursionState;
719
803
 
720
804
 
@@ -739,14 +823,14 @@ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t s
739
823
  size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA);
740
824
  size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end);
741
825
  void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right);
742
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
826
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
743
827
  void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right);
744
828
  bool check_workspace_is_allocated(Workspace &workspace);
745
829
  void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
746
830
  void backup_recursion_state(Workspace &workspace, RecursionState &state_backup);
747
831
  void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
748
832
  void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
749
- void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col);
833
+ void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col);
750
834
  void set_tree_as_categorical(ClusterTree &tree, size_t col);
751
835
  void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat);
752
836
  void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
@@ -756,3 +840,27 @@ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nth
756
840
  void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
757
841
  int decimals_diff(double val1, double val2);
758
842
  void dealloc_ModelOutputs(ModelOutputs &model_outputs);
843
+ ModelOutputs get_empty_ModelOutputs();
844
+ bool get_has_openmp();
845
+
846
+ extern bool interrupt_switch;
847
+ extern bool handle_is_locked;
848
+ void set_interrup_global_variable(int s);
849
+ class SignalSwitcher /* declaration only: constructor, destructor and 'restore_handle' are defined outside this header */
850
+ {
851
+ public:
852
+ sig_t_ old_sig; /* 'sig_t_' is a pointer to a function taking int and returning void, i.e. a signal-handler pointer */
853
+ bool is_active; /* NOTE(review): presumably true while this object has its own handler installed - confirm in the implementation */
854
+ SignalSwitcher();
855
+ ~SignalSwitcher();
856
+ void restore_handle(); /* NOTE(review): name suggests it re-installs 'old_sig'; the body is not visible here - confirm */
857
+ };
858
+ void check_interrupt_switch(SignalSwitcher &ss);
859
+ #ifdef _FOR_PYTHON
860
+ bool cy_check_interrupt_switch();
861
+ void cy_tick_off_interrupt_switch();
862
+ #endif
863
+ size_t log2ceil(size_t v);
864
+ #ifdef _FOR_PYTHON
865
+ ModelOutputs deepcopy(const ModelOutputs &inp);
866
+ #endif
@@ -0,0 +1,3 @@
1
+ LIBRARY outliertree.dll
2
+ EXPORTS
3
+ R_init_outliertree
@@ -226,6 +226,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
226
226
  true : found_outliers;
227
227
  break;
228
228
  }
229
+
230
+ default:
231
+ {
232
+ assert(0);
233
+ }
229
234
  }
230
235
  break;
231
236
  }
@@ -279,6 +284,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
279
284
  true : found_outliers;
280
285
  break;
281
286
  }
287
+
288
+ default:
289
+ {
290
+ assert(0);
291
+ }
282
292
  }
283
293
  break;
284
294
  }
@@ -332,10 +342,16 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
332
342
  true : found_outliers;
333
343
  break;
334
344
  }
345
+
346
+ default:
347
+ {
348
+ assert(0);
349
+ }
335
350
  }
336
351
  break;
337
352
  }
338
353
 
354
+ default: {}
339
355
  }
340
356
  }
341
357
  }
@@ -548,6 +564,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
548
564
  if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
549
565
  break;
550
566
  }
567
+
568
+ default:
569
+ {
570
+ assert(0);
571
+ }
551
572
  }
552
573
  break;
553
574
  }
@@ -587,6 +608,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
587
608
  break;
588
609
  }
589
610
 
611
+ default:
612
+ {
613
+ assert(0);
614
+ }
615
+
590
616
  /* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
591
617
  }
592
618
  break;
@@ -626,6 +652,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
626
652
  if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
627
653
  break;
628
654
  }
655
+
656
+ default:
657
+ {
658
+ assert(0);
659
+ }
629
660
  }
630
661
  break;
631
662
  }
@@ -645,6 +676,8 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
645
676
  model_outputs.all_clusters[col][cl].cluster_sd
646
677
  )
647
678
  );
679
+ if (is_na_or_inf(outlier_score))
680
+ outlier_score = 1. - 1e-15;
648
681
  } else {
649
682
  outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
650
683
  }