outliertree 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -41,23 +41,34 @@
41
41
  #include <algorithm>
42
42
  #include <numeric>
43
43
  #include <unordered_set>
44
+ #include <exception>
45
+ #include <stdexcept>
46
+ #include <cassert>
44
47
  #include <math.h>
45
48
  #include <cmath>
46
49
  #include <stddef.h>
47
50
  #include <limits.h>
51
+ #include <limits>
48
52
  #include <stdlib.h>
49
53
  #include <stddef.h>
50
54
  #include <string.h>
55
+ #include <stdint.h>
51
56
  #ifdef _OPENMP
52
57
  #include <omp.h>
53
58
  #endif
59
+ #ifdef _FOR_R
60
+ #include <Rcpp.h>
61
+ #endif
62
+ #include <signal.h>
63
+ typedef void (*sig_t_)(int);
64
+
54
65
 
55
66
  /************************
56
67
  Short Functions
57
68
  *************************/
58
69
  #define extract_bit(number, bit) (((number) >> (bit)) & 1) /* https://stackoverflow.com/questions/2249731/how-do-i-get-bit-by-bit-data-from-an-integer-value-in-c */
59
70
  #define pow2(n) ( ((size_t) 1) << (n) ) /* https://stackoverflow.com/questions/101439/the-most-efficient-way-to-implement-an-integer-based-power-function-powint-int */
60
- #define avg_between(a, b) (((a) + (b)) * 0.5)
71
+ #define avg_between(a, b) ((a) + 0.5*((b) - (a)))
61
72
  #define square(x) ((x) * (x))
62
73
  #ifndef isinf
63
74
  #define isinf std::isinf
@@ -68,7 +79,7 @@
68
79
  #define is_na_or_inf(x) (isnan(x) || isinf(x))
69
80
 
70
81
  /* Aliasing for compiler optimizations */
71
- #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)
82
+ #if defined(__GNUG__) || defined(__GNUC__) || defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER) || defined(__IBMCPP__) || defined(__ibmxl__) || defined(SUPPORTS_RESTRICT)
72
83
  #define restrict __restrict
73
84
  #else
74
85
  #define restrict
@@ -89,6 +100,8 @@
89
100
  #define omp_get_thread_num() 0
90
101
  #endif
91
102
 
103
+ #define unexpected_error() throw std::runtime_error("Unexpected error. Please open an issue in GitHub.\n")
104
+
92
105
 
93
106
  /****************************************************************
94
107
  Data types and structs that are returned from this module
@@ -103,6 +116,8 @@ typedef enum SplitType {
103
116
  } SplitType;
104
117
  typedef enum ColTransf {NoTransf, Log, Exp} ColTransf; /* transformation to apply to numeric column */
105
118
 
119
+ /* TODO: should write serializers for the model objects, but need to somehow deal with long double types */
120
+
106
121
  /*
107
122
  * 1-d clusters that define homogeneous groups in which observations can be outliers.
108
123
  * Note that these are associated to a tree and define one extra condition from what
@@ -114,7 +129,7 @@ typedef struct Cluster {
114
129
  size_t col_num = 0; /* numer of the column by which its being split, the target column is given by index of the cluster vector */
115
130
  SplitType split_type = Root;
116
131
  double split_point = HUGE_VAL; /* numerical */
117
- std::vector<char> split_subset = std::vector<char>(); /* categorical */
132
+ std::vector<signed char> split_subset; /* categorical */
118
133
  int split_lev = INT_MAX; /* ordinal */
119
134
  bool has_NA_branch = false; /* this is in order to determine the best outlier cluster when it fits under more than 1 */
120
135
 
@@ -127,7 +142,7 @@ typedef struct Cluster {
127
142
  double display_lim_high = -HUGE_VAL; /* numerical target column */
128
143
  double display_mean = -HUGE_VAL; /* numerical target column */
129
144
  double display_sd = -HUGE_VAL; /* numerical target column */
130
- std::vector<char> subset_common = std::vector<char>(); /* categorical or ordinal target column (=0 is common) */
145
+ std::vector<signed char> subset_common; /* categorical or ordinal target column (=0 is common) */
131
146
  double perc_in_subset = HUGE_VAL; /* categorical or ordinal target column */
132
147
  double perc_next_most_comm = -HUGE_VAL; /* categorical or ordinal target column */ /* TODO */
133
148
  int categ_maj = -1; /* when using majority-criterion for categorical outliers */
@@ -156,7 +171,7 @@ typedef struct Cluster {
156
171
  }
157
172
 
158
173
  /* categorical split */
159
- Cluster(ColType column_type, size_t col_num, SplitType split_type, char *split_subset, int ncat, bool has_NA_branch = false)
174
+ Cluster(ColType column_type, size_t col_num, SplitType split_type, signed char *split_subset, int ncat, bool has_NA_branch = false)
160
175
  {
161
176
  this->column_type = column_type;
162
177
  this->col_num = col_num;
@@ -245,21 +260,21 @@ typedef struct Cluster {
245
260
  typedef struct ClusterTree {
246
261
  size_t parent = 0; /* index in a vector */
247
262
  SplitType parent_branch = Root; /* this tree follows this branch in the split given by its parent */
248
- std::vector<size_t> clusters = std::vector<size_t>(); /* these clusters define additional splits */
263
+ std::vector<size_t> clusters; /* these clusters define additional splits */
249
264
 
250
265
  SplitType split_this_branch = Root; /* when using 'follow_all' */
251
- std::vector<size_t> all_branches = std::vector<size_t>(); /* when using 'follow_all' */
266
+ std::vector<size_t> all_branches; /* when using 'follow_all' */
252
267
 
253
268
  ColType column_type = NoType;
254
269
  size_t col_num = 0;
255
270
  double split_point = HUGE_VAL;
256
- std::vector<char> split_subset = std::vector<char>();
271
+ std::vector<signed char> split_subset;
257
272
  int split_lev = INT_MAX;
258
273
 
259
274
  size_t tree_NA = 0; /* binary splits */
260
275
  size_t tree_left = 0; /* binary splits */
261
276
  size_t tree_right = 0; /* binary splits */
262
- std::vector<size_t> binary_branches = std::vector<size_t>(); /* multiple splits (single category or binarized categories) */
277
+ std::vector<size_t> binary_branches; /* multiple splits (single category or binarized categories) */
263
278
 
264
279
  ClusterTree(size_t parent, SplitType parent_branch)
265
280
  {
@@ -286,7 +301,7 @@ typedef struct ClusterTree {
286
301
  this->split_lev = split_lev;
287
302
  }
288
303
 
289
- ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, char *split_subset, int ncat)
304
+ ClusterTree(size_t parent, size_t col_num, SplitType split_this_branch, signed char *split_subset, int ncat)
290
305
  {
291
306
  this->parent = parent;
292
307
  this->col_num = col_num;
@@ -336,6 +351,8 @@ typedef struct ClusterTree {
336
351
 
337
352
  } ClusterTree;
338
353
 
354
+ /* TODO: should separate the results from the actual model object */
355
+
339
356
  /* these are needed for prediction time, and are thus returned from the function that fits the model */
340
357
  typedef struct ModelOutputs {
341
358
  std::vector< std::vector<ClusterTree> > all_trees; /* clusters in which observations can be outliers, required for prediction time */
@@ -370,11 +387,6 @@ typedef struct ModelOutputs {
370
387
  archive(
371
388
  this->all_trees,
372
389
  this->all_clusters,
373
- this->outlier_scores_final,
374
- this->outlier_clusters_final,
375
- this->outlier_columns_final,
376
- this->outlier_trees_final,
377
- this->outlier_depth_final,
378
390
  this->start_ix_cat_counts,
379
391
  this->prop_categ,
380
392
  this->col_transf,
@@ -421,7 +433,69 @@ bool fit_outliers_models(ModelOutputs &model_outputs,
421
433
  size_t max_depth = 3, double max_perc_outliers = 0.01, size_t min_size_numeric = 25, size_t min_size_categ = 50,
422
434
  double min_gain = 1e-2, bool gain_as_pct = false, bool follow_all = false, double z_norm = 2.67, double z_outlier = 8.0);
423
435
 
424
- typedef struct {
436
+ class ExhaustedColumnTracker /* Tracks which columns are flagged as exhausted, grouped into nested levels so that everything flagged within a level can be un-flagged together when that level is popped. */
437
+ {
438
+ public:
439
+ std::vector<bool> is_exhausted; /* one flag per column; set in push_col(), cleared in pop_branch() */
440
+ std::vector<size_t> col_indices; /* stack of the column indices flagged so far, in push order */
441
+ std::vector<size_t> n_held; /* stack with one counter per open level: how many entries of 'col_indices' belong to that level */
442
+
443
+ void initialize(size_t ncols, size_t max_depth) /* Resets the tracker: all 'ncols' flags become false and both stacks are emptied. */
444
+ {
445
+ this->is_exhausted.assign(ncols, false);
446
+ this->n_held.clear();
447
+ this->n_held.reserve(max_depth+1); /* capacity for max_depth+1 simultaneously open levels */
448
+ this->col_indices.clear();
449
+ this->col_indices.reserve(ncols);
450
+ }
451
+
452
+ void push_branch() /* Opens a new level that initially holds zero columns. */
453
+ {
454
+ this->n_held.push_back(0);
455
+ }
456
+
457
+ void push_col(size_t col) /* Flags 'col' as exhausted and charges it to the innermost open level. Uses n_held.back(), so a level must have been opened with push_branch() first. */
458
+ {
459
+ this->is_exhausted[col] = true;
460
+ this->col_indices.push_back(col);
461
+ this->n_held.back() += 1;
462
+ }
463
+
464
+ void pop_branch() /* Closes the innermost level: un-flags every column that was pushed while it was open, then removes the level. Uses n_held.back(), so at least one level must be open. */
465
+ {
466
+ size_t col;
467
+ while (this->n_held.back() > 0) /* undo this level's pushes in reverse order */
468
+ {
469
+ col = this->col_indices.back();
470
+ this->is_exhausted[col] = false;
471
+ this->col_indices.pop_back();
472
+ this->n_held.back() -= 1;
473
+ }
474
+
475
+ this->n_held.pop_back();
476
+ }
477
+ };
478
+
479
+ class ExhaustedColumnsLevel /* Scope guard for one level of an ExhaustedColumnTracker: initialize() opens a level and the destructor closes it again. */
480
+ {
481
+ public:
482
+ bool pop = false; /* true once initialize() has opened a level that the destructor still has to close */
483
+ ExhaustedColumnTracker* tracker = nullptr; /* tracker passed to initialize(); it is never deleted here, only called */
484
+ ExhaustedColumnsLevel() = default; /* a default-constructed guard does nothing on destruction ('pop' is false) */
485
+ void initialize(ExhaustedColumnTracker* tracker) { /* Stores 'tracker', opens a new level on it and arms the destructor. 'tracker' is dereferenced immediately, so it must not be null. */
486
+ this->pop = true;
487
+ this->tracker = tracker;
488
+ this->tracker->push_branch();
489
+ }
490
+ ~ExhaustedColumnsLevel() { /* Closes the level opened by initialize(), if any. NOTE(review): no copy operations are declared, so the implicit ones copy 'pop' and 'tracker'; a copy of an armed guard would call pop_branch() a second time - confirm that instances are never copied. */
491
+ if (this->pop) {
492
+ this->tracker->pop_branch();
493
+ this->pop = false;
494
+ }
495
+ }
496
+ };
497
+
498
+ struct Workspace {
425
499
 
426
500
  std::vector<size_t> ix_arr; /* indices from the target column */
427
501
  size_t st; /* chunk of the indices to take for current function calls */
@@ -460,7 +534,7 @@ typedef struct {
460
534
  int *untransf_target_col; /* column as it was before forcibly binarizing (dynamic pointer) */
461
535
  int *temp_ptr_x; /* dynamic pointer */
462
536
 
463
- std::vector<char> buffer_subset_categ_best; /* categorical split that gave the best gain */
537
+ std::vector<signed char> buffer_subset_categ_best; /* categorical split that gave the best gain */
464
538
  long double this_gain; /* buffer where to store gain */
465
539
  double this_split_point; /* numeric split threshold */
466
540
  int this_split_lev; /* ordinal split threshold */
@@ -477,8 +551,8 @@ typedef struct {
477
551
  std::vector<size_t> buffer_crosstab; /* buffer arrays where to allocate values required by functions and not used outside them */
478
552
  std::vector<size_t> buffer_cat_cnt; /* buffer arrays where to allocate values required by functions and not used outside them */
479
553
  std::vector<size_t> buffer_cat_sorted; /* buffer arrays where to allocate values required by functions and not used outside them */
480
- std::vector<char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
481
- std::vector<char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
554
+ std::vector<signed char> buffer_subset_categ; /* buffer arrays where to allocate values required by functions and not used outside them */
555
+ std::vector<signed char> buffer_subset_outlier; /* buffer arrays where to allocate values required by functions and not used outside them */
482
556
  std::vector<long double> buffer_sd; /* used for a more numerically-stable two-pass gain calculation */
483
557
 
484
558
  bool drop_cluster; /* for categorical and ordinal variables, not all clusters can flag observations as outliers, so those are not kept */
@@ -486,10 +560,14 @@ typedef struct {
486
560
  bool target_col_is_ord; /* whether the target column is ordinal (rest is the same as in categoricals) */
487
561
  int ncat_this; /* number of categories in the target column */
488
562
 
489
- } Workspace;
563
+ ExhaustedColumnTracker exhausted_col_tracker;
564
+ bool has_zero_variance;
565
+ bool is_binary_split;
566
+ bool best_cat_split_is_binary;
567
+ };
490
568
 
491
569
  /* info holders to shorten function call arguments */
492
- typedef struct {
570
+ struct ModelParams {
493
571
  bool categ_as_bin;
494
572
  bool ord_as_bin;
495
573
  bool cat_bruteforce_subset;
@@ -506,16 +584,16 @@ typedef struct {
506
584
  double z_outlier;
507
585
  double z_tail;
508
586
  std::vector<long double> prop_small; /* this is not a parameter, but a shared array determined from the parameters and data */
509
- } ModelParams;
587
+ };
510
588
 
511
589
  /* Note: the vectors here are filled within the function that fits the model, while the pointers are passed from outside */
512
- typedef struct {
590
+ struct InputData {
513
591
  double *restrict numeric_data; size_t ncols_numeric;
514
592
  int *restrict categorical_data; size_t ncols_categ; int *restrict ncat;
515
593
  int *restrict ordinal_data; size_t ncols_ord; int *restrict ncat_ord;
516
594
  size_t nrows; size_t tot_cols; std::vector<char> has_NA; std::vector<char> skip_col; int max_categ;
517
595
  std::vector<size_t> cat_counts;
518
- } InputData;
596
+ };
519
597
 
520
598
 
521
599
  void process_numeric_col(std::vector<Cluster> &cluster_root,
@@ -547,12 +625,12 @@ void recursive_split_categ(Workspace &workspace,
547
625
  (This is the module from which
548
626
  new data can be flagged as outliers)
549
627
  ********************************************/
550
- typedef struct {
628
+ struct PredictionData { /* Bundles the input arrays and row count for the prediction module. NOTE(review): the memory layout of the arrays is not visible from this declaration - confirm against the callers. */
551
629
  double *restrict numeric_data; /* numeric columns */
552
630
  int *restrict categorical_data; /* categorical columns */
553
631
  int *restrict ordinal_data; /* ordinal columns */
554
632
  size_t nrows; /* number of rows */
555
- } PredictionData;
633
+ };
556
634
 
557
635
  bool find_new_outliers(double *restrict numeric_data,
558
636
  int *restrict categorical_data,
@@ -570,19 +648,21 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
570
648
  *********************************/
571
649
  #define SD_REG 1e-5 /* Regularization for standard deviation estimation */
572
650
 
573
- typedef struct {
651
+ /* TODO: should make long doubles optional */
652
+
653
+ struct NumericBranch { /* Accumulated statistics for one branch of a split; calc_sd() has overloads taking either these three values or a NumericBranch. */
574
654
  size_t cnt; /* count */
575
655
  long double sum; /* sum */
576
656
  long double sum_sq; /* sum of squares (presumably of the same values as 'sum' - confirm in the implementation) */
577
- } NumericBranch;
657
+ };
578
658
 
579
- typedef struct {
659
+ struct NumericSplit {
580
660
  NumericBranch NA_branch = {0, 0, 0};
581
661
  NumericBranch left_branch = {0, 0, 0};
582
662
  NumericBranch right_branch = {0, 0, 0};
583
- } NumericSplit;
663
+ };
584
664
 
585
- typedef struct {
665
+ struct CategSplit {
586
666
  size_t *restrict NA_branch; /* array of counts of the target variable's categories */
587
667
  size_t *restrict left_branch; /* array of counts of the target variable's categories */
588
668
  size_t *restrict right_branch; /* array of counts of the target variable's categories */
@@ -591,11 +671,11 @@ typedef struct {
591
671
  size_t size_NA = 0;
592
672
  size_t size_left = 0;
593
673
  size_t size_right = 0;
594
- } CategSplit;
674
+ };
595
675
 
596
- void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, bool onehot[]);
676
+ void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, signed char onehot[]);
597
677
  size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict cat_cnt, size_t ncat_x);
598
- void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
678
+ void flag_zero_counts(signed char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x);
599
679
  long double calc_sd(size_t cnt, long double sum, long double sum_sq);
600
680
  long double calc_sd(NumericBranch &branch);
601
681
  long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean);
@@ -610,23 +690,25 @@ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size
610
690
  size_t ncat, size_t *restrict buffer_cat_cnt, long double base_info);
611
691
  void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
612
692
  long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
613
- long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA);
693
+ long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
614
694
  void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
615
695
  bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
616
696
  long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
617
- bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point);
697
+ bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset, int *restrict split_point, bool *restrict has_zero_variance, bool *restrict binary_split);
618
698
  void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
619
699
  size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
620
700
  bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
621
- size_t *restrict split_left, size_t *restrict split_NA);
701
+ size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance);
622
702
  void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
623
703
  size_t ncat_y, size_t ncat_x, long double base_info,
624
704
  size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
625
- bool has_na, size_t min_size, long double *gain, int *split_point);
705
+ bool has_na, size_t min_size, long double *gain, int *split_point,
706
+ bool *restrict has_zero_variance, bool *restrict binary_split);
626
707
  void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
627
708
  size_t ncat_x, long double base_info,
628
709
  size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
629
- bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
710
+ bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
711
+ bool *restrict has_zero_variance, bool *restrict binary_split);
630
712
  void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
631
713
  size_t ncat_x, size_t ncat_y, long double base_info,
632
714
  size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab,
@@ -634,7 +716,8 @@ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end
634
716
  void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
635
717
  size_t ncat_x, size_t ncat_y, long double base_info,
636
718
  size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
637
- bool has_na, size_t min_size, long double *gain, char *restrict split_subset);
719
+ bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
720
+ bool *restrict has_zero_variance, bool *restrict binary_split);
638
721
 
639
722
 
640
723
 
@@ -642,8 +725,8 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
642
725
  Prototypes from clusters.cpp
643
726
  ************************************/
644
727
  #define calculate_max_outliers(n, perc) ( (n) * (perc) + (long double)2 * sqrtl( (n) * (perc) * ((long double)1 - perc) ) + (long double)1 )
645
- #define z_score(x, mu, sd) ( ((x) - (mu)) / (sd) )
646
- #define chebyshyov_bound(sd) (1.0 / square(sd))
728
+ #define z_score(x, mu, sd) ( ((x) - (mu)) / std::max((sd), 1e-12) )
729
+ #define chebyshyov_bound(zval) (1.0 / std::max(square(zval), 1.))
647
730
 
648
731
  bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_t st, size_t end,
649
732
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
@@ -654,7 +737,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
654
737
  void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
655
738
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
656
739
  size_t *restrict outlier_depth, Cluster &cluster,
657
- size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm);
740
+ size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm);
658
741
  bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg, bool by_maj,
659
742
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
660
743
  size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters,
@@ -662,7 +745,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
662
745
  double max_perc_outliers, double z_norm, double z_outlier,
663
746
  long double *restrict perc_threshold, long double *restrict prop_prior,
664
747
  size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
665
- size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
748
+ size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
666
749
  bool *restrict drop_cluster);
667
750
  void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]);
668
751
  void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]);
@@ -680,12 +763,12 @@ void calculate_cluster_poss_categs(ModelOutputs &model_outputs, size_t col, size
680
763
  #define calculate_max_cat_outliers(n, perc, z_norm) ((long double)1 + ((n) * (perc) / z_norm)) /* Note: this is not anyhow probabilistic, nor based on provable bounds */
681
764
  void find_outlier_categories(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
682
765
  long double perc_threshold[], size_t buffer_ix[], long double buffer_perc[],
683
- double z_norm, char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
766
+ double z_norm, signed char is_outlier[], bool *found_outliers, bool *new_is_outlier, double *next_most_comm);
684
767
  void find_outlier_categories_by_maj(size_t categ_counts[], size_t ncateg, size_t tot, double max_perc_outliers,
685
- long double prior_prob[], double z_outlier, char is_outlier[],
768
+ long double prior_prob[], double z_outlier, signed char is_outlier[],
686
769
  bool *found_outliers, bool *new_is_outlier, int *categ_maj);
687
770
  bool find_outlier_categories_no_cond(size_t categ_counts[], size_t ncateg, size_t tot,
688
- char is_outlier[], double *next_most_comm);
771
+ signed char is_outlier[], double *next_most_comm);
689
772
 
690
773
 
691
774
 
@@ -699,7 +782,7 @@ typedef struct {
699
782
  double gain_best_restore;
700
783
  double split_point_restore;
701
784
  int split_lev_restore;
702
- std::vector<char> split_subset_restore;
785
+ std::vector<signed char> split_subset_restore;
703
786
  size_t ix1_restore;
704
787
  size_t ix2_restore;
705
788
  size_t ix3_restore;
@@ -709,12 +792,13 @@ typedef struct {
709
792
  ColType col_type_best_rememer;
710
793
  double split_point_best_restore;
711
794
  int split_lev_best_restore;
712
- std::vector<char> split_subset_best_restore;
795
+ std::vector<signed char> split_subset_best_restore;
713
796
  long double base_info_restore;
714
797
  long double base_info_orig_restore;
715
798
  double sd_y_restore;
716
799
  bool has_outliers_restore;
717
800
  bool lev_has_outliers_restore;
801
+ bool is_binary_split_restore;
718
802
  } RecursionState;
719
803
 
720
804
 
@@ -739,14 +823,14 @@ size_t move_outliers_to_front(size_t ix_arr[], double outlier_scores[], size_t s
739
823
  size_t move_NAs_to_front(size_t ix_arr[], double x[], size_t st, size_t end, bool inf_as_NA);
740
824
  size_t move_NAs_to_front(size_t ix_arr[], int x[], size_t st, size_t end);
741
825
  void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, double split_point, bool has_NA, size_t *split_NA, size_t *st_right);
742
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
826
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right);
743
827
  void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, int split_lev, bool has_NA, size_t *split_NA, size_t *st_right);
744
828
  bool check_workspace_is_allocated(Workspace &workspace);
745
829
  void allocate_thread_workspace(Workspace &workspace, size_t nrows, int max_categ);
746
830
  void backup_recursion_state(Workspace &workspace, RecursionState &state_backup);
747
831
  void restore_recursion_state(Workspace &workspace, RecursionState &state_backup);
748
832
  void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col);
749
- void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col);
833
+ void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col);
750
834
  void set_tree_as_categorical(ClusterTree &tree, size_t col);
751
835
  void set_tree_as_categorical(ClusterTree &tree, size_t col, int ncat);
752
836
  void set_tree_as_ordinal(ClusterTree &tree, int split_lev, size_t col);
@@ -756,3 +840,27 @@ void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nth
756
840
  void calc_min_decimals_to_print(ModelOutputs &model_outputs, double *restrict numeric_data, int nthreads);
757
841
  int decimals_diff(double val1, double val2);
758
842
  void dealloc_ModelOutputs(ModelOutputs &model_outputs);
843
+ ModelOutputs get_empty_ModelOutputs();
844
+ bool get_has_openmp();
845
+
846
+ extern bool interrupt_switch;
847
+ extern bool handle_is_locked;
848
+ void set_interrup_global_variable(int s);
849
+ class SignalSwitcher /* Declaration only; the member functions are defined elsewhere. NOTE(review): presumably installs a temporary signal handler on construction and puts the previous one back on destruction or restore_handle() - confirm in the implementation. */
850
+ {
851
+ public:
852
+ sig_t_ old_sig; /* a signal-handler function pointer (void(*)(int)); presumably the handler that was active before - TODO confirm */
853
+ bool is_active; /* NOTE(review): meaning not visible from this declaration */
854
+ SignalSwitcher();
855
+ ~SignalSwitcher();
856
+ void restore_handle(); /* presumably re-installs 'old_sig' - TODO confirm */
857
+ };
858
+ void check_interrupt_switch(SignalSwitcher &ss);
859
+ #ifdef _FOR_PYTHON
860
+ bool cy_check_interrupt_switch();
861
+ void cy_tick_off_interrupt_switch();
862
+ #endif
863
+ size_t log2ceil(size_t v);
864
+ #ifdef _FOR_PYTHON
865
+ ModelOutputs deepcopy(const ModelOutputs &inp);
866
+ #endif
@@ -0,0 +1,3 @@
1
+ LIBRARY outliertree.dll
2
+ EXPORTS
3
+ R_init_outliertree
@@ -226,6 +226,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
226
226
  true : found_outliers;
227
227
  break;
228
228
  }
229
+
230
+ default:
231
+ {
232
+ assert(0);
233
+ }
229
234
  }
230
235
  break;
231
236
  }
@@ -279,6 +284,11 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
279
284
  true : found_outliers;
280
285
  break;
281
286
  }
287
+
288
+ default:
289
+ {
290
+ assert(0);
291
+ }
282
292
  }
283
293
  break;
284
294
  }
@@ -332,10 +342,16 @@ bool follow_tree(ModelOutputs &model_outputs, PredictionData &prediction_data, s
332
342
  true : found_outliers;
333
343
  break;
334
344
  }
345
+
346
+ default:
347
+ {
348
+ assert(0);
349
+ }
335
350
  }
336
351
  break;
337
352
  }
338
353
 
354
+ default: {}
339
355
  }
340
356
  }
341
357
  }
@@ -548,6 +564,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
548
564
  if (!isnan(num_val_other) && num_val_other > model_outputs.all_clusters[col][cl].split_point) flag_this_cluster = true;
549
565
  break;
550
566
  }
567
+
568
+ default:
569
+ {
570
+ assert(0);
571
+ }
551
572
  }
552
573
  break;
553
574
  }
@@ -587,6 +608,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
587
608
  break;
588
609
  }
589
610
 
611
+ default:
612
+ {
613
+ assert(0);
614
+ }
615
+
590
616
  /* Note: type 'SingleCateg' is only used temporarily, later gets converted to 'Equal' */
591
617
  }
592
618
  break;
@@ -626,6 +652,11 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
626
652
  if (cat_val_other >=0 && cat_val_other != model_outputs.all_clusters[col][cl].split_lev) flag_this_cluster = true;
627
653
  break;
628
654
  }
655
+
656
+ default:
657
+ {
658
+ assert(0);
659
+ }
629
660
  }
630
661
  break;
631
662
  }
@@ -645,6 +676,8 @@ bool check_is_outlier_in_tree(std::vector<size_t> &clusters_in_tree, size_t curr
645
676
  model_outputs.all_clusters[col][cl].cluster_sd
646
677
  )
647
678
  );
679
+ if (is_na_or_inf(outlier_score))
680
+ outlier_score = 1. - 1e-15;
648
681
  } else {
649
682
  outlier_score = model_outputs.all_clusters[col][cl].score_categ[cat_val_this];
650
683
  }