outliertree 0.2.1 → 0.3.1

@@ -39,7 +39,7 @@
  * at which position will the counts for a given column start. Note that NAs are stored as the last index in each
  * column, so each one needs one extra category
  */
-int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, bool skip_col[], int max_categ)
+int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ)
 {
     for (size_t col = 0; col < ncols; col++) {
         max_categ = std::max(ncat[col], max_categ);
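A note on the `bool[]` → `char[]` signature changes here and in the next few hunks: the diff itself doesn't state the motivation, but a likely one is that these flag buffers are backed by `std::vector`, and `std::vector<bool>` is a packed specialization that provides no `data()` member returning `bool*` and does not allow safe concurrent writes to different elements from OpenMP threads. A byte-per-flag `char` buffer has neither problem. A minimal sketch of the distinction, with all names illustrative rather than taken from the library:

    #include <vector>
    #include <cstddef>

    /* Each thread writes its own byte, which the standard guarantees is safe;
       a packed std::vector<bool> has no per-element byte to point at. */
    void flag_even_columns(char skip_col[], std::size_t ncols)
    {
        #pragma omp parallel for
        for (std::size_t col = 0; col < ncols; col++)
            skip_col[col] = (col % 2 == 0);
    }

    int main()
    {
        std::vector<char> skip_col(16, 0);
        flag_even_columns(skip_col.data(), skip_col.size());
        /* std::vector<bool> could not be passed this way: it has no data()
           member, since there is no bool* to hand out. */
        return 0;
    }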
@@ -53,7 +53,7 @@ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t
 /* Save the counts of each category for each column in the array determined above */
 void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
                               int categorical_data[], size_t ncols, size_t nrows,
-                              bool has_NA[], bool skip_col[], int nthreads)
+                              char has_NA[], char skip_col[], int nthreads)
 {
     size_t col_st_offset;
     size_t col_stop;
@@ -80,7 +80,7 @@ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[],
 
 /* Check if some column has a large majority that would make any split fail to meet minimum sizes */
 void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
-                                size_t ncols, size_t min_conditioned_size, size_t nrows, bool skip_col[], int nthreads)
+                                size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads)
 {
     size_t largest_cnt;
     #pragma omp parallel for num_threads(nthreads) private(largest_cnt) shared(ncols, nrows, ncat, cat_counts, start_ix_cat_counts, min_conditioned_size, skip_col)
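The comment in this hunk carries the whole argument: if the most frequent category holds more than `nrows - min_conditioned_size` rows, every possible split leaves fewer than `min_conditioned_size` rows outside it, so the column can never meet the minimum size and is skipped. An illustrative reimplementation of that check (not the library's exact code):

    #include <cstddef>

    /* True when one category is so dominant that no split could put at least
       min_size rows on the minority side. */
    bool col_is_unsplittable(const std::size_t cat_counts[], int ncat,
                             std::size_t nrows, std::size_t min_size)
    {
        std::size_t largest_cnt = 0;
        for (int cat = 0; cat < ncat; cat++)
            if (cat_counts[cat] > largest_cnt)
                largest_cnt = cat_counts[cat];
        return largest_cnt > nrows - min_size;
    }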
@@ -127,8 +127,8 @@ void calculate_lowerlim_proportion(long double *restrict prop_small, long double
 
 /* Check if a numerical column has no variance (i.e. will not be splittable).
    Along the way, also record the number of decimals to display for this column. */
-void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, bool has_NA[],
-                               bool skip_col[], int min_decimals[], int nthreads)
+void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
+                               char skip_col[], int min_decimals[], int nthreads)
 {
     long double running_mean;
     long double mean_prev;
@@ -145,8 +145,8 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
                     private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
     for (size_t_for col = 0; col < ncols; col++) {
         running_mean = 0;
-        mean_prev = 0;
         running_ssq = 0;
+        mean_prev = numeric_data[col * nrows];
         min_val = HUGE_VAL;
        max_val = -HUGE_VAL;
         cnt = 0;
@@ -178,11 +178,12 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
 void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
 {
     long double running_mean = 0;
-    long double mean_prev = 0;
     long double running_ssq = 0;
+    long double mean_prev = 0;
     double xval;
     size_t st_offset = st + size_quarter;
     if (ix_arr != NULL) {
+        mean_prev = x[ix_arr[st]];
         for (size_t row = st_offset; row <= (end - size_quarter); row++) {
             xval = x[ix_arr[row]];
             running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
@@ -190,6 +191,7 @@ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[]
             mean_prev = running_mean;
         }
     } else {
+        mean_prev = x[st_offset];
         for (size_t row = st_offset; row <= (end - size_quarter); row++) {
             xval = x[row];
             running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
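The `mean_prev` changes in the last three hunks all touch the same accumulation pattern: a Welford-style running mean and sum of squares, where `mean_prev` holds the mean from the previous step and is now seeded with the first value in range instead of zero. A self-contained sketch of the recurrence, under the assumption that the unshown surrounding lines follow the standard Welford update:

    #include <cstddef>
    #include <cmath>

    /* Welford's online algorithm:
         mean_k = mean_{k-1} + (x_k - mean_{k-1}) / k
         ssq_k  = ssq_{k-1} + (x_k - mean_k) * (x_k - mean_{k-1})
       Seeding mean_prev with the first value mirrors the patched code.
       Assumes n >= 2. */
    void online_mean_sd(const double x[], std::size_t n, double *mean, double *sd)
    {
        long double running_mean = 0;
        long double running_ssq  = 0;
        long double mean_prev    = x[0];
        for (std::size_t i = 0; i < n; i++) {
            running_mean += (x[i] - running_mean) / (long double)(i + 1);
            running_ssq  += (x[i] - running_mean) * (x[i] - mean_prev);
            mean_prev     = running_mean;
        }
        *mean = (double) running_mean;
        *sd   = std::sqrt((double)(running_ssq / (long double)(n - 1)));
    }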
@@ -405,7 +407,7 @@ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, dou
 }
 
 /* for categorical */
-void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
+void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
 {
     size_t temp;
 
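The `char` → `signed char` change for `subset_categ` (and for `split_subset` further down) matters because plain `char` has implementation-defined signedness: signed on the common x86 ABIs, unsigned on most ARM and PowerPC ABIs. If the subset encoding stores a negative sentinel such as -1 alongside 0/1 flags, a common convention for category subsets (an assumption here, not something the diff states), plain `char` would silently turn that sentinel into 255 on unsigned-char platforms:

    #include <cstdio>

    int main()
    {
        char plain = -1;         /* reads back as 255 where char is unsigned */
        signed char fixed = -1;  /* reads back as -1 on every platform */
        std::printf("%d %d\n", (int) plain, (int) fixed);
        return 0;
    }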
@@ -508,6 +510,7 @@ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup)
     state_backup.has_outliers_restore = workspace.has_outliers;
     state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
     state_backup.temp_ptr_x = workspace.temp_ptr_x;
+    state_backup.is_binary_split_restore = workspace.is_binary_split;
 }
 
 void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
@@ -532,6 +535,7 @@ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
     workspace.has_outliers = state_backup.has_outliers_restore;
     workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
     workspace.temp_ptr_x = state_backup.temp_ptr_x;
+    workspace.is_binary_split = state_backup.is_binary_split_restore;
 }
 
 /* Next split on the trees is only decided after they are already initialized */
@@ -542,7 +546,7 @@ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col)
     tree.col_num = col;
 }
 
-void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
+void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col)
 {
     tree.column_type = Categorical;
     tree.col_num = col;
@@ -587,6 +591,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
     model_outputs.outlier_trees_final.clear();
     model_outputs.outlier_depth_final.clear();
     model_outputs.outlier_decimals_distr.clear();
+    model_outputs.min_decimals_col.clear();
 
     model_outputs.outlier_scores_final.shrink_to_fit();
     model_outputs.outlier_clusters_final.shrink_to_fit();
@@ -594,6 +599,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
     model_outputs.outlier_trees_final.shrink_to_fit();
     model_outputs.outlier_depth_final.shrink_to_fit();
     model_outputs.outlier_decimals_distr.shrink_to_fit();
+    model_outputs.min_decimals_col.shrink_to_fit();
 }
 
 void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
@@ -605,6 +611,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
     model_outputs.outlier_trees_final.resize(nrows);
     model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
     model_outputs.outlier_decimals_distr.resize(nrows, 0);
+    model_outputs.min_decimals_col.resize(nrows);
 
     model_outputs.outlier_scores_final.shrink_to_fit();
     model_outputs.outlier_clusters_final.shrink_to_fit();
@@ -612,6 +619,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
     model_outputs.outlier_trees_final.shrink_to_fit();
     model_outputs.outlier_depth_final.shrink_to_fit();
     model_outputs.outlier_decimals_distr.shrink_to_fit();
+    model_outputs.min_decimals_col.shrink_to_fit();
 }
 
 void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
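On the repeated pairing of `clear()`/`resize()` with `shrink_to_fit()` in the two functions above: `clear()` destroys a vector's elements but keeps its capacity allocated, and `shrink_to_fit()` is the request, non-binding in the standard but honored by the major implementations, that actually returns the memory. A small demonstration:

    #include <vector>
    #include <cstdio>

    int main()
    {
        std::vector<double> v(1000000);
        v.clear();          /* size becomes 0; capacity stays ~1000000 */
        std::printf("size=%zu capacity=%zu\n", v.size(), v.capacity());
        v.shrink_to_fit();  /* capacity released on mainstream implementations */
        std::printf("size=%zu capacity=%zu\n", v.size(), v.capacity());
        return 0;
    }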
@@ -672,14 +680,155 @@ int decimals_diff(double val1, double val2)
     return (int) res;
 }
 
+ModelOutputs get_empty_ModelOutputs()
+{
+    return ModelOutputs();
+}
 
-/* Reason behind this function: Cython (as of v0.29) will not auto-deallocate
-   structs which are part of a cdef'd class, which produces a memory leak
-   but can be force-destructed. Unfortunately, Cython itself doesn't even
-   allow calling destructors for structs, so it has to be done externally.
-   This function should otherwise have no reason to exist.
-*/
-void dealloc_ModelOutputs(ModelOutputs &model_outputs)
+bool get_has_openmp()
+{
+    #ifdef _OPENMP
+    return true;
+    #else
+    return false;
+    #endif
+}
+
+bool interrupt_switch = false;
+bool handle_is_locked = false;
+
+/* Function to handle interrupt signals */
+void set_interrup_global_variable(int s)
+{
+    #pragma omp critical
+    {
+        interrupt_switch = true;
+    }
+}
+
+void check_interrupt_switch(SignalSwitcher &ss)
+{
+    if (interrupt_switch)
+    {
+        ss.restore_handle();
+        #ifndef _FOR_R
+        fprintf(stderr, "Error: procedure was interrupted\n");
+        #else
+        REprintf("Error: procedure was interrupted\n");
+        #endif
+        raise(SIGINT);
+        #ifdef _FOR_R
+        Rcpp::checkUserInterrupt();
+        #elif !defined(DONT_THROW_ON_INTERRUPT)
+        throw "Error: procedure was interrupted.\n";
+        #endif
+    }
+}
+
+#ifdef _FOR_PYTHON
+bool cy_check_interrupt_switch()
+{
+    return interrupt_switch;
+}
+void cy_tick_off_interrupt_switch()
+{
+    interrupt_switch = false;
+}
+#endif
+
+SignalSwitcher::SignalSwitcher()
+{
+    #pragma omp critical
+    {
+        if (!handle_is_locked)
+        {
+            handle_is_locked = true;
+            interrupt_switch = false;
+            this->old_sig = signal(SIGINT, set_interrup_global_variable);
+            this->is_active = true;
+        }
+
+        else {
+            this->is_active = false;
+        }
+    }
+}
+
+SignalSwitcher::~SignalSwitcher()
+{
+    #ifndef _FOR_PYTHON
+    #pragma omp critical
+    {
+        if (this->is_active && handle_is_locked)
+            interrupt_switch = false;
+    }
+    #endif
+    this->restore_handle();
+}
+
+void SignalSwitcher::restore_handle()
+{
+    #pragma omp critical
+    {
+        if (this->is_active && handle_is_locked)
+        {
+            signal(SIGINT, this->old_sig);
+            this->is_active = false;
+            handle_is_locked = false;
+        }
+    }
+}
+
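The `SignalSwitcher` machinery added above is a RAII guard around `signal()`: the constructor swaps in a handler that only sets a flag, worker code polls the flag at safe points via `check_interrupt_switch()`, and the destructor puts the previous handler back. A stripped-down, self-contained sketch of the same pattern, without the OpenMP critical sections and the lock that keeps nested guards from fighting over the handler:

    #include <csignal>
    #include <cstdio>

    static volatile std::sig_atomic_t got_sigint = 0;
    static void on_sigint(int) { got_sigint = 1; }

    /* Installs the flag-setting handler on construction and restores the
       previous handler on destruction, whatever the exit path. */
    struct ScopedSigintGuard {
        void (*old_sig)(int);
        ScopedSigintGuard()  { got_sigint = 0; old_sig = std::signal(SIGINT, on_sigint); }
        ~ScopedSigintGuard() { std::signal(SIGINT, old_sig); }
    };

    int main()
    {
        ScopedSigintGuard guard;
        for (long i = 0; i < 2000000000L; i++) {
            if (got_sigint) {  /* poll at a safe point, as check_interrupt_switch does */
                std::fprintf(stderr, "interrupted\n");
                return 1;
            }
        }
        return 0;
    }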
+/* ceil(log2(x)) done with bit-wise operations ensures perfect precision (and it's faster too)
+   https://stackoverflow.com/questions/2589096/find-most-significant-bit-left-most-that-is-set-in-a-bit-array
+   https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers */
+#if SIZE_MAX == UINT32_MAX /* 32-bit systems */
+constexpr static const int MultiplyDeBruijnBitPosition[32] =
+{
+    0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+    8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+};
+size_t log2ceil( size_t v )
+{
+    v--;
+    v |= v >> 1; // first round down to one less than a power of 2
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+
+    return MultiplyDeBruijnBitPosition[( uint32_t )( v * 0x07C4ACDDU ) >> 27] + 1;
+}
+#elif SIZE_MAX == UINT64_MAX /* 64-bit systems */
+constexpr static const uint64_t tab64[64] = {
+    63,  0, 58,  1, 59, 47, 53,  2,
+    60, 39, 48, 27, 54, 33, 42,  3,
+    61, 51, 37, 40, 49, 18, 28, 20,
+    55, 30, 34, 11, 43, 14, 22,  4,
+    62, 57, 46, 52, 38, 26, 32, 41,
+    50, 36, 17, 19, 29, 10, 13, 21,
+    56, 45, 25, 31, 35, 16,  9, 12,
+    44, 24, 15,  8, 23,  7,  6,  5};
+
+size_t log2ceil(size_t value)
+{
+    value--;
+    value |= value >> 1;
+    value |= value >> 2;
+    value |= value >> 4;
+    value |= value >> 8;
+    value |= value >> 16;
+    value |= value >> 32;
+    return tab64[((uint64_t)((value - (value >> 1))*0x07EDD5E59A4E28C2)) >> 58] + 1;
+}
+#else /* other architectures - might not be entirely precise, and will be slower */
+size_t log2ceil(size_t x) {return (size_t)(ceill(log2l((long double) x)));}
+#endif
+
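`log2ceil` computes ceil(log2(x)) without floating point: after the initial decrement, the shift-or cascade smears the highest set bit into all lower positions; `value - (value >> 1)` then isolates that highest bit, and multiplying by a De Bruijn constant and keeping the top bits is a perfect hash that maps each bit position to its own table slot. The final `+ 1` turns floor(log2(v - 1)) into ceil(log2(v)) for v >= 2. A quick check of the values one would expect (the forward declaration resolves against the definition above):

    #include <cstdio>
    #include <cstddef>

    size_t log2ceil(size_t v);  /* defined above */

    int main()
    {
        /* exact powers of two map to their exponent... */
        std::printf("%zu %zu %zu\n", log2ceil(2), log2ceil(4), log2ceil(1024)); /* 1 2 10 */
        /* ...anything in between rounds up */
        std::printf("%zu %zu\n", log2ceil(5), log2ceil(1000));                  /* 3 10 */
        return 0;
    }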
+#ifdef _FOR_PYTHON
+ModelOutputs deepcopy(const ModelOutputs &inp)
 {
-    model_outputs.~ModelOutputs();
+    ModelOutputs out = inp;
+    return out;
 }
+#endif