outliertree 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -145,8 +145,8 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
             private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
     for (size_t_for col = 0; col < ncols; col++) {
         running_mean = 0;
-        mean_prev = 0;
         running_ssq = 0;
+        mean_prev = numeric_data[col * nrows];
         min_val = HUGE_VAL;
         max_val = -HUGE_VAL;
         cnt = 0;
@@ -178,11 +178,12 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
 void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
 {
     long double running_mean = 0;
-    long double mean_prev = 0;
     long double running_ssq = 0;
+    long double mean_prev = 0;
     double xval;
     size_t st_offset = st + size_quarter;
     if (ix_arr != NULL) {
+        mean_prev = x[ix_arr[st]];
         for (size_t row = st_offset; row <= (end - size_quarter); row++) {
             xval = x[ix_arr[row]];
             running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
@@ -190,6 +191,7 @@ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[]
             mean_prev = running_mean;
         }
     } else {
+        mean_prev = x[st_offset];
         for (size_t row = st_offset; row <= (end - size_quarter); row++) {
             xval = x[row];
             running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
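For context, calc_central_mean_and_sd accumulates a one-pass (Welford-style) running mean and sum of squared deviations; the hunks above only change how mean_prev is seeded before the loop starts. A minimal standalone sketch of that style of update, with simplified names and a hypothetical signature (the running_ssq update line is inferred from the surrounding context, not shown verbatim in this diff):

    #include <cstddef>
    #include <cmath>

    /* Hypothetical sketch of a Welford-style one-pass mean/SD, assuming n >= 1 */
    void running_mean_sd(const double x[], size_t n, double *mean_out, double *sd_out)
    {
        long double running_mean = 0;
        long double running_ssq  = 0;
        long double mean_prev    = x[0];   /* seeded from the data, as in 0.3.0 */
        for (size_t row = 0; row < n; row++) {
            double xval   = x[row];
            running_mean += (xval - running_mean) / (long double)(row + 1);
            running_ssq  += (xval - running_mean) * (xval - mean_prev);
            mean_prev     = running_mean;
        }
        *mean_out = (double)running_mean;
        *sd_out   = (n > 1) ? std::sqrt((double)(running_ssq / (long double)(n - 1))) : 0.;
    }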
@@ -405,7 +407,7 @@ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, dou
 }
 
 /* for categorical */
-void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
+void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
 {
     size_t temp;
 
@@ -508,6 +510,7 @@ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup)
     state_backup.has_outliers_restore = workspace.has_outliers;
     state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
     state_backup.temp_ptr_x = workspace.temp_ptr_x;
+    state_backup.is_binary_split_restore = workspace.is_binary_split;
 }
 
 void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
@@ -532,6 +535,7 @@ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
     workspace.has_outliers = state_backup.has_outliers_restore;
     workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
     workspace.temp_ptr_x = state_backup.temp_ptr_x;
+    workspace.is_binary_split = state_backup.is_binary_split_restore;
 }
 
 /* Next split on the trees is only decided after they are already initialized */
@@ -542,7 +546,7 @@ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col)
     tree.col_num = col;
 }
 
-void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
+void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col)
 {
     tree.column_type = Categorical;
     tree.col_num = col;
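The char → signed char change in this hunk and in divide_subset_split above matters because the signedness of plain char is implementation-defined in C++; on ABIs where plain char is unsigned (common on ARM and PowerPC), negative sentinel values stored in a char array do not compare as negative. A minimal illustration of the pitfall (hypothetical standalone snippet, not code from the package):

    #include <cstdio>

    int main()
    {
        char        c  = -1;   /* becomes 255 where plain char is unsigned */
        signed char sc = -1;   /* always holds -1 */
        std::printf("c == -1: %d, sc == -1: %d\n", c == -1, sc == -1);
        return 0;
    }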
@@ -587,6 +591,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
     model_outputs.outlier_trees_final.clear();
     model_outputs.outlier_depth_final.clear();
     model_outputs.outlier_decimals_distr.clear();
+    model_outputs.min_decimals_col.clear();
 
     model_outputs.outlier_scores_final.shrink_to_fit();
     model_outputs.outlier_clusters_final.shrink_to_fit();
@@ -594,6 +599,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
     model_outputs.outlier_trees_final.shrink_to_fit();
     model_outputs.outlier_depth_final.shrink_to_fit();
     model_outputs.outlier_decimals_distr.shrink_to_fit();
+    model_outputs.min_decimals_col.shrink_to_fit();
 }
 
 void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
@@ -605,6 +611,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
     model_outputs.outlier_trees_final.resize(nrows);
     model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
     model_outputs.outlier_decimals_distr.resize(nrows, 0);
+    model_outputs.min_decimals_col.resize(nrows);
 
     model_outputs.outlier_scores_final.shrink_to_fit();
     model_outputs.outlier_clusters_final.shrink_to_fit();
@@ -612,6 +619,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
     model_outputs.outlier_trees_final.shrink_to_fit();
     model_outputs.outlier_depth_final.shrink_to_fit();
     model_outputs.outlier_decimals_distr.shrink_to_fit();
+    model_outputs.min_decimals_col.shrink_to_fit();
 }
 
 void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
@@ -683,3 +691,156 @@ void dealloc_ModelOutputs(ModelOutputs &model_outputs)
 {
     model_outputs.~ModelOutputs();
 }
+
+ModelOutputs get_empty_ModelOutputs()
+{
+    return ModelOutputs();
+}
+
+bool get_has_openmp()
+{
+    #ifdef _OPENMP
+    return true;
+    #else
+    return false;
+    #endif
+}
+
+bool interrupt_switch = false;
+bool handle_is_locked = false;
+
+/* Function to handle interrupt signals */
+void set_interrup_global_variable(int s)
+{
+    #pragma omp critical
+    {
+        interrupt_switch = true;
+    }
+}
+
+void check_interrupt_switch(SignalSwitcher &ss)
+{
+    if (interrupt_switch)
+    {
+        ss.restore_handle();
+        #ifndef _FOR_R
+        fprintf(stderr, "Error: procedure was interrupted\n");
+        #else
+        REprintf("Error: procedure was interrupted\n");
+        #endif
+        raise(SIGINT);
+        #ifdef _FOR_R
+        Rcpp::checkUserInterrupt();
+        #elif !defined(DONT_THROW_ON_INTERRUPT)
+        throw "Error: procedure was interrupted.\n";
+        #endif
+    }
+}
+
+#ifdef _FOR_PYTHON
+bool cy_check_interrupt_switch()
+{
+    return interrupt_switch;
+}
+void cy_tick_off_interrupt_switch()
+{
+    interrupt_switch = false;
+}
+#endif
+
+SignalSwitcher::SignalSwitcher()
+{
+    #pragma omp critical
+    {
+        if (!handle_is_locked)
+        {
+            handle_is_locked = true;
+            interrupt_switch = false;
+            this->old_sig = signal(SIGINT, set_interrup_global_variable);
+            this->is_active = true;
+        }
+
+        else {
+            this->is_active = false;
+        }
+    }
+}
+
+SignalSwitcher::~SignalSwitcher()
+{
+    #ifndef _FOR_PYTHON
+    #pragma omp critical
+    {
+        if (this->is_active && handle_is_locked)
+            interrupt_switch = false;
+    }
+    #endif
+    this->restore_handle();
+}
+
+void SignalSwitcher::restore_handle()
+{
+    #pragma omp critical
+    {
+        if (this->is_active && handle_is_locked)
+        {
+            signal(SIGINT, this->old_sig);
+            this->is_active = false;
+            handle_is_locked = false;
+        }
+    }
+}
+
+/* ceil(log2(x)) done with bit-wise operations ensures perfect precision (and it's faster too)
+   https://stackoverflow.com/questions/2589096/find-most-significant-bit-left-most-that-is-set-in-a-bit-array
+   https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers */
+#if SIZE_MAX == UINT32_MAX /* 32-bit systems */
+constexpr static const int MultiplyDeBruijnBitPosition[32] =
+{
+    0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+    8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+};
+size_t log2ceil( size_t v )
+{
+    v--;
+    v |= v >> 1; // first round down to one less than a power of 2
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+
+    return MultiplyDeBruijnBitPosition[( uint32_t )( v * 0x07C4ACDDU ) >> 27] + 1;
+}
+#elif SIZE_MAX == UINT64_MAX /* 64-bit systems */
+constexpr static const uint64_t tab64[64] = {
+    63,  0, 58,  1, 59, 47, 53,  2,
+    60, 39, 48, 27, 54, 33, 42,  3,
+    61, 51, 37, 40, 49, 18, 28, 20,
+    55, 30, 34, 11, 43, 14, 22,  4,
+    62, 57, 46, 52, 38, 26, 32, 41,
+    50, 36, 17, 19, 29, 10, 13, 21,
+    56, 45, 25, 31, 35, 16,  9, 12,
+    44, 24, 15,  8, 23,  7,  6,  5};
+
+size_t log2ceil(size_t value)
+{
+    value--;
+    value |= value >> 1;
+    value |= value >> 2;
+    value |= value >> 4;
+    value |= value >> 8;
+    value |= value >> 16;
+    value |= value >> 32;
+    return tab64[((uint64_t)((value - (value >> 1))*0x07EDD5E59A4E28C2)) >> 58] + 1;
+}
+#else /* other architectures - might not be entirely precise, and will be slower */
+size_t log2ceil(size_t x) {return (size_t)(ceill(log2l((long double) x)));}
+#endif
+
+#ifdef _FOR_PYTHON
+ModelOutputs deepcopy(const ModelOutputs &inp)
+{
+    ModelOutputs out = inp;
+    return out;
+}
+#endif
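The new log2ceil helper computes ceil(log2(x)) exactly for x >= 2 using only shifts and a De Bruijn-style lookup table. A small standalone check of the expected results, assuming it is compiled together with (or linked against) the definitions added above; the declaration is repeated here purely for illustration:

    #include <cassert>
    #include <cstddef>

    size_t log2ceil(size_t value);   /* defined in the hunk above */

    int main()
    {
        /* exact powers of two map to their exponent; anything in between rounds up */
        assert(log2ceil(2)    == 1);
        assert(log2ceil(8)    == 3);
        assert(log2ceil(9)    == 4);
        assert(log2ceil(1000) == 10);
        return 0;
    }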