outliertree 0.2.1 → 0.3.0

@@ -145,8 +145,8 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
  private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
  for (size_t_for col = 0; col < ncols; col++) {
  running_mean = 0;
- mean_prev = 0;
  running_ssq = 0;
+ mean_prev = numeric_data[col * nrows];
  min_val = HUGE_VAL;
  max_val = -HUGE_VAL;
  cnt = 0;
@@ -178,11 +178,12 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
  void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
  {
  long double running_mean = 0;
- long double mean_prev = 0;
  long double running_ssq = 0;
+ long double mean_prev = 0;
  double xval;
  size_t st_offset = st + size_quarter;
  if (ix_arr != NULL) {
+ mean_prev = x[ix_arr[st]];
  for (size_t row = st_offset; row <= (end - size_quarter); row++) {
  xval = x[ix_arr[row]];
  running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
@@ -190,6 +191,7 @@ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[]
  mean_prev = running_mean;
  }
  } else {
+ mean_prev = x[st_offset];
  for (size_t row = st_offset; row <= (end - size_quarter); row++) {
  xval = x[row];
  running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
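
The hunks above all make the same kind of change: before the running-mean/variance loop, mean_prev is now seeded with the first observation instead of being left at zero. A minimal standalone sketch of this Welford-style update pattern, simplified to a dense array with no missing values (the function name and signature are illustrative, not the package's exact code):

    #include <cmath>
    #include <cstddef>

    /* One-pass (Welford) mean and standard deviation.
       mean_prev starts at the first value, mirroring the change above. */
    void running_mean_sd(const double x[], size_t n, double *mean_out, double *sd_out)
    {
        long double running_mean = 0;
        long double running_ssq  = 0;
        long double mean_prev    = x[0];
        for (size_t row = 0; row < n; row++) {
            running_mean += (x[row] - running_mean) / (long double)(row + 1);
            running_ssq  += (x[row] - running_mean) * (x[row] - mean_prev);
            mean_prev     = running_mean;
        }
        *mean_out = (double) running_mean;
        *sd_out   = (n > 1) ? (double) std::sqrt(running_ssq / (long double)(n - 1)) : 0.;
    }
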
@@ -405,7 +407,7 @@ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, dou
  }

  /* for categorical */
- void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
+ void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
  {
  size_t temp;

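
The char → signed char change in this signature (and in set_tree_as_categorical further down) matters because the signedness of plain char is implementation-defined: it is typically signed on x86 compilers but unsigned by default on, for example, ARM and PowerPC targets, so negative values stored in a plain char subset array would not compare the same way everywhere. A small self-contained illustration of the difference (not code from the package):

    #include <cstdio>

    int main(void)
    {
        char        c  = -1;   /* implementation-defined: may actually hold 255 */
        signed char sc = -1;   /* guaranteed negative on every conforming platform */
        printf("%d %d\n", (int) c, (int) sc);   /* prints "-1 -1" or "255 -1" depending on the target */
        return 0;
    }
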
@@ -508,6 +510,7 @@ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup)
  state_backup.has_outliers_restore = workspace.has_outliers;
  state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
  state_backup.temp_ptr_x = workspace.temp_ptr_x;
+ state_backup.is_binary_split_restore = workspace.is_binary_split;
  }

  void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
@@ -532,6 +535,7 @@ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
  workspace.has_outliers = state_backup.has_outliers_restore;
  workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
  workspace.temp_ptr_x = state_backup.temp_ptr_x;
+ workspace.is_binary_split = state_backup.is_binary_split_restore;
  }

  /* Next split on the trees is only decided after they are already initialized */
@@ -542,7 +546,7 @@ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col)
  tree.col_num = col;
  }

- void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
+ void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col)
  {
  tree.column_type = Categorical;
  tree.col_num = col;
@@ -587,6 +591,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
  model_outputs.outlier_trees_final.clear();
  model_outputs.outlier_depth_final.clear();
  model_outputs.outlier_decimals_distr.clear();
+ model_outputs.min_decimals_col.clear();

  model_outputs.outlier_scores_final.shrink_to_fit();
  model_outputs.outlier_clusters_final.shrink_to_fit();
@@ -594,6 +599,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
  model_outputs.outlier_trees_final.shrink_to_fit();
  model_outputs.outlier_depth_final.shrink_to_fit();
  model_outputs.outlier_decimals_distr.shrink_to_fit();
+ model_outputs.min_decimals_col.shrink_to_fit();
  }

  void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
@@ -605,6 +611,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
  model_outputs.outlier_trees_final.resize(nrows);
  model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
  model_outputs.outlier_decimals_distr.resize(nrows, 0);
+ model_outputs.min_decimals_col.resize(nrows);

  model_outputs.outlier_scores_final.shrink_to_fit();
  model_outputs.outlier_clusters_final.shrink_to_fit();
@@ -612,6 +619,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
  model_outputs.outlier_trees_final.shrink_to_fit();
  model_outputs.outlier_depth_final.shrink_to_fit();
  model_outputs.outlier_decimals_distr.shrink_to_fit();
+ model_outputs.min_decimals_col.shrink_to_fit();
  }

  void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
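
These four hunks wire the new min_decimals_col vector into the same clear()/resize() plus shrink_to_fit() pattern used for the other per-row outputs. The extra call is what actually releases memory: clear() (and a shrinking resize()) leave a std::vector's capacity untouched, while shrink_to_fit() is the non-binding request that gives it back. A small self-contained illustration, unrelated to the package's own types:

    #include <cstdio>
    #include <vector>

    int main(void)
    {
        std::vector<double> v(1000000);
        v.clear();                 /* size is now 0, but the capacity is still ~1e6 elements */
        std::printf("after clear:  %zu\n", v.capacity());
        v.shrink_to_fit();         /* request that the unused capacity be released */
        std::printf("after shrink: %zu\n", v.capacity());
        return 0;
    }
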
@@ -683,3 +691,156 @@ void dealloc_ModelOutputs(ModelOutputs &model_outputs)
  {
  model_outputs.~ModelOutputs();
  }
+
+ ModelOutputs get_empty_ModelOutputs()
+ {
+ return ModelOutputs();
+ }
+
+ bool get_has_openmp()
+ {
+ #ifdef _OPENMP
+ return true;
+ #else
+ return false;
+ #endif
+ }
+
+ bool interrupt_switch = false;
+ bool handle_is_locked = false;
+
+ /* Function to handle interrupt signals */
+ void set_interrup_global_variable(int s)
+ {
+ #pragma omp critical
+ {
+ interrupt_switch = true;
+ }
+ }
+
+ void check_interrupt_switch(SignalSwitcher &ss)
+ {
+ if (interrupt_switch)
+ {
+ ss.restore_handle();
+ #ifndef _FOR_R
+ fprintf(stderr, "Error: procedure was interrupted\n");
+ #else
+ REprintf("Error: procedure was interrupted\n");
+ #endif
+ raise(SIGINT);
+ #ifdef _FOR_R
+ Rcpp::checkUserInterrupt();
+ #elif !defined(DONT_THROW_ON_INTERRUPT)
+ throw "Error: procedure was interrupted.\n";
+ #endif
+ }
+ }
+
+ #ifdef _FOR_PYTHON
+ bool cy_check_interrupt_switch()
+ {
+ return interrupt_switch;
+ }
+ void cy_tick_off_interrupt_switch()
+ {
+ interrupt_switch = false;
+ }
+ #endif
+
+ SignalSwitcher::SignalSwitcher()
+ {
+ #pragma omp critical
+ {
+ if (!handle_is_locked)
+ {
+ handle_is_locked = true;
+ interrupt_switch = false;
+ this->old_sig = signal(SIGINT, set_interrup_global_variable);
+ this->is_active = true;
+ }
+
+ else {
+ this->is_active = false;
+ }
+ }
+ }
+
+ SignalSwitcher::~SignalSwitcher()
+ {
+ #ifndef _FOR_PYTHON
+ #pragma omp critical
+ {
+ if (this->is_active && handle_is_locked)
+ interrupt_switch = false;
+ }
+ #endif
+ this->restore_handle();
+ }
+
+ void SignalSwitcher::restore_handle()
+ {
+ #pragma omp critical
+ {
+ if (this->is_active && handle_is_locked)
+ {
+ signal(SIGINT, this->old_sig);
+ this->is_active = false;
+ handle_is_locked = false;
+ }
+ }
+ }
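
The additions above form a small RAII-style interrupt guard: the SignalSwitcher constructor swaps in a SIGINT handler that only sets a global flag, check_interrupt_switch() polls that flag and aborts cleanly, and restore_handle() (also called from the destructor) puts the original handler back. Presumably it is used along these lines inside the long-running routines; the caller below is a hypothetical sketch, not code from the package:

    /* Hypothetical caller: keep the temporary SIGINT handler installed only
       for the duration of an expensive computation, polling between chunks. */
    void long_running_fit()
    {
        SignalSwitcher ss;                /* constructor installs the flag-setting handler */
        for (int chunk = 0; chunk < 100; chunk++)
        {
            /* ... expensive work on one chunk of the data ... */
            check_interrupt_switch(ss);   /* if Ctrl+C was pressed, restore the handler and bail out */
        }
        ss.restore_handle();              /* put the original handler back on the normal path */
    }
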
+
+ /* ceil(log2(x)) done with bit-wise operations ensures perfect precision (and it's faster too)
+ https://stackoverflow.com/questions/2589096/find-most-significant-bit-left-most-that-is-set-in-a-bit-array
+ https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers */
+ #if SIZE_MAX == UINT32_MAX /* 32-bit systems */
+ constexpr static const int MultiplyDeBruijnBitPosition[32] =
+ {
+ 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
+ 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
+ };
+ size_t log2ceil( size_t v )
+ {
+ v--;
+ v |= v >> 1; // first round down to one less than a power of 2
+ v |= v >> 2;
+ v |= v >> 4;
+ v |= v >> 8;
+ v |= v >> 16;
+
+ return MultiplyDeBruijnBitPosition[( uint32_t )( v * 0x07C4ACDDU ) >> 27] + 1;
+ }
+ #elif SIZE_MAX == UINT64_MAX /* 64-bit systems */
+ constexpr static const uint64_t tab64[64] = {
+ 63, 0, 58, 1, 59, 47, 53, 2,
+ 60, 39, 48, 27, 54, 33, 42, 3,
+ 61, 51, 37, 40, 49, 18, 28, 20,
+ 55, 30, 34, 11, 43, 14, 22, 4,
+ 62, 57, 46, 52, 38, 26, 32, 41,
+ 50, 36, 17, 19, 29, 10, 13, 21,
+ 56, 45, 25, 31, 35, 16, 9, 12,
+ 44, 24, 15, 8, 23, 7, 6, 5};
+
+ size_t log2ceil(size_t value)
+ {
+ value--;
+ value |= value >> 1;
+ value |= value >> 2;
+ value |= value >> 4;
+ value |= value >> 8;
+ value |= value >> 16;
+ value |= value >> 32;
+ return tab64[((uint64_t)((value - (value >> 1))*0x07EDD5E59A4E28C2)) >> 58] + 1;
+ }
+ #else /* other architectures - might not be entirely precise, and will be slower */
+ size_t log2ceil(size_t x) {return (size_t)(ceill(log2l((long double) x)));}
+ #endif
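
Both bit-twiddling branches compute floor(log2(v - 1)) + 1 by smearing the bits of v - 1 rightward and then looking up the position of the highest set bit in a De Bruijn table, which equals ceil(log2(v)) for any v of at least 2, with no floating-point rounding involved. A quick sanity check against the portable fallback (assuming inputs of 2 or more, which is presumably what the callers pass):

    #include <cmath>
    #include <cstdio>
    #include <cstddef>

    /* Reference: same formula as the portable fallback branch above. */
    static size_t log2ceil_ref(size_t x) { return (size_t) ceill(log2l((long double) x)); }

    int main(void)
    {
        const size_t tests[] = {2, 3, 4, 5, 1023, 1024, 1025};
        for (size_t i = 0; i < sizeof(tests) / sizeof(tests[0]); i++)
            printf("log2ceil(%zu) = %zu (reference %zu)\n",
                   tests[i], log2ceil(tests[i]), log2ceil_ref(tests[i]));
        return 0;
    }
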
+
+ #ifdef _FOR_PYTHON
+ ModelOutputs deepcopy(const ModelOutputs &inp)
+ {
+ ModelOutputs out = inp;
+ return out;
+ }
+ #endif