outliertree 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -145,8 +145,8 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
|
|
145
145
|
private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
|
146
146
|
for (size_t_for col = 0; col < ncols; col++) {
|
147
147
|
running_mean = 0;
|
148
|
-
mean_prev = 0;
|
149
148
|
running_ssq = 0;
|
149
|
+
mean_prev = numeric_data[col * nrows];
|
150
150
|
min_val = HUGE_VAL;
|
151
151
|
max_val = -HUGE_VAL;
|
152
152
|
cnt = 0;
|
@@ -178,11 +178,12 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
|
|
178
178
|
void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
|
179
179
|
{
|
180
180
|
long double running_mean = 0;
|
181
|
-
long double mean_prev = 0;
|
182
181
|
long double running_ssq = 0;
|
182
|
+
long double mean_prev = 0;
|
183
183
|
double xval;
|
184
184
|
size_t st_offset = st + size_quarter;
|
185
185
|
if (ix_arr != NULL) {
|
186
|
+
mean_prev = x[ix_arr[st]];
|
186
187
|
for (size_t row = st_offset; row <= (end - size_quarter); row++) {
|
187
188
|
xval = x[ix_arr[row]];
|
188
189
|
running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
|
@@ -190,6 +191,7 @@ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[]
|
|
190
191
|
mean_prev = running_mean;
|
191
192
|
}
|
192
193
|
} else {
|
194
|
+
mean_prev = x[st_offset];
|
193
195
|
for (size_t row = st_offset; row <= (end - size_quarter); row++) {
|
194
196
|
xval = x[row];
|
195
197
|
running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
|
@@ -405,7 +407,7 @@ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, dou
|
|
405
407
|
}
|
406
408
|
|
407
409
|
/* for categorical */
|
408
|
-
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
|
410
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
|
409
411
|
{
|
410
412
|
size_t temp;
|
411
413
|
|
@@ -508,6 +510,7 @@ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup)
|
|
508
510
|
state_backup.has_outliers_restore = workspace.has_outliers;
|
509
511
|
state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
|
510
512
|
state_backup.temp_ptr_x = workspace.temp_ptr_x;
|
513
|
+
state_backup.is_binary_split_restore = workspace.is_binary_split;
|
511
514
|
}
|
512
515
|
|
513
516
|
void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
|
@@ -532,6 +535,7 @@ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
|
|
532
535
|
workspace.has_outliers = state_backup.has_outliers_restore;
|
533
536
|
workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
|
534
537
|
workspace.temp_ptr_x = state_backup.temp_ptr_x;
|
538
|
+
workspace.is_binary_split = state_backup.is_binary_split_restore;
|
535
539
|
}
|
536
540
|
|
537
541
|
/* Next split on the trees is only decided after they are already initialized */
|
@@ -542,7 +546,7 @@ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col)
|
|
542
546
|
tree.col_num = col;
|
543
547
|
}
|
544
548
|
|
545
|
-
void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
|
549
|
+
void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col)
|
546
550
|
{
|
547
551
|
tree.column_type = Categorical;
|
548
552
|
tree.col_num = col;
|
@@ -587,6 +591,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
|
|
587
591
|
model_outputs.outlier_trees_final.clear();
|
588
592
|
model_outputs.outlier_depth_final.clear();
|
589
593
|
model_outputs.outlier_decimals_distr.clear();
|
594
|
+
model_outputs.min_decimals_col.clear();
|
590
595
|
|
591
596
|
model_outputs.outlier_scores_final.shrink_to_fit();
|
592
597
|
model_outputs.outlier_clusters_final.shrink_to_fit();
|
@@ -594,6 +599,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
|
|
594
599
|
model_outputs.outlier_trees_final.shrink_to_fit();
|
595
600
|
model_outputs.outlier_depth_final.shrink_to_fit();
|
596
601
|
model_outputs.outlier_decimals_distr.shrink_to_fit();
|
602
|
+
model_outputs.min_decimals_col.shrink_to_fit();
|
597
603
|
}
|
598
604
|
|
599
605
|
void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
|
@@ -605,6 +611,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
|
|
605
611
|
model_outputs.outlier_trees_final.resize(nrows);
|
606
612
|
model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
|
607
613
|
model_outputs.outlier_decimals_distr.resize(nrows, 0);
|
614
|
+
model_outputs.min_decimals_col.resize(nrows);
|
608
615
|
|
609
616
|
model_outputs.outlier_scores_final.shrink_to_fit();
|
610
617
|
model_outputs.outlier_clusters_final.shrink_to_fit();
|
@@ -612,6 +619,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
|
|
612
619
|
model_outputs.outlier_trees_final.shrink_to_fit();
|
613
620
|
model_outputs.outlier_depth_final.shrink_to_fit();
|
614
621
|
model_outputs.outlier_decimals_distr.shrink_to_fit();
|
622
|
+
model_outputs.min_decimals_col.shrink_to_fit();
|
615
623
|
}
|
616
624
|
|
617
625
|
void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
|
@@ -683,3 +691,156 @@ void dealloc_ModelOutputs(ModelOutputs &model_outputs)
|
|
683
691
|
{
|
684
692
|
model_outputs.~ModelOutputs();
|
685
693
|
}
|
694
|
+
|
695
|
+
/* Returns a value-initialized 'ModelOutputs' object, i.e. one holding nothing
   beyond what its default construction provides. */
ModelOutputs get_empty_ModelOutputs()
{
    ModelOutputs empty_outputs = ModelOutputs();
    return empty_outputs;
}
|
699
|
+
|
700
|
+
/* Reports whether this translation unit was compiled with OpenMP support,
   as indicated by the '_OPENMP' preprocessor macro being defined. */
bool get_has_openmp()
{
    #ifdef _OPENMP
    const bool compiled_with_openmp = true;
    #else
    const bool compiled_with_openmp = false;
    #endif
    return compiled_with_openmp;
}
|
708
|
+
|
709
|
+
/* Global state used for SIGINT handling.
   'interrupt_switch' is raised by the signal handler below and is read by
   'check_interrupt_switch'. 'handle_is_locked' records whether some
   'SignalSwitcher' currently has the custom handler installed. */
bool interrupt_switch = false;
bool handle_is_locked = false;

/* Function to handle interrupt signals.
   It only raises the global interrupt flag; the signal number 's' is not used.

   No OpenMP lock is taken here on purpose: the handler may run on a thread that
   is already inside the unnamed 'omp critical' region used while installing or
   restoring the handler, and critical regions are not re-entrant, so trying to
   enter one from the handler could deadlock. Readers of the flag do not take
   that lock either, so it would not add any protection. */
void set_interrup_global_variable(int s)
{
    (void) s;
    interrupt_switch = true;
}
|
720
|
+
|
721
|
+
/* Aborts the current procedure if an interrupt has been flagged.

   Does nothing while 'interrupt_switch' is false. Otherwise, in this order:
     1. asks 'ss' to restore the previously-installed SIGINT handler;
     2. prints an error message (through 'REprintf' under _FOR_R, to stderr otherwise);
     3. raises SIGINT again, so it reaches whichever handler is installed by then;
     4. under _FOR_R calls 'Rcpp::checkUserInterrupt'; in other builds throws a
        C string, unless DONT_THROW_ON_INTERRUPT is defined. */
void check_interrupt_switch(SignalSwitcher &ss)
{
    if (!interrupt_switch)
        return;

    ss.restore_handle();

    #ifdef _FOR_R
    REprintf("Error: procedure was interrupted\n");
    #else
    fprintf(stderr, "Error: procedure was interrupted\n");
    #endif

    raise(SIGINT);

    #if defined(_FOR_R)
    Rcpp::checkUserInterrupt();
    #elif !defined(DONT_THROW_ON_INTERRUPT)
    throw "Error: procedure was interrupted.\n";
    #endif
}
|
739
|
+
|
740
|
+
#ifdef _FOR_PYTHON
/* Accessors over the global interrupt flag, compiled only when _FOR_PYTHON is defined. */

/* Returns the current value of 'interrupt_switch'. */
bool cy_check_interrupt_switch()
{
    const bool was_interrupted = interrupt_switch;
    return was_interrupted;
}

/* Sets 'interrupt_switch' back to false. */
void cy_tick_off_interrupt_switch()
{
    interrupt_switch = false;
}
#endif
|
750
|
+
|
751
|
+
/* Installs 'set_interrup_global_variable' as the SIGINT handler, unless the
   handler is already marked as installed through 'handle_is_locked'.

   The object that performs the installation marks the handler as locked, clears
   the interrupt flag, keeps the previous handler in 'old_sig' and becomes
   active. Any object constructed while the handler is locked stays inactive.
   The whole check-and-install sequence runs inside an 'omp critical' region. */
SignalSwitcher::SignalSwitcher()
{
    #pragma omp critical
    {
        if (handle_is_locked)
        {
            is_active = false;
        }

        else
        {
            handle_is_locked = true;
            interrupt_switch = false;
            old_sig = signal(SIGINT, set_interrup_global_variable);
            is_active = true;
        }
    }
}
|
768
|
+
|
769
|
+
/* On destruction, hands SIGINT back to the previous handler via 'restore_handle'.

   When not compiled with _FOR_PYTHON, the interrupt flag is first cleared if this
   object is still active and the handler is still locked. Under _FOR_PYTHON the
   flag is left untouched here (presumably so that it can still be queried and
   reset through the 'cy_*' accessors - confirm against the Python wrapper). */
SignalSwitcher::~SignalSwitcher()
{
    #ifndef _FOR_PYTHON
    #pragma omp critical
    {
        const bool owns_handler = is_active && handle_is_locked;
        if (owns_handler)
            interrupt_switch = false;
    }
    #endif
    restore_handle();
}
|
780
|
+
|
781
|
+
void SignalSwitcher::restore_handle()
|
782
|
+
{
|
783
|
+
#pragma omp critical
|
784
|
+
{
|
785
|
+
if (this->is_active && handle_is_locked)
|
786
|
+
{
|
787
|
+
signal(SIGINT, this->old_sig);
|
788
|
+
this->is_active = false;
|
789
|
+
handle_is_locked = false;
|
790
|
+
}
|
791
|
+
}
|
792
|
+
}
|
793
|
+
|
794
|
+
/* ceil(log2(x)), computed with bit-wise operations where the width of 'size_t' is known.
   Doing it on integers ensures perfect precision (and it's faster too).
   https://stackoverflow.com/questions/2589096/find-most-significant-bit-left-most-that-is-set-in-a-bit-array
   https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers

   How the bit-wise variants work: after subtracting one, the highest set bit is
   smeared over all lower bits, and a De Bruijn multiplication plus table lookup
   gives the position of that highest bit (i.e. floor(log2(x - 1))); adding one
   to it yields ceil(log2(x)) for every x >= 2.

   Inputs 0 and 1 are answered up-front with 0 in every variant: for x == 1 the
   decrement leaves zero, for which the table lookup does not give a bit position
   (it used to produce 1 on 32-bit and 64 on 64-bit instead of 0), and for x == 0
   the decrement would wrap around (log2(0) is undefined; 0 is returned by convention). */
#if SIZE_MAX == UINT32_MAX /* 32-bit systems */
static const int MultiplyDeBruijnBitPosition[32] =
{
    0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
    8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
};
size_t log2ceil( size_t v )
{
    if (v <= 1) return 0;

    v--;
    v |= v >> 1; // first round down to one less than a power of 2
    v |= v >> 2;
    v |= v >> 4;
    v |= v >> 8;
    v |= v >> 16;

    return MultiplyDeBruijnBitPosition[( uint32_t )( v * 0x07C4ACDDU ) >> 27] + 1;
}
#elif SIZE_MAX == UINT64_MAX /* 64-bit systems */
static const uint64_t tab64[64] = {
    63,  0, 58,  1, 59, 47, 53,  2,
    60, 39, 48, 27, 54, 33, 42,  3,
    61, 51, 37, 40, 49, 18, 28, 20,
    55, 30, 34, 11, 43, 14, 22,  4,
    62, 57, 46, 52, 38, 26, 32, 41,
    50, 36, 17, 19, 29, 10, 13, 21,
    56, 45, 25, 31, 35, 16,  9, 12,
    44, 24, 15,  8, 23,  7,  6,  5};

size_t log2ceil(size_t value)
{
    if (value <= 1) return 0;

    value--;
    value |= value >> 1; // smear the highest set bit over all lower positions
    value |= value >> 2;
    value |= value >> 4;
    value |= value >> 8;
    value |= value >> 16;
    value |= value >> 32;
    /* (value - (value >> 1)) isolates the highest set bit before the lookup */
    return tab64[((uint64_t)((value - (value >> 1))*0x07EDD5E59A4E28C2)) >> 58] + 1;
}
#else /* other architectures - might not be entirely precise, and will be slower */
size_t log2ceil(size_t x)
{
    if (x <= 1) return 0;
    return (size_t)(ceill(log2l((long double) x)));
}
#endif
|
839
|
+
|
840
|
+
#ifdef _FOR_PYTHON
/* Returns a copy of 'inp' made through the copy constructor of 'ModelOutputs'.
   Compiled only when _FOR_PYTHON is defined. */
ModelOutputs deepcopy(const ModelOutputs &inp)
{
    return ModelOutputs(inp);
}
#endif
|