outliertree 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +23 -0
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +83 -41
- data/vendor/outliertree/src/Makevars.in +3 -0
- data/vendor/outliertree/src/Makevars.win +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +17 -27
- data/vendor/outliertree/src/Rwrapper.cpp +354 -62
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +525 -331
- data/vendor/outliertree/src/misc.cpp +166 -17
- data/vendor/outliertree/src/outlier_tree.hpp +164 -56
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -39,7 +39,7 @@
|
|
39
39
|
* at which position will the counts for a given column start. Note that NAs are stored as the last index in each
|
40
40
|
* column, so each one needs one extra category
|
41
41
|
*/
|
42
|
-
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols,
|
42
|
+
int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t ncols, char skip_col[], int max_categ)
|
43
43
|
{
|
44
44
|
for (size_t col = 0; col < ncols; col++) {
|
45
45
|
max_categ = std::max(ncat[col], max_categ);
|
@@ -53,7 +53,7 @@ int calculate_category_indices(size_t start_ix_cat_counts[], int ncat[], size_t
|
|
53
53
|
/* Save the counts of each category for each column in the array determined above */
|
54
54
|
void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
55
55
|
int categorical_data[], size_t ncols, size_t nrows,
|
56
|
-
|
56
|
+
char has_NA[], char skip_col[], int nthreads)
|
57
57
|
{
|
58
58
|
size_t col_st_offset;
|
59
59
|
size_t col_stop;
|
@@ -80,7 +80,7 @@ void calculate_all_cat_counts(size_t start_ix_cat_counts[], size_t cat_counts[],
|
|
80
80
|
|
81
81
|
/* Check if some column has a large majority that would make any split fail to meet minimum sizes */
|
82
82
|
void check_cat_col_unsplittable(size_t start_ix_cat_counts[], size_t cat_counts[], int ncat[],
|
83
|
-
size_t ncols, size_t min_conditioned_size, size_t nrows,
|
83
|
+
size_t ncols, size_t min_conditioned_size, size_t nrows, char skip_col[], int nthreads)
|
84
84
|
{
|
85
85
|
size_t largest_cnt;
|
86
86
|
#pragma omp parallel for num_threads(nthreads) private(largest_cnt) shared(ncols, nrows, ncat, cat_counts, start_ix_cat_counts, min_conditioned_size, skip_col)
|
@@ -127,8 +127,8 @@ void calculate_lowerlim_proportion(long double *restrict prop_small, long double
|
|
127
127
|
|
128
128
|
/* Check if a numerical column has no variance (i.e. will not be splittable).
|
129
129
|
Along the way, also record the number of decimals to display for this column. */
|
130
|
-
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows,
|
131
|
-
|
130
|
+
void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows, char has_NA[],
|
131
|
+
char skip_col[], int min_decimals[], int nthreads)
|
132
132
|
{
|
133
133
|
long double running_mean;
|
134
134
|
long double mean_prev;
|
@@ -145,8 +145,8 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
|
|
145
145
|
private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
|
146
146
|
for (size_t_for col = 0; col < ncols; col++) {
|
147
147
|
running_mean = 0;
|
148
|
-
mean_prev = 0;
|
149
148
|
running_ssq = 0;
|
149
|
+
mean_prev = numeric_data[col * nrows];
|
150
150
|
min_val = HUGE_VAL;
|
151
151
|
max_val = -HUGE_VAL;
|
152
152
|
cnt = 0;
|
@@ -178,11 +178,12 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows
|
|
178
178
|
void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
|
179
179
|
{
|
180
180
|
long double running_mean = 0;
|
181
|
-
long double mean_prev = 0;
|
182
181
|
long double running_ssq = 0;
|
182
|
+
long double mean_prev = 0;
|
183
183
|
double xval;
|
184
184
|
size_t st_offset = st + size_quarter;
|
185
185
|
if (ix_arr != NULL) {
|
186
|
+
mean_prev = x[ix_arr[st]];
|
186
187
|
for (size_t row = st_offset; row <= (end - size_quarter); row++) {
|
187
188
|
xval = x[ix_arr[row]];
|
188
189
|
running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
|
@@ -190,6 +191,7 @@ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[]
|
|
190
191
|
mean_prev = running_mean;
|
191
192
|
}
|
192
193
|
} else {
|
194
|
+
mean_prev = x[st_offset];
|
193
195
|
for (size_t row = st_offset; row <= (end - size_quarter); row++) {
|
194
196
|
xval = x[row];
|
195
197
|
running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
|
@@ -405,7 +407,7 @@ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, dou
|
|
405
407
|
}
|
406
408
|
|
407
409
|
/* for categorical */
|
408
|
-
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
|
410
|
+
void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
|
409
411
|
{
|
410
412
|
size_t temp;
|
411
413
|
|
@@ -508,6 +510,7 @@ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup)
|
|
508
510
|
state_backup.has_outliers_restore = workspace.has_outliers;
|
509
511
|
state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
|
510
512
|
state_backup.temp_ptr_x = workspace.temp_ptr_x;
|
513
|
+
state_backup.is_binary_split_restore = workspace.is_binary_split;
|
511
514
|
}
|
512
515
|
|
513
516
|
void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
|
@@ -532,6 +535,7 @@ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
|
|
532
535
|
workspace.has_outliers = state_backup.has_outliers_restore;
|
533
536
|
workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
|
534
537
|
workspace.temp_ptr_x = state_backup.temp_ptr_x;
|
538
|
+
workspace.is_binary_split = state_backup.is_binary_split_restore;
|
535
539
|
}
|
536
540
|
|
537
541
|
/* Next split on the trees is only decided after they are already initialized */
|
@@ -542,7 +546,7 @@ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col)
|
|
542
546
|
tree.col_num = col;
|
543
547
|
}
|
544
548
|
|
545
|
-
void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
|
549
|
+
void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col)
|
546
550
|
{
|
547
551
|
tree.column_type = Categorical;
|
548
552
|
tree.col_num = col;
|
@@ -587,6 +591,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
|
|
587
591
|
model_outputs.outlier_trees_final.clear();
|
588
592
|
model_outputs.outlier_depth_final.clear();
|
589
593
|
model_outputs.outlier_decimals_distr.clear();
|
594
|
+
model_outputs.min_decimals_col.clear();
|
590
595
|
|
591
596
|
model_outputs.outlier_scores_final.shrink_to_fit();
|
592
597
|
model_outputs.outlier_clusters_final.shrink_to_fit();
|
@@ -594,6 +599,7 @@ void forget_row_outputs(ModelOutputs &model_outputs)
|
|
594
599
|
model_outputs.outlier_trees_final.shrink_to_fit();
|
595
600
|
model_outputs.outlier_depth_final.shrink_to_fit();
|
596
601
|
model_outputs.outlier_decimals_distr.shrink_to_fit();
|
602
|
+
model_outputs.min_decimals_col.shrink_to_fit();
|
597
603
|
}
|
598
604
|
|
599
605
|
void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
|
@@ -605,6 +611,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
|
|
605
611
|
model_outputs.outlier_trees_final.resize(nrows);
|
606
612
|
model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
|
607
613
|
model_outputs.outlier_decimals_distr.resize(nrows, 0);
|
614
|
+
model_outputs.min_decimals_col.resize(nrows);
|
608
615
|
|
609
616
|
model_outputs.outlier_scores_final.shrink_to_fit();
|
610
617
|
model_outputs.outlier_clusters_final.shrink_to_fit();
|
@@ -612,6 +619,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_
|
|
612
619
|
model_outputs.outlier_trees_final.shrink_to_fit();
|
613
620
|
model_outputs.outlier_depth_final.shrink_to_fit();
|
614
621
|
model_outputs.outlier_decimals_distr.shrink_to_fit();
|
622
|
+
model_outputs.min_decimals_col.shrink_to_fit();
|
615
623
|
}
|
616
624
|
|
617
625
|
void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
|
@@ -672,14 +680,155 @@ int decimals_diff(double val1, double val2)
|
|
672
680
|
return (int) res;
|
673
681
|
}
|
674
682
|
|
683
|
+
ModelOutputs get_empty_ModelOutputs()
|
684
|
+
{
|
685
|
+
return ModelOutputs();
|
686
|
+
}
|
675
687
|
|
676
|
-
|
677
|
-
|
678
|
-
|
679
|
-
|
680
|
-
|
681
|
-
|
682
|
-
|
688
|
+
bool get_has_openmp()
|
689
|
+
{
|
690
|
+
#ifdef _OPENMP
|
691
|
+
return true;
|
692
|
+
#else
|
693
|
+
return false;
|
694
|
+
#endif
|
695
|
+
}
|
696
|
+
|
697
|
+
bool interrupt_switch = false;
|
698
|
+
bool handle_is_locked = false;
|
699
|
+
|
700
|
+
/* Function to handle interrupt signals */
|
701
|
+
void set_interrup_global_variable(int s)
|
702
|
+
{
|
703
|
+
#pragma omp critical
|
704
|
+
{
|
705
|
+
interrupt_switch = true;
|
706
|
+
}
|
707
|
+
}
|
708
|
+
|
709
|
+
void check_interrupt_switch(SignalSwitcher &ss)
|
710
|
+
{
|
711
|
+
if (interrupt_switch)
|
712
|
+
{
|
713
|
+
ss.restore_handle();
|
714
|
+
#ifndef _FOR_R
|
715
|
+
fprintf(stderr, "Error: procedure was interrupted\n");
|
716
|
+
#else
|
717
|
+
REprintf("Error: procedure was interrupted\n");
|
718
|
+
#endif
|
719
|
+
raise(SIGINT);
|
720
|
+
#ifdef _FOR_R
|
721
|
+
Rcpp::checkUserInterrupt();
|
722
|
+
#elif !defined(DONT_THROW_ON_INTERRUPT)
|
723
|
+
throw "Error: procedure was interrupted.\n";
|
724
|
+
#endif
|
725
|
+
}
|
726
|
+
}
|
727
|
+
|
728
|
+
#ifdef _FOR_PYTHON
|
729
|
+
bool cy_check_interrupt_switch()
|
730
|
+
{
|
731
|
+
return interrupt_switch;
|
732
|
+
}
|
733
|
+
void cy_tick_off_interrupt_switch()
|
734
|
+
{
|
735
|
+
interrupt_switch = false;
|
736
|
+
}
|
737
|
+
#endif
|
738
|
+
|
739
|
+
SignalSwitcher::SignalSwitcher()
|
740
|
+
{
|
741
|
+
#pragma omp critical
|
742
|
+
{
|
743
|
+
if (!handle_is_locked)
|
744
|
+
{
|
745
|
+
handle_is_locked = true;
|
746
|
+
interrupt_switch = false;
|
747
|
+
this->old_sig = signal(SIGINT, set_interrup_global_variable);
|
748
|
+
this->is_active = true;
|
749
|
+
}
|
750
|
+
|
751
|
+
else {
|
752
|
+
this->is_active = false;
|
753
|
+
}
|
754
|
+
}
|
755
|
+
}
|
756
|
+
|
757
|
+
SignalSwitcher::~SignalSwitcher()
|
758
|
+
{
|
759
|
+
#ifndef _FOR_PYTHON
|
760
|
+
#pragma omp critical
|
761
|
+
{
|
762
|
+
if (this->is_active && handle_is_locked)
|
763
|
+
interrupt_switch = false;
|
764
|
+
}
|
765
|
+
#endif
|
766
|
+
this->restore_handle();
|
767
|
+
}
|
768
|
+
|
769
|
+
void SignalSwitcher::restore_handle()
|
770
|
+
{
|
771
|
+
#pragma omp critical
|
772
|
+
{
|
773
|
+
if (this->is_active && handle_is_locked)
|
774
|
+
{
|
775
|
+
signal(SIGINT, this->old_sig);
|
776
|
+
this->is_active = false;
|
777
|
+
handle_is_locked = false;
|
778
|
+
}
|
779
|
+
}
|
780
|
+
}
|
781
|
+
|
782
|
+
/* ceil(log2(x)) done with bit-wise operations ensures perfect precision (and it's faster too)
|
783
|
+
https://stackoverflow.com/questions/2589096/find-most-significant-bit-left-most-that-is-set-in-a-bit-array
|
784
|
+
https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers */
|
785
|
+
#if SIZE_MAX == UINT32_MAX /* 32-bit systems */
|
786
|
+
constexpr static const int MultiplyDeBruijnBitPosition[32] =
|
787
|
+
{
|
788
|
+
0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
|
789
|
+
8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31
|
790
|
+
};
|
791
|
+
size_t log2ceil( size_t v )
|
792
|
+
{
|
793
|
+
v--;
|
794
|
+
v |= v >> 1; // first round down to one less than a power of 2
|
795
|
+
v |= v >> 2;
|
796
|
+
v |= v >> 4;
|
797
|
+
v |= v >> 8;
|
798
|
+
v |= v >> 16;
|
799
|
+
|
800
|
+
return MultiplyDeBruijnBitPosition[( uint32_t )( v * 0x07C4ACDDU ) >> 27] + 1;
|
801
|
+
}
|
802
|
+
#elif SIZE_MAX == UINT64_MAX /* 64-bit systems */
|
803
|
+
constexpr static const uint64_t tab64[64] = {
|
804
|
+
63, 0, 58, 1, 59, 47, 53, 2,
|
805
|
+
60, 39, 48, 27, 54, 33, 42, 3,
|
806
|
+
61, 51, 37, 40, 49, 18, 28, 20,
|
807
|
+
55, 30, 34, 11, 43, 14, 22, 4,
|
808
|
+
62, 57, 46, 52, 38, 26, 32, 41,
|
809
|
+
50, 36, 17, 19, 29, 10, 13, 21,
|
810
|
+
56, 45, 25, 31, 35, 16, 9, 12,
|
811
|
+
44, 24, 15, 8, 23, 7, 6, 5};
|
812
|
+
|
813
|
+
size_t log2ceil(size_t value)
|
814
|
+
{
|
815
|
+
value--;
|
816
|
+
value |= value >> 1;
|
817
|
+
value |= value >> 2;
|
818
|
+
value |= value >> 4;
|
819
|
+
value |= value >> 8;
|
820
|
+
value |= value >> 16;
|
821
|
+
value |= value >> 32;
|
822
|
+
return tab64[((uint64_t)((value - (value >> 1))*0x07EDD5E59A4E28C2)) >> 58] + 1;
|
823
|
+
}
|
824
|
+
#else /* other architectures - might not be entirely precise, and will be slower */
|
825
|
+
size_t log2ceil(size_t x) {return (size_t)(ceill(log2l((long double) x)));}
|
826
|
+
#endif
|
827
|
+
|
828
|
+
#ifdef _FOR_PYTHON
|
829
|
+
ModelOutputs deepcopy(const ModelOutputs &inp)
|
683
830
|
{
|
684
|
-
|
831
|
+
ModelOutputs out = inp;
|
832
|
+
return out;
|
685
833
|
}
|
834
|
+
#endif
|