outliertree 0.1.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +104 -105
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +10 -8
- data/vendor/outliertree/src/Makevars +0 -3
| @@ -145,8 +145,8 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows | |
| 145 145 | 
             
                        private(running_mean, mean_prev, running_ssq, cnt, col_stop, xval, min_val, max_val, min_decimals_col)
         | 
| 146 146 | 
             
                for (size_t_for col = 0; col < ncols; col++) {
         | 
| 147 147 | 
             
                    running_mean = 0;
         | 
| 148 | 
            -
                    mean_prev = 0;
         | 
| 149 148 | 
             
                    running_ssq = 0;
         | 
| 149 | 
            +
                    mean_prev = numeric_data[col * nrows];
         | 
| 150 150 | 
             
                    min_val =  HUGE_VAL;
         | 
| 151 151 | 
             
                    max_val = -HUGE_VAL;
         | 
| 152 152 | 
             
                    cnt = 0;
         | 
| @@ -178,11 +178,12 @@ void check_missing_no_variance(double numeric_data[], size_t ncols, size_t nrows | |
| 178 178 | 
             
            void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[], size_t size_quarter, double *mean_central, double *sd_central)
         | 
| 179 179 | 
             
            {
         | 
| 180 180 | 
             
                long double running_mean = 0;
         | 
| 181 | 
            -
                long double mean_prev    = 0;
         | 
| 182 181 | 
             
                long double running_ssq  = 0;
         | 
| 182 | 
            +
                long double mean_prev    = 0;
         | 
| 183 183 | 
             
                double xval;
         | 
| 184 184 | 
             
                size_t st_offset = st + size_quarter;
         | 
| 185 185 | 
             
                if (ix_arr != NULL) {
         | 
| 186 | 
            +
                    mean_prev = x[ix_arr[st]];
         | 
| 186 187 | 
             
                    for (size_t row = st_offset; row <= (end - size_quarter); row++) {
         | 
| 187 188 | 
             
                        xval = x[ix_arr[row]];
         | 
| 188 189 | 
             
                        running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
         | 
| @@ -190,6 +191,7 @@ void calc_central_mean_and_sd(size_t ix_arr[], size_t st, size_t end, double x[] | |
| 190 191 | 
             
                        mean_prev     = running_mean;
         | 
| 191 192 | 
             
                    }
         | 
| 192 193 | 
             
                } else {
         | 
| 194 | 
            +
                    mean_prev = x[st_offset];
         | 
| 193 195 | 
             
                    for (size_t row = st_offset; row <= (end - size_quarter); row++) {
         | 
| 194 196 | 
             
                        xval = x[row];
         | 
| 195 197 | 
             
                        running_mean += (xval - running_mean) / (long double)(row - st_offset + 1);
         | 
| @@ -405,7 +407,7 @@ void divide_subset_split(size_t ix_arr[], double x[], size_t st, size_t end, dou | |
| 405 407 | 
             
            }
         | 
| 406 408 |  | 
| 407 409 | 
             
            /* for categorical */
         | 
| 408 | 
            -
            void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
         | 
| 410 | 
            +
            void divide_subset_split(size_t ix_arr[], int x[], size_t st, size_t end, signed char subset_categ[], int ncat, bool has_NA, size_t *split_NA, size_t *st_right)
         | 
| 409 411 | 
             
            {
         | 
| 410 412 | 
             
                size_t temp;
         | 
| 411 413 |  | 
| @@ -508,6 +510,7 @@ void backup_recursion_state(Workspace &workspace, RecursionState &state_backup) | |
| 508 510 | 
             
                state_backup.has_outliers_restore = workspace.has_outliers;
         | 
| 509 511 | 
             
                state_backup.lev_has_outliers_restore = workspace.lev_has_outliers;
         | 
| 510 512 | 
             
                state_backup.temp_ptr_x = workspace.temp_ptr_x;
         | 
| 513 | 
            +
                state_backup.is_binary_split_restore = workspace.is_binary_split;
         | 
| 511 514 | 
             
            }
         | 
| 512 515 |  | 
| 513 516 | 
             
            void restore_recursion_state(Workspace &workspace, RecursionState &state_backup)
         | 
| @@ -532,6 +535,7 @@ void restore_recursion_state(Workspace &workspace, RecursionState &state_backup) | |
| 532 535 | 
             
                workspace.has_outliers = state_backup.has_outliers_restore;
         | 
| 533 536 | 
             
                workspace.lev_has_outliers = state_backup.lev_has_outliers_restore;
         | 
| 534 537 | 
             
                workspace.temp_ptr_x = state_backup.temp_ptr_x;
         | 
| 538 | 
            +
                workspace.is_binary_split = state_backup.is_binary_split_restore;
         | 
| 535 539 | 
             
            }
         | 
| 536 540 |  | 
| 537 541 | 
             
            /* Next split on the trees is only decided after they are already initialized */
         | 
| @@ -542,7 +546,7 @@ void set_tree_as_numeric(ClusterTree &tree, double split_point, size_t col) | |
| 542 546 | 
             
                tree.col_num = col;
         | 
| 543 547 | 
             
            }
         | 
| 544 548 |  | 
| 545 | 
            -
            void set_tree_as_categorical(ClusterTree &tree, int ncat, char *split_subset, size_t col)
         | 
| 549 | 
            +
            void set_tree_as_categorical(ClusterTree &tree, int ncat, signed char *split_subset, size_t col)
         | 
| 546 550 | 
             
            {
         | 
| 547 551 | 
             
                tree.column_type = Categorical;
         | 
| 548 552 | 
             
                tree.col_num = col;
         | 
| @@ -587,6 +591,7 @@ void forget_row_outputs(ModelOutputs &model_outputs) | |
| 587 591 | 
             
                model_outputs.outlier_trees_final.clear();
         | 
| 588 592 | 
             
                model_outputs.outlier_depth_final.clear();
         | 
| 589 593 | 
             
                model_outputs.outlier_decimals_distr.clear();
         | 
| 594 | 
            +
                model_outputs.min_decimals_col.clear();
         | 
| 590 595 |  | 
| 591 596 | 
             
                model_outputs.outlier_scores_final.shrink_to_fit();
         | 
| 592 597 | 
             
                model_outputs.outlier_clusters_final.shrink_to_fit();
         | 
| @@ -594,6 +599,7 @@ void forget_row_outputs(ModelOutputs &model_outputs) | |
| 594 599 | 
             
                model_outputs.outlier_trees_final.shrink_to_fit();
         | 
| 595 600 | 
             
                model_outputs.outlier_depth_final.shrink_to_fit();
         | 
| 596 601 | 
             
                model_outputs.outlier_decimals_distr.shrink_to_fit();
         | 
| 602 | 
            +
                model_outputs.min_decimals_col.shrink_to_fit();
         | 
| 597 603 | 
             
            }
         | 
| 598 604 |  | 
| 599 605 | 
             
            void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_depth)
         | 
| @@ -605,6 +611,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_ | |
| 605 611 | 
             
                model_outputs.outlier_trees_final.resize(nrows);
         | 
| 606 612 | 
             
                model_outputs.outlier_depth_final.resize(nrows, max_depth + 2);
         | 
| 607 613 | 
             
                model_outputs.outlier_decimals_distr.resize(nrows, 0);
         | 
| 614 | 
            +
                model_outputs.min_decimals_col.resize(nrows);
         | 
| 608 615 |  | 
| 609 616 | 
             
                model_outputs.outlier_scores_final.shrink_to_fit();
         | 
| 610 617 | 
             
                model_outputs.outlier_clusters_final.shrink_to_fit();
         | 
| @@ -612,6 +619,7 @@ void allocate_row_outputs(ModelOutputs &model_outputs, size_t nrows, size_t max_ | |
| 612 619 | 
             
                model_outputs.outlier_trees_final.shrink_to_fit();
         | 
| 613 620 | 
             
                model_outputs.outlier_depth_final.shrink_to_fit();
         | 
| 614 621 | 
             
                model_outputs.outlier_decimals_distr.shrink_to_fit();
         | 
| 622 | 
            +
                model_outputs.min_decimals_col.shrink_to_fit();
         | 
| 615 623 | 
             
            }
         | 
| 616 624 |  | 
| 617 625 | 
             
            void check_more_two_values(double arr_num[], size_t nrows, size_t ncols, int nthreads, char too_few_values[])
         | 
| @@ -683,3 +691,156 @@ void dealloc_ModelOutputs(ModelOutputs &model_outputs) | |
| 683 691 | 
             
            {
         | 
| 684 692 | 
             
                model_outputs.~ModelOutputs();
         | 
| 685 693 | 
             
            }
         | 
| 694 | 
            +
             | 
| 695 | 
            +
            ModelOutputs get_empty_ModelOutputs()
         | 
| 696 | 
            +
            {
         | 
| 697 | 
            +
                return ModelOutputs();
         | 
| 698 | 
            +
            }
         | 
| 699 | 
            +
             | 
| 700 | 
            +
            bool get_has_openmp()
         | 
| 701 | 
            +
            {
         | 
| 702 | 
            +
                #ifdef _OPENMP
         | 
| 703 | 
            +
                return true;
         | 
| 704 | 
            +
                #else
         | 
| 705 | 
            +
                return false;
         | 
| 706 | 
            +
                #endif
         | 
| 707 | 
            +
            }
         | 
| 708 | 
            +
             | 
| 709 | 
            +
            bool interrupt_switch = false;
         | 
| 710 | 
            +
            bool handle_is_locked = false;
         | 
| 711 | 
            +
             | 
| 712 | 
            +
            /* Function to handle interrupt signals */
         | 
| 713 | 
            +
            void set_interrup_global_variable(int s)
         | 
| 714 | 
            +
            {
         | 
| 715 | 
            +
                #pragma omp critical
         | 
| 716 | 
            +
                {
         | 
| 717 | 
            +
                    interrupt_switch = true;
         | 
| 718 | 
            +
                }
         | 
| 719 | 
            +
            }
         | 
| 720 | 
            +
             | 
| 721 | 
            +
            void check_interrupt_switch(SignalSwitcher &ss)
         | 
| 722 | 
            +
            {
         | 
| 723 | 
            +
                if (interrupt_switch)
         | 
| 724 | 
            +
                {
         | 
| 725 | 
            +
                    ss.restore_handle();
         | 
| 726 | 
            +
                    #ifndef _FOR_R
         | 
| 727 | 
            +
                    fprintf(stderr, "Error: procedure was interrupted\n");
         | 
| 728 | 
            +
                    #else
         | 
| 729 | 
            +
                    REprintf("Error: procedure was interrupted\n");
         | 
| 730 | 
            +
                    #endif
         | 
| 731 | 
            +
                    raise(SIGINT);
         | 
| 732 | 
            +
                    #ifdef _FOR_R
         | 
| 733 | 
            +
                    Rcpp::checkUserInterrupt();
         | 
| 734 | 
            +
                    #elif !defined(DONT_THROW_ON_INTERRUPT)
         | 
| 735 | 
            +
                    throw "Error: procedure was interrupted.\n";
         | 
| 736 | 
            +
                    #endif
         | 
| 737 | 
            +
                }
         | 
| 738 | 
            +
            }
         | 
| 739 | 
            +
             | 
| 740 | 
            +
            #ifdef _FOR_PYTHON
         | 
| 741 | 
            +
            bool cy_check_interrupt_switch()
         | 
| 742 | 
            +
            {
         | 
| 743 | 
            +
                return interrupt_switch;
         | 
| 744 | 
            +
            }
         | 
| 745 | 
            +
            void cy_tick_off_interrupt_switch()
         | 
| 746 | 
            +
            {
         | 
| 747 | 
            +
                interrupt_switch = false;
         | 
| 748 | 
            +
            }
         | 
| 749 | 
            +
            #endif
         | 
| 750 | 
            +
             | 
| 751 | 
            +
            SignalSwitcher::SignalSwitcher()
         | 
| 752 | 
            +
            {
         | 
| 753 | 
            +
                #pragma omp critical
         | 
| 754 | 
            +
                {
         | 
| 755 | 
            +
                    if (!handle_is_locked)
         | 
| 756 | 
            +
                    {
         | 
| 757 | 
            +
                        handle_is_locked = true;
         | 
| 758 | 
            +
                        interrupt_switch = false;
         | 
| 759 | 
            +
                        this->old_sig = signal(SIGINT, set_interrup_global_variable);
         | 
| 760 | 
            +
                        this->is_active = true;
         | 
| 761 | 
            +
                    }
         | 
| 762 | 
            +
             | 
| 763 | 
            +
                    else {
         | 
| 764 | 
            +
                        this->is_active = false;
         | 
| 765 | 
            +
                    }
         | 
| 766 | 
            +
                }
         | 
| 767 | 
            +
            }
         | 
| 768 | 
            +
             | 
| 769 | 
            +
            SignalSwitcher::~SignalSwitcher()
         | 
| 770 | 
            +
            {
         | 
| 771 | 
            +
                #ifndef _FOR_PYTHON
         | 
| 772 | 
            +
                #pragma omp critical
         | 
| 773 | 
            +
                {
         | 
| 774 | 
            +
                    if (this->is_active && handle_is_locked)
         | 
| 775 | 
            +
                        interrupt_switch = false;
         | 
| 776 | 
            +
                }
         | 
| 777 | 
            +
                #endif
         | 
| 778 | 
            +
                this->restore_handle();
         | 
| 779 | 
            +
            }
         | 
| 780 | 
            +
             | 
| 781 | 
            +
            void SignalSwitcher::restore_handle()
         | 
| 782 | 
            +
            {
         | 
| 783 | 
            +
                #pragma omp critical
         | 
| 784 | 
            +
                {
         | 
| 785 | 
            +
                    if (this->is_active && handle_is_locked)
         | 
| 786 | 
            +
                    {
         | 
| 787 | 
            +
                        signal(SIGINT, this->old_sig);
         | 
| 788 | 
            +
                        this->is_active = false;
         | 
| 789 | 
            +
                        handle_is_locked = false;
         | 
| 790 | 
            +
                    }
         | 
| 791 | 
            +
                }
         | 
| 792 | 
            +
            }
         | 
| 793 | 
            +
             | 
| 794 | 
            +
            /* ceil(log2(x)) done with bit-wise operations ensures perfect precision (and it's faster too)
         | 
| 795 | 
            +
               https://stackoverflow.com/questions/2589096/find-most-significant-bit-left-most-that-is-set-in-a-bit-array
         | 
| 796 | 
            +
               https://stackoverflow.com/questions/11376288/fast-computing-of-log2-for-64-bit-integers  */
         | 
| 797 | 
            +
            #if SIZE_MAX == UINT32_MAX /* 32-bit systems */
         | 
| 798 | 
            +
                constexpr static const int MultiplyDeBruijnBitPosition[32] =
         | 
| 799 | 
            +
                {
         | 
| 800 | 
            +
                    0, 9,  1,  10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30,
         | 
| 801 | 
            +
                    8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6,  26, 5,  4, 31
         | 
| 802 | 
            +
                };
         | 
| 803 | 
            +
                size_t log2ceil( size_t v )
         | 
| 804 | 
            +
                {
         | 
| 805 | 
            +
                    v--;
         | 
| 806 | 
            +
                    v |= v >> 1; // first round down to one less than a power of 2
         | 
| 807 | 
            +
                    v |= v >> 2;
         | 
| 808 | 
            +
                    v |= v >> 4;
         | 
| 809 | 
            +
                    v |= v >> 8;
         | 
| 810 | 
            +
                    v |= v >> 16;
         | 
| 811 | 
            +
             | 
| 812 | 
            +
                    return MultiplyDeBruijnBitPosition[( uint32_t )( v * 0x07C4ACDDU ) >> 27] + 1;
         | 
| 813 | 
            +
                }
         | 
| 814 | 
            +
            #elif SIZE_MAX == UINT64_MAX /* 64-bit systems */
         | 
| 815 | 
            +
                constexpr static const uint64_t tab64[64] = {
         | 
| 816 | 
            +
                    63,  0, 58,  1, 59, 47, 53,  2,
         | 
| 817 | 
            +
                    60, 39, 48, 27, 54, 33, 42,  3,
         | 
| 818 | 
            +
                    61, 51, 37, 40, 49, 18, 28, 20,
         | 
| 819 | 
            +
                    55, 30, 34, 11, 43, 14, 22,  4,
         | 
| 820 | 
            +
                    62, 57, 46, 52, 38, 26, 32, 41,
         | 
| 821 | 
            +
                    50, 36, 17, 19, 29, 10, 13, 21,
         | 
| 822 | 
            +
                    56, 45, 25, 31, 35, 16,  9, 12,
         | 
| 823 | 
            +
                    44, 24, 15,  8, 23,  7,  6,  5};
         | 
| 824 | 
            +
             | 
| 825 | 
            +
                size_t log2ceil(size_t value)
         | 
| 826 | 
            +
                {
         | 
| 827 | 
            +
                    value--;
         | 
| 828 | 
            +
                    value |= value >> 1;
         | 
| 829 | 
            +
                    value |= value >> 2;
         | 
| 830 | 
            +
                    value |= value >> 4;
         | 
| 831 | 
            +
                    value |= value >> 8;
         | 
| 832 | 
            +
                    value |= value >> 16;
         | 
| 833 | 
            +
                    value |= value >> 32;
         | 
| 834 | 
            +
                    return tab64[((uint64_t)((value - (value >> 1))*0x07EDD5E59A4E28C2)) >> 58] + 1;
         | 
| 835 | 
            +
                }
         | 
| 836 | 
            +
            #else /* other architectures - might not be entirely precise, and will be slower */
         | 
| 837 | 
            +
                size_t log2ceil(size_t x) {return (size_t)(ceill(log2l((long double) x)));}
         | 
| 838 | 
            +
            #endif
         | 
| 839 | 
            +
             | 
| 840 | 
            +
            #ifdef _FOR_PYTHON
         | 
| 841 | 
            +
            ModelOutputs deepcopy(const ModelOutputs &inp)
         | 
| 842 | 
            +
            {
         | 
| 843 | 
            +
                ModelOutputs out = inp;
         | 
| 844 | 
            +
                return out;
         | 
| 845 | 
            +
            }
         | 
| 846 | 
            +
            #endif
         |