outliertree 0.1.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +104 -105
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +10 -8
- data/vendor/outliertree/src/Makevars +0 -3
| @@ -121,8 +121,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 121 121 | 
             
                bool has_low_values  = false;
         | 
| 122 122 | 
             
                bool has_high_values = false;
         | 
| 123 123 | 
             
                long double running_mean = 0;
         | 
| 124 | 
            -
                long double mean_prev    = 0;
         | 
| 125 124 | 
             
                long double running_ssq  = 0;
         | 
| 125 | 
            +
                long double mean_prev    = 0;
         | 
| 126 126 | 
             
                double xval;
         | 
| 127 127 | 
             
                double mean;
         | 
| 128 128 | 
             
                double sd;
         | 
| @@ -134,6 +134,14 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 134 134 | 
             
                size_t end_normals   = 0;
         | 
| 135 135 | 
             
                double min_gap = z_outlier - z_norm;
         | 
| 136 136 |  | 
| 137 | 
            +
                double curr_gap, next_gap, eps, lim_by_orig;
         | 
| 138 | 
            +
             | 
| 139 | 
            +
                /* Note: there is no good reason and no theory behind these numbers.
         | 
| 140 | 
            +
                   TODO: find a better way of setting this */
         | 
| 141 | 
            +
                double min_gap_orig_scale = log(sqrtl((long double)(end - st + 1))) / 2.;
         | 
| 142 | 
            +
                min_gap_orig_scale = std::fmax(1.1, min_gap_orig_scale);
         | 
| 143 | 
            +
                min_gap_orig_scale = std::fmin(2.5, min_gap_orig_scale);
         | 
| 144 | 
            +
             | 
| 137 145 | 
             
                /* TODO: here it's not necessary to sort the whole data, only top/bottom N */
         | 
| 138 146 |  | 
| 139 147 | 
             
                /* sort the data */
         | 
| @@ -141,6 +149,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 141 149 |  | 
| 142 150 | 
             
                /* calculate statistics with tails and previous outliers excluded */
         | 
| 143 151 | 
             
                cnt = end_non_tail - st_non_tail + 1;
         | 
| 152 | 
            +
                mean_prev = x[ ix_arr[st_non_tail] ];
         | 
| 144 153 | 
             
                for (size_t row = st_non_tail; row <= end_non_tail; row++) {
         | 
| 145 154 | 
             
                    xval = x[ ix_arr[row] ];
         | 
| 146 155 | 
             
                    running_mean += (xval - running_mean) / (long double)(row - st_non_tail + 1);
         | 
| @@ -157,10 +166,16 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 157 166 | 
             
                if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
         | 
| 158 167 | 
             
                    sd *= 0.5;
         | 
| 159 168 | 
             
                }
         | 
| 169 | 
            +
                sd = std::fmax(sd, 1e-15);
         | 
| 170 | 
            +
                while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
         | 
| 171 | 
            +
                    sd *= 4;
         | 
| 160 172 | 
             
                cluster.cluster_mean = mean;
         | 
| 161 173 | 
             
                cluster.cluster_sd = sd;
         | 
| 162 174 | 
             
                cnt = end - st + 1;
         | 
| 163 175 |  | 
| 176 | 
            +
                /* TODO: review how to better set this limit */
         | 
| 177 | 
            +
                tail_size = std::min(tail_size, log2ceil(end - st + 1));
         | 
| 178 | 
            +
             | 
| 164 179 | 
             
                /* see if the minimum and/or maximum values qualify for outliers */
         | 
| 165 180 | 
             
                if (-z_score(x[ix_arr[st]],  mean, sd) >= z_outlier && x[ix_arr[st]]  > left_tail)  has_low_values  = true;
         | 
| 166 181 | 
             
                if ( z_score(x[ix_arr[end]], mean, sd) >= z_outlier && x[ix_arr[end]] < right_tail) has_high_values = true;
         | 
| @@ -170,6 +185,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 170 185 | 
             
                    for (size_t row = st; row < st + tail_size; row++) {
         | 
| 171 186 |  | 
| 172 187 | 
             
                        if (( z_score(x[ix_arr[row + 1]], mean, sd) - z_score(x[ix_arr[row]], mean, sd) ) >= min_gap) {
         | 
| 188 | 
            +
                            
         | 
| 189 | 
            +
                            /* if the variable was transformed, check that the gap is still wide in the original scale */
         | 
| 190 | 
            +
                            if (is_exp_transf || is_log_transf) {
         | 
| 191 | 
            +
                                curr_gap = orig_x[ix_arr[row + 1]] - orig_x[ix_arr[row]];
         | 
| 192 | 
            +
                                next_gap = 0;
         | 
| 193 | 
            +
                                for (size_t rr = row + 1; rr < end; rr++) {
         | 
| 194 | 
            +
                                    if (orig_x[ix_arr[rr+1]] > orig_x[ix_arr[rr]]) {
         | 
| 195 | 
            +
                                        next_gap = orig_x[ix_arr[rr+1]] - orig_x[ix_arr[rr]];
         | 
| 196 | 
            +
                                        break;
         | 
| 197 | 
            +
                                    }
         | 
| 198 | 
            +
                                }
         | 
| 199 | 
            +
             | 
| 200 | 
            +
                                if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
         | 
| 201 | 
            +
                                    continue;
         | 
| 202 | 
            +
                            }
         | 
| 203 | 
            +
             | 
| 173 204 | 
             
                            st_normals = row + 1;
         | 
| 174 205 | 
             
                            if (is_exp_transf) {
         | 
| 175 206 | 
             
                                cluster.lower_lim = log(x[ix_arr[row + 1]] - min_gap * sd) * orig_sd + orig_mean;
         | 
| @@ -180,6 +211,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 180 211 | 
             
                            }
         | 
| 181 212 | 
             
                            cluster.display_lim_low = orig_x[ix_arr[row + 1]];
         | 
| 182 213 | 
             
                            cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
         | 
| 214 | 
            +
             | 
| 215 | 
            +
                            eps = 1e-15;
         | 
| 216 | 
            +
                            while (cluster.display_lim_low <= cluster.lower_lim) {
         | 
| 217 | 
            +
                                cluster.lower_lim -= eps;
         | 
| 218 | 
            +
                                eps *= 4;
         | 
| 219 | 
            +
                            }
         | 
| 183 220 | 
             
                            break;
         | 
| 184 221 | 
             
                        }
         | 
| 185 222 | 
             
                        if (z_score(x[ix_arr[row]], mean, sd) > -z_outlier) break;
         | 
| @@ -233,6 +270,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 233 270 | 
             
                        cluster.lower_lim = exp(x[ix_arr[st]] - min_gap * sd) + log_minval;
         | 
| 234 271 | 
             
                    }
         | 
| 235 272 |  | 
| 273 | 
            +
                    if (cluster.lower_lim > -HUGE_VAL) {
         | 
| 274 | 
            +
                        eps = 1e-15;
         | 
| 275 | 
            +
                        while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
         | 
| 276 | 
            +
                            cluster.lower_lim -= eps;
         | 
| 277 | 
            +
                            eps *= 4.;
         | 
| 278 | 
            +
                        }
         | 
| 279 | 
            +
                    }
         | 
| 280 | 
            +
             | 
| 281 | 
            +
                    if (is_exp_transf || is_log_transf) {
         | 
| 282 | 
            +
                        for (size_t row = st; row < end; row++) {
         | 
| 283 | 
            +
                            if (orig_x[ix_arr[row+1]] > orig_x[ix_arr[row]]) {
         | 
| 284 | 
            +
                                curr_gap = orig_x[ix_arr[row+1]] - orig_x[ix_arr[row]];
         | 
| 285 | 
            +
                                lim_by_orig = orig_x[ix_arr[st]] - min_gap_orig_scale * curr_gap;
         | 
| 286 | 
            +
                                cluster.lower_lim = std::fmin(cluster.lower_lim, lim_by_orig);
         | 
| 287 | 
            +
                                break;
         | 
| 288 | 
            +
                            }
         | 
| 289 | 
            +
                        }
         | 
| 290 | 
            +
                    }
         | 
| 291 | 
            +
             | 
| 236 292 | 
             
                    cluster.display_lim_low = orig_x[ix_arr[st]];
         | 
| 237 293 |  | 
| 238 294 | 
             
                }
         | 
| @@ -241,6 +297,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 241 297 | 
             
                    for (size_t row = end; row > (end - tail_size); row--) {
         | 
| 242 298 |  | 
| 243 299 | 
             
                        if (( z_score(x[ix_arr[row]], mean, sd) - z_score(x[ix_arr[row - 1]], mean, sd) ) >= min_gap) {
         | 
| 300 | 
            +
                            
         | 
| 301 | 
            +
                            /* if the variable was transformed, check that the gap is still wide in the original scale */
         | 
| 302 | 
            +
                            if (is_exp_transf || is_log_transf) {
         | 
| 303 | 
            +
                                curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row - 1]];
         | 
| 304 | 
            +
                                next_gap = 0;
         | 
| 305 | 
            +
                                for (size_t rr = row-1; rr > st; rr--) {
         | 
| 306 | 
            +
                                    if (orig_x[ix_arr[rr]] > orig_x[ix_arr[rr-1]]) {
         | 
| 307 | 
            +
                                        next_gap = orig_x[ix_arr[rr]] - orig_x[ix_arr[rr-1]];
         | 
| 308 | 
            +
                                        break;
         | 
| 309 | 
            +
                                    }
         | 
| 310 | 
            +
                                }
         | 
| 311 | 
            +
             | 
| 312 | 
            +
                                if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
         | 
| 313 | 
            +
                                    continue;
         | 
| 314 | 
            +
                            }
         | 
| 315 | 
            +
             | 
| 244 316 | 
             
                            end_normals = row - 1;
         | 
| 245 317 | 
             
                            if (is_exp_transf) {
         | 
| 246 318 | 
             
                                cluster.upper_lim = log(x[ix_arr[row - 1]] + min_gap * sd) * orig_sd + orig_mean;
         | 
| @@ -251,6 +323,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 251 323 | 
             
                            }
         | 
| 252 324 | 
             
                            cluster.display_lim_high = orig_x[ix_arr[row - 1]];
         | 
| 253 325 | 
             
                            cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
         | 
| 326 | 
            +
             | 
| 327 | 
            +
                            eps = 1e-15;
         | 
| 328 | 
            +
                            while (cluster.display_lim_high >= cluster.upper_lim) {
         | 
| 329 | 
            +
                                cluster.upper_lim += eps;
         | 
| 330 | 
            +
                                eps *= 4;
         | 
| 331 | 
            +
                            }
         | 
| 254 332 | 
             
                            break;
         | 
| 255 333 | 
             
                        }
         | 
| 256 334 | 
             
                        if (z_score(x[ix_arr[row]], mean, sd) < z_outlier) break;
         | 
| @@ -305,6 +383,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 305 383 | 
             
                        cluster.upper_lim = exp(x[ix_arr[end]] + min_gap * sd) + log_minval;
         | 
| 306 384 | 
             
                    }
         | 
| 307 385 |  | 
| 386 | 
            +
                    if (cluster.upper_lim < HUGE_VAL) {
         | 
| 387 | 
            +
                        eps = 1e-15;
         | 
| 388 | 
            +
                        while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
         | 
| 389 | 
            +
                            cluster.upper_lim += eps;
         | 
| 390 | 
            +
                            eps *= 4.;
         | 
| 391 | 
            +
                        }
         | 
| 392 | 
            +
                    }
         | 
| 393 | 
            +
             | 
| 394 | 
            +
                    if (is_exp_transf || is_log_transf) {
         | 
| 395 | 
            +
                        for (size_t row = end; row < st; row--) {
         | 
| 396 | 
            +
                            if (orig_x[ix_arr[row]] > orig_x[ix_arr[row-1]]) {
         | 
| 397 | 
            +
                                curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row-1]];
         | 
| 398 | 
            +
                                lim_by_orig = orig_x[ix_arr[end]] + min_gap_orig_scale * curr_gap;
         | 
| 399 | 
            +
                                cluster.upper_lim = std::fmax(cluster.upper_lim, lim_by_orig);
         | 
| 400 | 
            +
                                break;
         | 
| 401 | 
            +
                            }
         | 
| 402 | 
            +
                        }
         | 
| 403 | 
            +
                    }
         | 
| 404 | 
            +
             | 
| 308 405 | 
             
                    cluster.display_lim_high = orig_x[ix_arr[end]];
         | 
| 309 406 | 
             
                }
         | 
| 310 407 |  | 
| @@ -313,8 +410,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 313 410 | 
             
                    size_t st_disp  = has_low_values?  st_normals  : st;
         | 
| 314 411 | 
             
                    size_t end_disp = has_high_values? end_normals : end;
         | 
| 315 412 | 
             
                    running_mean = 0;
         | 
| 316 | 
            -
                    mean_prev    = 0;
         | 
| 317 413 | 
             
                    running_ssq  = 0;
         | 
| 414 | 
            +
                    mean_prev    = orig_x[ix_arr[st_disp]];
         | 
| 318 415 | 
             
                    for (size_t row = st_disp; row <= end_disp; row++) {
         | 
| 319 416 | 
             
                        xval = orig_x[ix_arr[row]];
         | 
| 320 417 | 
             
                        running_mean += (xval - running_mean) / (long double)(row - st_disp + 1);
         | 
| @@ -372,7 +469,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_ | |
| 372 469 | 
             
            void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
         | 
| 373 470 | 
             
                                              double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
         | 
| 374 471 | 
             
                                              size_t *restrict outlier_depth, Cluster &cluster,
         | 
| 375 | 
            -
                                              size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm)
         | 
| 472 | 
            +
                                              size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm)
         | 
| 376 473 | 
             
            {
         | 
| 377 474 | 
             
                size_t cnt_common = end - st + 1;
         | 
| 378 475 | 
             
                cluster.cluster_size = cnt_common;
         | 
| @@ -474,7 +571,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s | |
| 474 571 | 
             
                                      double max_perc_outliers, double z_norm, double z_outlier,
         | 
| 475 572 | 
             
                                      long double *restrict perc_threshold, long double *restrict prop_prior,
         | 
| 476 573 | 
             
                                      size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
         | 
| 477 | 
            -
                                      size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
         | 
| 574 | 
            +
                                      size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
         | 
| 478 575 | 
             
                                      bool *restrict drop_cluster)
         | 
| 479 576 | 
             
            {
         | 
| 480 577 | 
             
                bool found_outliers, new_is_outlier;
         | 
| @@ -567,7 +664,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s | |
| 567 664 |  | 
| 568 665 | 
             
                        cluster.perc_in_subset = (long double) buffer_categ_counts[cluster.categ_maj] / tot_dbl;
         | 
| 569 666 | 
             
                        for (size_t cat = 0; cat < ncateg; cat++) {
         | 
| 570 | 
            -
                            if (cat == cluster.categ_maj)
         | 
| 667 | 
            +
                            if ((int)cat == cluster.categ_maj)
         | 
| 571 668 | 
             
                                continue;
         | 
| 572 669 | 
             
                            if (cluster.subset_common[cat] != 0) {
         | 
| 573 670 | 
             
                                cluster.score_categ[cat] = (long double)(tot - buffer_categ_counts[cluster.categ_maj] + 1)
         | 
| @@ -626,10 +723,10 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]) | |
| 626 723 | 
             
                            } else {
         | 
| 627 724 |  | 
| 628 725 | 
             
                                size_subset_excl = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
         | 
| 629 | 
            -
                                                                   [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
         | 
| 726 | 
            +
                                                                   [](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
         | 
| 630 727 | 
             
                                if (size_subset_excl > 0) continue;
         | 
| 631 728 | 
             
                                size_subset = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
         | 
| 632 | 
            -
                                                              [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
         | 
| 729 | 
            +
                                                              [](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
         | 
| 633 730 | 
             
                                if (size_subset == 1) {
         | 
| 634 731 |  | 
| 635 732 | 
             
                                    do {col_equal++;} while (clusters[clust].split_subset[col_equal] <= 0);
         | 
| @@ -681,6 +778,7 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[]) | |
| 681 778 | 
             
                            break;
         | 
| 682 779 | 
             
                        }
         | 
| 683 780 |  | 
| 781 | 
            +
                        default: {}
         | 
| 684 782 | 
             
                    }
         | 
| 685 783 |  | 
| 686 784 | 
             
                }
         | 
| @@ -711,7 +809,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]) | |
| 711 809 | 
             
                        case Categorical:
         | 
| 712 810 | 
             
                        {
         | 
| 713 811 | 
             
                            size_subset_excl = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
         | 
| 714 | 
            -
                                                               [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
         | 
| 812 | 
            +
                                                               [](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
         | 
| 715 813 | 
             
                            if (size_subset_excl > 0) continue;
         | 
| 716 814 |  | 
| 717 815 | 
             
                            col_equal = -1;
         | 
| @@ -747,6 +845,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]) | |
| 747 845 | 
             
                                            trees[tree].split_this_branch = Equal;
         | 
| 748 846 | 
             
                                            break;
         | 
| 749 847 | 
             
                                        }
         | 
| 848 | 
            +
             | 
| 849 | 
            +
                                        default: {}
         | 
| 750 850 | 
             
                                    }
         | 
| 751 851 | 
             
                                }
         | 
| 752 852 |  | 
| @@ -755,7 +855,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]) | |
| 755 855 | 
             
                            else {
         | 
| 756 856 |  | 
| 757 857 | 
             
                                size_subset = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
         | 
| 758 | 
            -
                                                              [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
         | 
| 858 | 
            +
                                                              [](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
         | 
| 759 859 | 
             
                                if (size_subset == 1) {
         | 
| 760 860 |  | 
| 761 861 | 
             
                                    do {col_equal++;} while (trees[tree].split_subset[col_equal] <= 0);
         | 
| @@ -778,6 +878,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]) | |
| 778 878 | 
             
                                                trees[tree].split_this_branch = Equal;
         | 
| 779 879 | 
             
                                                break;
         | 
| 780 880 | 
             
                                            }
         | 
| 881 | 
            +
             | 
| 882 | 
            +
                                            default: {}
         | 
| 781 883 | 
             
                                        }
         | 
| 782 884 | 
             
                                    }
         | 
| 783 885 |  | 
| @@ -801,6 +903,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]) | |
| 801 903 | 
             
                                                trees[tree].split_this_branch = Equal;
         | 
| 802 904 | 
             
                                                break;
         | 
| 803 905 | 
             
                                            }
         | 
| 906 | 
            +
             | 
| 907 | 
            +
                                            default: {}
         | 
| 804 908 | 
             
                                        }
         | 
| 805 909 | 
             
                                    }
         | 
| 806 910 |  | 
| @@ -846,6 +950,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[]) | |
| 846 950 | 
             
                            break;
         | 
| 847 951 | 
             
                        }
         | 
| 848 952 |  | 
| 953 | 
            +
                        default: {}
         | 
| 849 954 | 
             
                    }
         | 
| 850 955 |  | 
| 851 956 | 
             
                }
         |