RubyGems - outliertree - Versions diffs - 0.2.1 → 0.3.0 - Mend

outliertree 0.2.1 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/NOTICE.txt +1 -1
data/README.md +11 -10
data/ext/outliertree/extconf.rb +1 -1
data/lib/outliertree/result.rb +3 -3
data/lib/outliertree/version.rb +1 -1
data/vendor/outliertree/README.md +77 -40
data/vendor/outliertree/src/Makevars.in +4 -0
data/vendor/outliertree/src/Makevars.win +4 -0
data/vendor/outliertree/src/RcppExports.cpp +20 -9
data/vendor/outliertree/src/Rwrapper.cpp +256 -57
data/vendor/outliertree/src/cat_outlier.cpp +6 -6
data/vendor/outliertree/src/clusters.cpp +114 -9
data/vendor/outliertree/src/fit_model.cpp +505 -308
data/vendor/outliertree/src/misc.cpp +165 -4
data/vendor/outliertree/src/outlier_tree.hpp +159 -51
data/vendor/outliertree/src/outliertree-win.def +3 -0
data/vendor/outliertree/src/predict.cpp +33 -0
data/vendor/outliertree/src/split.cpp +124 -20
metadata +8 -6
data/vendor/outliertree/src/Makevars +0 -3

data/vendor/outliertree/src/split.cpp CHANGED Viewed

@@ -36,12 +36,15 @@
 /* TODO: don't divide the gains by tot at every calculation as it makes it slower */
-/* TODO: sorting here is the slowest thing, so it could be improved by using radix sort for categorical/ordinal and timsort for numerical */
+/* TODO: sorting here is the slowest thing, so it could be improved by using radix sort for categorical/ordinal */
 /* TODO: columns that split by numeric should output the sum/sum_sq to pass it to the cluster functions, instead of recalculating them later */
+/* TODO: the calculations of standard deviations when splitting a numeric column by a categorical column are
+   highly imprecise and might throw negative variances. Should switch to a more robust procedure. */
-void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, char onehot[])
+void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, signed char onehot[])
 {
     memset(onehot, 0, sizeof(bool) * n_tot);
     for (size_t i = 0; i <= n_true; i++) onehot[ix_arr[i]] = 1;
@@ -62,7 +65,7 @@ size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict ca
     return st_cat;
 }
-void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x)
+void flag_zero_counts(signed char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x)
 {
     for (size_t cat = 0; cat < ncat_x; cat++)
         if (buffer_cat_cnt[cat] == 0) split_subset[cat] = -1;
@@ -71,20 +74,20 @@ void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_
 long double calc_sd(size_t cnt, long double sum, long double sum_sq)
 {
     if (cnt < 3) return 0;
-    return sqrtl( (sum_sq - (square(sum) / (long double) cnt) + SD_REG) / (long double) (cnt - 1) );
+    return std::sqrt( (sum_sq - (square(sum) / (long double) cnt) + SD_REG) / (long double) (cnt - 1) );
 }
 long double calc_sd(NumericBranch &branch)
 {
     if (branch.cnt < 3) return 0;
-    return sqrtl((branch.sum_sq - (square(branch.sum) / (long double) branch.cnt) + SD_REG) / (long double) (branch.cnt - 1));
+    return std::sqrt((branch.sum_sq - (square(branch.sum) / (long double) branch.cnt) + SD_REG) / (long double) (branch.cnt - 1));
 }
 long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean)
 {
     long double running_mean = 0;
-    long double mean_prev    = 0;
     long double running_ssq  = 0;
+    long double mean_prev    = x[ix_arr[st]];
     double xval;
     for (size_t row = st; row <= end; row++) {
         xval = x[ix_arr[row]];
@@ -93,7 +96,7 @@ long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end,
         mean_prev     = running_mean;
     }
     *mean = (double) running_mean;
-    return sqrtl(running_ssq / (long double)(end - st));
+    return std::sqrt(running_ssq / (long double)(end - st));
 }
@@ -242,11 +245,13 @@ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size
 *    - split_left (out)
 *        Index at which the data is split between the two branches (includes last from left branch).
 *    - split_NA (out)
-*        Index at which the NA data is separated from the other branches
+*        Index at which the NA data is separated from the other branches.
+*    - has_zero_variance (out)
+*        Whether the 'x' column has zero variance (contains only one unique value).
 */
 void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
                              long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
-                             long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA)
+                             long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance)
 {
     *gain = -HUGE_VAL;
@@ -255,11 +260,12 @@ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, dou
     long double this_gain;
     long double cnt_dbl = (long double)(end - st + 1);
     long double running_mean = 0;
-    long double mean_prev    = 0;
     long double running_ssq  = 0;
+    long double mean_prev    = 0;
     double xval;
     long double info_left;
     long double info_NA = 0;
+    *has_zero_variance = false;
     /* check that there are enough observations for a split */
     if ((end - st + 1) < (2 * min_size)) return;
@@ -281,8 +287,13 @@ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, dou
     /* sort the remaining non-NA values in ascending order */
     std::sort(ix_arr + st_non_na, ix_arr + end + 1, [&x](const size_t a, const size_t b){return x[a] < x[b];});
+    if (x[ix_arr[st_non_na]] == x[ix_arr[end]]) {
+        *has_zero_variance = true;
+        return;
+    }
     /* calculate SD*N backwards first, then forwards */
+    mean_prev = y[ix_arr[end]];
     for (size_t i = end; i >= st_non_na; i--) {
         xval = y[ix_arr[i]];
         running_mean += (xval - running_mean) / (long double)(end - i + 1);
@@ -297,7 +308,7 @@ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, dou
     /* look for the best split point, by moving one observation at a time to the left branch*/
     running_mean = 0;
     running_ssq  = 0;
-    mean_prev    = 0;
+    mean_prev    = y[ix_arr[st_non_na]];
     for (size_t i = st_non_na; i <= (end - min_size); i++) {
         xval = y[ix_arr[i]];
         running_mean += (xval - running_mean) / (long double)(i - st_non_na + 1);
@@ -366,12 +377,16 @@ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, dou
 *        Array that will indicate which categories go into the left branch in the chosen split.
 *        (value of 1 means it's on the left branch, 0 in the right branch, -1 not applicable)
 *    - split_point (out)
-*        Split level for ordinal X variables (left branch is <= this)
+*        Split level for ordinal X variables (left branch is <= this).
+*    - has_zero_variance (out)
+*        Whether the 'x' column has zero variance (contains only one unique value).
+*    - binary_split
+*        Whether the produced split is binary (single category at each branch).
 */
 void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
                            bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
                            long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
-                           bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point)
+                           bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset, int *restrict split_point, bool *restrict has_zero_variance, bool *restrict binary_split)
 {
     /* output parameters and variables to use */
@@ -380,9 +395,11 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
     NumericSplit split_info;
     size_t st_cat = 0;
     double sd_y_d = (double) sd_y;
+    *has_zero_variance = false;
+    *binary_split = false;
     /* reset the buffers */
-    memset(split_subset,      0, sizeof(char)   *  ncat_x);
+    memset(split_subset,      0, sizeof(signed char)   *  ncat_x);
     memset(buffer_cat_cnt,    0, sizeof(size_t) * (ncat_x + 1));
     memset(buffer_cat_sum,    0, sizeof(long double) * (ncat_x + 1));
     memset(buffer_cat_sum_sq, 0, sizeof(long double) * (ncat_x + 1));
@@ -415,6 +432,16 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
     }
+    int n_unique_cat = 0;
+    for (size_t cat = 0; cat < ncat_x; cat++) {
+        n_unique_cat += buffer_cat_sum_sq[cat] > 0;
+        if (n_unique_cat >= 2) break;
+    }
+    if (n_unique_cat <= 1) {
+        *has_zero_variance = true;
+        return;
+    }
     /* set NAs to their own branch */
     if (buffer_cat_cnt[ncat_x] > 0) {
         split_info.NA_branch = {buffer_cat_cnt[ncat_x], buffer_cat_sum[ncat_x], buffer_cat_sum_sq[ncat_x]};
@@ -430,6 +457,8 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
         split_info.right_branch = {buffer_cat_cnt[1], buffer_cat_sum[1], buffer_cat_sum_sq[1]};
         *gain = numeric_gain(split_info, 1.0) * sd_y;
         split_subset[0] = 1;
+        *binary_split = true;
     }
     /* subset and ordinal splits */
@@ -443,7 +472,7 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
         }
         /* if it's an ordinal variable, must respect the order */
-        for (size_t cat = 0; cat < ncat_x; cat++) buffer_cat_sorted[cat] = cat;
+        std::iota(buffer_cat_sorted, buffer_cat_sorted + ncat_x, (size_t)0);
         if (!x_is_ordinal) {
             /* otherwise, sort the categories according to their mean of y */
@@ -458,6 +487,10 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
                           return (buffer_cat_sum[a] / (long double) buffer_cat_cnt[a]) >
                                  (buffer_cat_sum[b] / (long double) buffer_cat_cnt[b]);
                       });
+            if (ncat_x - st_cat == 2) {
+                *binary_split = true;
+            }
         }
         /* try moving each category to the left branch in the given order */
@@ -530,11 +563,13 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
 *        Index at which the data is split between the two branches (includes last from left branch).
 *    - split_NA (out)
 *        Index at which the NA data is separated from the other branches
+*    - has_zero_variance (out)
+*        Whether the 'x' column has zero variance (contains only one unique value).
 */
 void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
                            size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
                            bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
-                           size_t *restrict split_left, size_t *restrict split_NA)
+                           size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance)
 {
     *gain = -HUGE_VAL;
     *split_point = -HUGE_VAL;
@@ -543,6 +578,7 @@ void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, doubl
     CategSplit split_info;
     split_info.ncat = ncat_y;
     split_info.tot = end - st + 1;
+    *has_zero_variance = false;
     /* check that there are enough observations for a split */
     if ((end - st + 1) < (2 * min_size)) return;
@@ -571,6 +607,10 @@ void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, doubl
     /* sort the remaining non-NA values in ascending order */
     std::sort(ix_arr + st_non_na, ix_arr + end + 1, [&x](const size_t a, const size_t b){return x[a] < x[b];});
+    if (x[ix_arr[st_non_na]] == x[ix_arr[end]]) {
+        *has_zero_variance = true;
+        return;
+    }
     /* put all observations on the right branch */
     for (size_t i = st_non_na; i <= end; i++) split_info.right_branch[ y[ix_arr[i]] ]++;
@@ -638,11 +678,16 @@ void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, doubl
 *        Gain calculated on the best split found. If no split is possible, will return -Inf.
 *    - split_point (out)
 *        Threshold for splitting on values of 'x'. If no split is posible, will return -1.
+*    - has_zero_variance (out)
+*        Whether the 'x' column has zero variance (contains only one unique value).
+*    - binary_split
+*        Whether the produced split is binary (single category at each branch).
 */
 void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
                        size_t ncat_y, size_t ncat_x, long double base_info,
                        size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
-                       bool has_na, size_t min_size, long double *gain, int *split_point)
+                       bool has_na, size_t min_size, long double *gain, int *split_point,
+                       bool *restrict has_zero_variance, bool *restrict binary_split)
 {
     *gain = -HUGE_VAL;
     *split_point = -1;
@@ -651,6 +696,8 @@ void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *rest
     CategSplit split_info;
     split_info.ncat = ncat_y;
     split_info.tot = end - st + 1;
+    *has_zero_variance = false;
+    *binary_split = false;
     /* check that there are enough observations for a split */
     if ((end - st + 1) < (2 * min_size)) return;
@@ -687,6 +734,19 @@ void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *rest
     split_info.size_right = end - st_non_na + 1;
     split_info.size_left  = 0;
+    int n_unique_cat = 0;
+    for (size_t cat = 0; cat < ncat_x; cat++) {
+        n_unique_cat += buffer_ord_cnt[cat] > 0;
+        if (n_unique_cat >= 3) break;
+    }
+    if (n_unique_cat <= 1) {
+        *has_zero_variance = true;
+        return;
+    }
+    if (n_unique_cat == 2) {
+        *binary_split = true;
+    }
     /* look for the best split point, by moving one observation at a time to the left branch*/
     for (size_t ord_cat = 0; ord_cat < (ncat_x - 1); ord_cat++) {
@@ -749,11 +809,16 @@ void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *rest
 *    - split_subset[ncat_x] (out)
 *        Array that will indicate which categories go into the left branch in the chosen split.
 *        (value of 1 means it's on the left branch, 0 in the right branch, -1 not applicable)
+*    - has_zero_variance (out)
+*        Whether the 'x' column has zero variance (contains only one unique value).
+*    - binary_split
+*        Whether the produced split is binary (single category at each branch).
 */
 void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
                        size_t ncat_x, long double base_info,
                        size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
-                       bool has_na, size_t min_size, long double *gain, char *restrict split_subset)
+                       bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
+                       bool *restrict has_zero_variance, bool *restrict binary_split)
 {
     *gain = -HUGE_VAL;
     size_t st_non_na;
@@ -763,6 +828,8 @@ void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *rest
     size_t st_cat;
     split_info.ncat = 2;
     split_info.tot = end - st + 1;
+    *has_zero_variance = false;
+    *binary_split = false;
     /* check that there are enough observations for a split */
     if ((end - st + 1) < (2 * min_size)) return;
@@ -798,8 +865,18 @@ void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *rest
     split_info.size_right = end - st_non_na + 1;
     split_info.size_left  = 0;
+    int n_unique_cat = 0;
+    for (size_t cat = 0; cat < ncat_x; cat++) {
+        n_unique_cat += buffer_cat_cnt[cat] > 0;
+        if (n_unique_cat >= 2) break;
+    }
+    if (n_unique_cat <= 1) {
+        *has_zero_variance = true;
+        return;
+    }
     /* sort the categories according to their mean of y */
-    for (size_t cat = 0; cat < ncat_x; cat++) buffer_cat_sorted[cat] = cat;
+    std::iota(buffer_cat_sorted, buffer_cat_sorted + ncat_x, (size_t)0);
     st_cat = move_zero_count_to_front(buffer_cat_sorted, buffer_cat_cnt, ncat_x);
     std::sort(buffer_cat_sorted + st_cat, buffer_cat_sorted + ncat_x,
               [&buffer_crosstab, &buffer_cat_cnt](const size_t a, const size_t b)
@@ -807,6 +884,9 @@ void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *rest
                   return ((long double) buffer_crosstab[2 * a] / (long double) buffer_cat_cnt[a]) >
                          ((long double) buffer_crosstab[2 * b] / (long double) buffer_cat_cnt[b]);
               });
+    if (ncat_x - st_cat == 2) {
+        *binary_split = true;
+    }
     /* look for the best split subset, by moving one category at a time to the left branch*/
     for (size_t cat = st_cat; cat < (ncat_x - 1); cat++) {
@@ -954,11 +1034,16 @@ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end
 *    - split_subset[ncat_x] (out)
 *        Array that will indicate which categories go into the left branch in the chosen split.
 *        (value of 1 means it's on the left branch, 0 in the right branch, -1 not applicable)
+*    - has_zero_variance (out)
+*        Whether the 'x' column has zero variance (contains only one unique value).
+*    - binary_split
+*        Whether the produced split is binary (single category at each branch).
 */
 void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
                                 size_t ncat_x, size_t ncat_y, long double base_info,
                                 size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
-                                bool has_na, size_t min_size, long double *gain, char *restrict split_subset)
+                                bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
+                                bool *restrict has_zero_variance, bool *restrict binary_split)
 {
     *gain = -HUGE_VAL;
     long double this_gain;
@@ -967,6 +1052,8 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
     split_info.tot = end - st + 1;
     split_info.ncat = ncat_y;
     size_t st_non_na;
+    *has_zero_variance = false;
+    *binary_split = false;
     /* will divide into 3 branches: NA, within subset, outside subset */
     memset(buffer_split, 0, 3 * ncat_y * sizeof(size_t));
@@ -993,6 +1080,19 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
         }
     }
+    int n_unique_cat = 0;
+    for (size_t cat = 0; cat < ncat_x; cat++) {
+        n_unique_cat += buffer_cat_cnt[cat] > 0;
+        if (n_unique_cat >= 3) break;
+    }
+    if (n_unique_cat <= 1) {
+        *has_zero_variance = true;
+        return;
+    }
+    if (n_unique_cat == 2) {
+        *binary_split = true;
+    }
     /* put all categories on the right branch */
     memset(split_info.left_branch,   0, ncat_y * sizeof(size_t));
     memset(split_info.right_branch,  0, ncat_y * sizeof(size_t));
@@ -1012,6 +1112,10 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
     size_t last_bit;
     size_t ncomb = pow2(ncat_x) - 1;
+    /* TODO: this is highly inefficient:
+       - categories with zero count can be discarded beforehand.
+       - could use C++ next_permutation instead. */
     /* iteration is done by putting a category in the left branch if the bit at its
        position in the binary representation of the combination number is a 1 */
     /* TODO: this would be faster with a depth-first search routine */

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: outliertree
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.3.0
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-05-23 00:00:00.000000000 Z
+date: 2022-06-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice
@@ -44,7 +44,8 @@ files:
 - lib/outliertree/version.rb
 - vendor/outliertree/LICENSE
 - vendor/outliertree/README.md
-- vendor/outliertree/src/Makevars
+- vendor/outliertree/src/Makevars.in
+- vendor/outliertree/src/Makevars.win
 - vendor/outliertree/src/RcppExports.cpp
 - vendor/outliertree/src/Rwrapper.cpp
 - vendor/outliertree/src/cat_outlier.cpp
@@ -52,9 +53,10 @@ files:
 - vendor/outliertree/src/fit_model.cpp
 - vendor/outliertree/src/misc.cpp
 - vendor/outliertree/src/outlier_tree.hpp
+- vendor/outliertree/src/outliertree-win.def
 - vendor/outliertree/src/predict.cpp
 - vendor/outliertree/src/split.cpp
-homepage: https://github.com/ankane/outliertree
+homepage: https://github.com/ankane/outliertree-ruby
 licenses:
 - GPL-3.0-or-later
 metadata: {}
@@ -66,14 +68,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '2.6'
+      version: '2.7'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.2.3
+rubygems_version: 3.3.7
 signing_key:
 specification_version: 4
 summary: Explainable outlier/anomaly detection for Ruby

data/vendor/outliertree/src/Makevars DELETED Viewed

@@ -1,3 +0,0 @@
-PKG_CXXFLAGS = $(SHLIB_OPENMP_CXXFLAGS)
-PKG_LIBS = $(SHLIB_OPENMP_CXXFLAGS)
-CXX_STD = CXX11