RubyGems - outliertree - Versions diffs - 0.3.1 → 0.4.0 - Mend

outliertree 0.3.1 → 0.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +1 -1
data/lib/outliertree/version.rb +1 -1
data/lib/outliertree.rb +4 -4
data/vendor/outliertree/src/clusters.cpp +37 -22
data/vendor/outliertree/src/fit_model.cpp +21 -11
data/vendor/outliertree/src/outlier_tree.hpp +2 -1
metadata +6 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8ed2332f581bd9cf68d32fe19bdb89c58f268c6ea6feb30e34e8422595920dc3
-  data.tar.gz: d7cc844317fc023bee7d461838ae3fad3567268845013d2fc4e761f325934534
+  metadata.gz: 107a39daf1b8743880c65c0c9bd20f6b2430687a843aa3394e4f57ba38b58766
+  data.tar.gz: 81e5e13612dd119624a6ec12652b048002c0c2103ee6389709682fb6bcb27e5e
 SHA512:
-  metadata.gz: 4afaa3d661d2d225dc55d708ba8d263b0a017fe2a5388d77cb881d5765f9e9cc9ef682b52ef541b706cf00fc0cc7d3834f537a7900c30d3783df1821ef7d432e
-  data.tar.gz: 6c0feea6f531277847d84c76cbf146ca3d3821f9dd9635c8f7bf1fe592b1905f02db2d288027600310609d817d891129b510f5dad377946e735364705d0949cd
+  metadata.gz: 2a8c6276389a465d548b7b06e7933e64094059960301b4393015bd906dd8deed361887876c152017bc2427fe54b81271e076de24f3e1df801f8f0c330a6c0f76
+  data.tar.gz: 27b9eb4c42adc7abf6c905ec3c787f6947aae6475ecb37283c9b00e560ebb49a8a6bd7ebacfce2c636ba289f014b6dd87821d65311cd3a8640700a4dae44464d

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,8 @@
+## 0.4.0 (2024-06-11)
+- Updated OutlierTree to 1.9.0
+- Dropped support for Ruby < 3.1
 ## 0.3.1 (2023-12-19)
 - Updated OutlierTree to 1.8.2

data/README.md CHANGED Viewed

@@ -10,7 +10,7 @@ Price (2.50) looks low given Department is Books and Sale is false
 :evergreen_tree: Check out [IsoTree](https://github.com/ankane/isotree-ruby) for an alternative approach that uses Isolation Forest
-[![Build Status](https://github.com/ankane/outliertree-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/outliertree-ruby/actions)
+[![Build Status](https://github.com/ankane/outliertree-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/outliertree-ruby/actions)
 ## Installation

data/lib/outliertree/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OutlierTree
-  VERSION = "0.3.1"
+  VERSION = "0.4.0"
 end

data/lib/outliertree.rb CHANGED Viewed

@@ -5,10 +5,10 @@ require "outliertree/ext"
 require "etc"
 # modules
-require "outliertree/dataset"
-require "outliertree/model"
-require "outliertree/result"
-require "outliertree/version"
+require_relative "outliertree/dataset"
+require_relative "outliertree/model"
+require_relative "outliertree/result"
+require_relative "outliertree/version"
 module OutlierTree
   def self.new(**options)

data/vendor/outliertree/src/clusters.cpp CHANGED Viewed

@@ -11,7 +11,7 @@
 *      arXiv preprint arXiv:2001.00636 (2020).
 *
 *
-*    Copyright 2020 David Cortes.
+*    Copyright 2020-2024 David Cortes.
 *
 *    Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
 *    such as R or Python.
@@ -97,6 +97,10 @@
 *        Model parameter. Default is 2.67.
 *    - z_outlier (in)
 *        Model parameter. Default is 8.0. Must be greater than z_norm.
+*    - check_nonneg_outliers (in)
+*        Whether to add an extra check for possible outliers defined as having negative values while all
+*        the rest have positive values, regardless of how many standard deviations away they are.
+*        This is currently only done on the first cluster (no conditions on any variable).
 *
 *    Returns:
 *        - Whether there were any outliers detected.
@@ -107,7 +111,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
                               size_t cluster_num, size_t tree_num, size_t tree_depth,
                               bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
                               double left_tail, double right_tail, double *restrict orig_x,
-                              double max_perc_outliers, double z_norm, double z_outlier)
+                              double max_perc_outliers, double z_norm, double z_outlier,
+                              bool check_nonneg_outliers)
 {
     /*  TODO: this function could try to determine if the distribution is multimodal, and if so,
@@ -120,6 +125,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
     bool has_low_values  = false;
     bool has_high_values = false;
+    bool has_outlier_neg_values = false;
     long double running_mean = 0;
     long double running_ssq  = 0;
     long double mean_prev    = 0;
@@ -127,14 +133,15 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     double mean;
     double sd;
     size_t cnt;
-    size_t tail_size     = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
+    size_t tot           = end - st + 1;
+    size_t tail_size     = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
     size_t st_non_tail   = st  + tail_size;
     size_t end_non_tail  = end - tail_size;
     size_t st_normals    = 0;
     size_t end_normals   = 0;
     double min_gap = z_outlier - z_norm;
-    double curr_gap, next_gap, eps, lim_by_orig;
+    double curr_gap, next_gap, lim_by_orig;
     /* Note: there is no good reason and no theory behind these numbers.
        TODO: find a better way of setting this */
@@ -166,9 +173,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
         sd *= 0.5;
     }
-    sd = std::fmax(sd, 1e-15);
     while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
-        sd *= 4;
+        sd = std::nextafter(sd, std::numeric_limits<double>::infinity());
     cluster.cluster_mean = mean;
     cluster.cluster_sd = sd;
     cnt = end - st + 1;
@@ -212,10 +218,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
                 cluster.display_lim_low = orig_x[ix_arr[row + 1]];
                 cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
-                eps = 1e-15;
                 while (cluster.display_lim_low <= cluster.lower_lim) {
-                    cluster.lower_lim -= eps;
-                    eps *= 4;
+                    cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
                 }
                 break;
             }
@@ -225,6 +229,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
         if (st_normals == 0) {
             has_low_values = false;
         } else {
+            assign_low_outliers:
             for (size_t row = st; row < st_normals; row++) {
                 /* assign outlier if it's a better cluster than previously assigned */
@@ -254,7 +259,23 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
             }
         }
     }
-    if (!has_low_values) {
+    /* special type of outliers not based on standard deviations */
+    if (check_nonneg_outliers && st_normals == 0 && tot >= 500 && orig_x[ix_arr[st]] < 0. && orig_x[ix_arr[end]] >= 2.) {
+        size_t max_neg_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);
+        if (orig_x[ix_arr[st + max_neg_outliers]] > 0.) {
+            size_t num_neg = 0;
+            for (size_t row = st; row < st + max_neg_outliers; row++) {
+                num_neg += orig_x[ix_arr[row]] < 0.;
+            }
+            st_normals = st + num_neg;
+            cluster.lower_lim = 0.;
+            cluster.display_lim_low = orig_x[ix_arr[st + st_normals]];
+            cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
+            has_outlier_neg_values = true;
+            goto assign_low_outliers;
+        }
+    }
+    if (!has_low_values && !has_outlier_neg_values) {
         cluster.perc_above = 1.0;
         if (!is_log_transf && !is_exp_transf) {
@@ -271,10 +292,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
         }
         if (cluster.lower_lim > -HUGE_VAL) {
-            eps = 1e-15;
             while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
-                cluster.lower_lim -= eps;
-                eps *= 4.;
+                cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
             }
         }
@@ -324,10 +343,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
                 cluster.display_lim_high = orig_x[ix_arr[row - 1]];
                 cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
-                eps = 1e-15;
                 while (cluster.display_lim_high >= cluster.upper_lim) {
-                    cluster.upper_lim += eps;
-                    eps *= 4;
+                    cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
                 }
                 break;
             }
@@ -384,10 +401,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
         }
         if (cluster.upper_lim < HUGE_VAL) {
-            eps = 1e-15;
             while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
-                cluster.upper_lim += eps;
-                eps *= 4.;
+                cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
             }
         }
@@ -406,8 +421,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     }
     /* save displayed statistics for cluster */
-    if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
-        size_t st_disp  = has_low_values?  st_normals  : st;
+    if (has_high_values || has_low_values || is_log_transf || is_exp_transf || has_outlier_neg_values) {
+        size_t st_disp  = (has_low_values || has_outlier_neg_values)?  st_normals  : st;
         size_t end_disp = has_high_values? end_normals : end;
         running_mean = 0;
         running_ssq  = 0;
@@ -428,7 +443,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     }
     /* report whether outliers were found or not */
-    return has_low_values || has_high_values;
+    return has_low_values || has_high_values || has_outlier_neg_values;
 }

data/vendor/outliertree/src/fit_model.cpp CHANGED Viewed

@@ -11,7 +11,7 @@
 *      arXiv preprint arXiv:2001.00636 (2020).
 *
 *
-*    Copyright 2020 David Cortes.
+*    Copyright 2020-2024 David Cortes.
 *
 *    Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
 *    such as R or Python.
@@ -552,7 +552,8 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
                                                           workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                           workspace.orig_mean, workspace.orig_sd,
                                                           workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                          true);
     workspace.tree->back().clusters.push_back(0);
     /* remove outliers if any were found */
@@ -636,7 +637,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                                   workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                   workspace.orig_mean, workspace.orig_sd,
                                                                   workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                                  false);
                 workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
                 if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -663,7 +665,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -687,7 +690,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -750,7 +754,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                                   workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                   workspace.orig_mean, workspace.orig_sd,
                                                                   workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                                  false);
                 workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
                 if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -777,7 +782,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -805,7 +811,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -871,7 +878,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                                   workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                   workspace.orig_mean, workspace.orig_sd,
                                                                   workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                                  false);
                 workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
                 if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -898,7 +906,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -928,7 +937,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {

data/vendor/outliertree/src/outlier_tree.hpp CHANGED Viewed

@@ -733,7 +733,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
                               size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
                               bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
                               double left_tail, double right_tail, double *restrict orig_x,
-                              double max_perc_outliers, double z_norm, double z_outlier);
+                              double max_perc_outliers, double z_norm, double z_outlier,
+                              bool check_nonneg_outliers);
 void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
                                   double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                                   size_t *restrict outlier_depth, Cluster &cluster,

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: outliertree
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.4.0
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-12-20 00:00:00.000000000 Z
+date: 2024-06-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 4.0.2
+        version: '4.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 4.0.2
+        version: '4.3'
 description:
 email: andrew@ankane.org
 executables: []
@@ -68,14 +68,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '2.7'
+      version: '3.1'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.10
+rubygems_version: 3.5.9
 signing_key:
 specification_version: 4
 summary: Explainable outlier/anomaly detection for Ruby