RubyGems - outliertree - Versions diffs - 0.3.1 → 0.4.0 - Mend

outliertree 0.3.1 → 0.4.0

Files changed (9) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +1 -1
data/lib/outliertree/version.rb +1 -1
data/lib/outliertree.rb +4 -4
data/vendor/outliertree/src/clusters.cpp +37 -22
data/vendor/outliertree/src/fit_model.cpp +21 -11
data/vendor/outliertree/src/outlier_tree.hpp +2 -1
metadata +6 -6

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 8ed2332f581bd9cf68d32fe19bdb89c58f268c6ea6feb30e34e8422595920dc3
-  data.tar.gz: d7cc844317fc023bee7d461838ae3fad3567268845013d2fc4e761f325934534
+  metadata.gz: 107a39daf1b8743880c65c0c9bd20f6b2430687a843aa3394e4f57ba38b58766
+  data.tar.gz: 81e5e13612dd119624a6ec12652b048002c0c2103ee6389709682fb6bcb27e5e
 SHA512:
-  metadata.gz: 4afaa3d661d2d225dc55d708ba8d263b0a017fe2a5388d77cb881d5765f9e9cc9ef682b52ef541b706cf00fc0cc7d3834f537a7900c30d3783df1821ef7d432e
-  data.tar.gz: 6c0feea6f531277847d84c76cbf146ca3d3821f9dd9635c8f7bf1fe592b1905f02db2d288027600310609d817d891129b510f5dad377946e735364705d0949cd
+  metadata.gz: 2a8c6276389a465d548b7b06e7933e64094059960301b4393015bd906dd8deed361887876c152017bc2427fe54b81271e076de24f3e1df801f8f0c330a6c0f76
+  data.tar.gz: 27b9eb4c42adc7abf6c905ec3c787f6947aae6475ecb37283c9b00e560ebb49a8a6bd7ebacfce2c636ba289f014b6dd87821d65311cd3a8640700a4dae44464d

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,8 @@
+## 0.4.0 (2024-06-11)
+- Updated OutlierTree to 1.9.0
+- Dropped support for Ruby < 3.1
 ## 0.3.1 (2023-12-19)
 - Updated OutlierTree to 1.8.2

data/README.md CHANGED Viewed

@@ -10,7 +10,7 @@ Price (2.50) looks low given Department is Books and Sale is false
 :evergreen_tree: Check out [IsoTree](https://github.com/ankane/isotree-ruby) for an alternative approach that uses Isolation Forest
-[![Build Status](https://github.com/ankane/outliertree-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/outliertree-ruby/actions)
+[![Build Status](https://github.com/ankane/outliertree-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/outliertree-ruby/actions)
 ## Installation

data/lib/outliertree/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module OutlierTree
-  VERSION = "0.3.1"
+  VERSION = "0.4.0"
 end

data/lib/outliertree.rb CHANGED Viewed

@@ -5,10 +5,10 @@ require "outliertree/ext"
 require "etc"
 # modules
-require "outliertree/dataset"
-require "outliertree/model"
-require "outliertree/result"
-require "outliertree/version"
+require_relative "outliertree/dataset"
+require_relative "outliertree/model"
+require_relative "outliertree/result"
+require_relative "outliertree/version"
 module OutlierTree
   def self.new(**options)

data/vendor/outliertree/src/clusters.cpp CHANGED Viewed

@@ -11,7 +11,7 @@
 *      arXiv preprint arXiv:2001.00636 (2020).
 *
 *
-*    Copyright 2020 David Cortes.
+*    Copyright 2020-2024 David Cortes.
 *
 *    Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
 *    such as R or Python.
@@ -97,6 +97,10 @@
 *        Model parameter. Default is 2.67.
 *    - z_outlier (in)
 *        Model parameter. Default is 8.0. Must be greater than z_norm.
+*    - check_nonneg_outliers (in)
+*        Whether to add an extra check for possible outliers defined as having negative values while all
+*        the rest have positive values, regardless of how many standard deviations away they are.
+*        This is currently only done on the first cluster (no conditions on any variable).
 *
 *    Returns:
 *        - Whether there were any outliers detected.
@@ -107,7 +111,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
                               size_t cluster_num, size_t tree_num, size_t tree_depth,
                               bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
                               double left_tail, double right_tail, double *restrict orig_x,
-                              double max_perc_outliers, double z_norm, double z_outlier)
+                              double max_perc_outliers, double z_norm, double z_outlier,
+                              bool check_nonneg_outliers)
 {
     /*  TODO: this function could try to determine if the distribution is multimodal, and if so,
@@ -120,6 +125,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
     bool has_low_values  = false;
     bool has_high_values = false;
+    bool has_outlier_neg_values = false;
     long double running_mean = 0;
     long double running_ssq  = 0;
     long double mean_prev    = 0;
@@ -127,14 +133,15 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     double mean;
     double sd;
     size_t cnt;
-    size_t tail_size     = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
+    size_t tot           = end - st + 1;
+    size_t tail_size     = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
     size_t st_non_tail   = st  + tail_size;
     size_t end_non_tail  = end - tail_size;
     size_t st_normals    = 0;
     size_t end_normals   = 0;
     double min_gap = z_outlier - z_norm;
-    double curr_gap, next_gap, eps, lim_by_orig;
+    double curr_gap, next_gap, lim_by_orig;
     /* Note: there is no good reason and no theory behind these numbers.
        TODO: find a better way of setting this */
@@ -166,9 +173,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
         sd *= 0.5;
     }
-    sd = std::fmax(sd, 1e-15);
     while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
-        sd *= 4;
+        sd = std::nextafter(sd, std::numeric_limits<double>::infinity());
     cluster.cluster_mean = mean;
     cluster.cluster_sd = sd;
     cnt = end - st + 1;
@@ -212,10 +218,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
                 cluster.display_lim_low = orig_x[ix_arr[row + 1]];
                 cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
-                eps = 1e-15;
                 while (cluster.display_lim_low <= cluster.lower_lim) {
-                    cluster.lower_lim -= eps;
-                    eps *= 4;
+                    cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
                 }
                 break;
             }
@@ -225,6 +229,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
         if (st_normals == 0) {
             has_low_values = false;
         } else {
+            assign_low_outliers:
             for (size_t row = st; row < st_normals; row++) {
                 /* assign outlier if it's a better cluster than previously assigned */
@@ -254,7 +259,23 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
             }
         }
     }
-    if (!has_low_values) {
+    /* special type of outliers not based on standard deviations */
+    if (check_nonneg_outliers && st_normals == 0 && tot >= 500 && orig_x[ix_arr[st]] < 0. && orig_x[ix_arr[end]] >= 2.) {
+        size_t max_neg_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);
+        if (orig_x[ix_arr[st + max_neg_outliers]] > 0.) {
+            size_t num_neg = 0;
+            for (size_t row = st; row < st + max_neg_outliers; row++) {
+                num_neg += orig_x[ix_arr[row]] < 0.;
+            }
+            st_normals = st + num_neg;
+            cluster.lower_lim = 0.;
+            cluster.display_lim_low = orig_x[ix_arr[st + st_normals]];
+            cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
+            has_outlier_neg_values = true;
+            goto assign_low_outliers;
+        }
+    }
+    if (!has_low_values && !has_outlier_neg_values) {
         cluster.perc_above = 1.0;
         if (!is_log_transf && !is_exp_transf) {
@@ -271,10 +292,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
         }
         if (cluster.lower_lim > -HUGE_VAL) {
-            eps = 1e-15;
             while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
-                cluster.lower_lim -= eps;
-                eps *= 4.;
+                cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
             }
         }
@@ -324,10 +343,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
                 cluster.display_lim_high = orig_x[ix_arr[row - 1]];
                 cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
-                eps = 1e-15;
                 while (cluster.display_lim_high >= cluster.upper_lim) {
-                    cluster.upper_lim += eps;
-                    eps *= 4;
+                    cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
                 }
                 break;
             }
@@ -384,10 +401,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
         }
         if (cluster.upper_lim < HUGE_VAL) {
-            eps = 1e-15;
             while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
-                cluster.upper_lim += eps;
-                eps *= 4.;
+                cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
             }
         }
@@ -406,8 +421,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     }
     /* save displayed statistics for cluster */
-    if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
-        size_t st_disp  = has_low_values?  st_normals  : st;
+    if (has_high_values || has_low_values || is_log_transf || is_exp_transf || has_outlier_neg_values) {
+        size_t st_disp  = (has_low_values || has_outlier_neg_values)?  st_normals  : st;
         size_t end_disp = has_high_values? end_normals : end;
         running_mean = 0;
         running_ssq  = 0;
@@ -428,7 +443,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
     }
     /* report whether outliers were found or not */
-    return has_low_values || has_high_values;
+    return has_low_values || has_high_values || has_outlier_neg_values;
 }

data/vendor/outliertree/src/fit_model.cpp CHANGED Viewed

@@ -11,7 +11,7 @@
 *      arXiv preprint arXiv:2001.00636 (2020).
 *
 *
-*    Copyright 2020 David Cortes.
+*    Copyright 2020-2024 David Cortes.
 *
 *    Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
 *    such as R or Python.
@@ -552,7 +552,8 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
                                                           workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                           workspace.orig_mean, workspace.orig_sd,
                                                           workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                          model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                          true);
     workspace.tree->back().clusters.push_back(0);
     /* remove outliers if any were found */
@@ -636,7 +637,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                                   workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                   workspace.orig_mean, workspace.orig_sd,
                                                                   workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                                  false);
                 workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
                 if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -663,7 +665,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -687,7 +690,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -750,7 +754,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                                   workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                   workspace.orig_mean, workspace.orig_sd,
                                                                   workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                                  false);
                 workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
                 if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -777,7 +782,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -805,7 +811,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -871,7 +878,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                                   workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                                   workspace.orig_mean, workspace.orig_sd,
                                                                   workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                                  model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                                  false);
                 workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
                 if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -898,7 +906,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -928,7 +937,8 @@ void recursive_split_numeric(Workspace &workspace,
                                                               workspace.log_transf, workspace.log_minval, workspace.exp_transf,
                                                               workspace.orig_mean, workspace.orig_sd,
                                                               workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
-                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
+                                                              model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
+                                                              false);
             workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
             if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {

data/vendor/outliertree/src/outlier_tree.hpp CHANGED Viewed

@@ -733,7 +733,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
                               size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
                               bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
                               double left_tail, double right_tail, double *restrict orig_x,
-                              double max_perc_outliers, double z_norm, double z_outlier);
+                              double max_perc_outliers, double z_norm, double z_outlier,
+                              bool check_nonneg_outliers);
 void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
                                   double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
                                   size_t *restrict outlier_depth, Cluster &cluster,

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: outliertree
 version: !ruby/object:Gem::Version
-  version: 0.3.1
+  version: 0.4.0
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-12-20 00:00:00.000000000 Z
+date: 2024-06-12 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice
@@ -16,14 +16,14 @@ dependencies:
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 4.0.2
+        version: '4.3'
   type: :runtime
   prerelease: false
   version_requirements: !ruby/object:Gem::Requirement
     requirements:
     - - ">="
       - !ruby/object:Gem::Version
-        version: 4.0.2
+        version: '4.3'
 description:
 email: andrew@ankane.org
 executables: []
@@ -68,14 +68,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
-      version: '2.7'
+      version: '3.1'
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.4.10
+rubygems_version: 3.5.9
 signing_key:
 specification_version: 4
 summary: Explainable outlier/anomaly detection for Ruby