outliertree 0.3.1 → 0.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8ed2332f581bd9cf68d32fe19bdb89c58f268c6ea6feb30e34e8422595920dc3
4
- data.tar.gz: d7cc844317fc023bee7d461838ae3fad3567268845013d2fc4e761f325934534
3
+ metadata.gz: b2c75c112439d30795757595ab3d34de3fbe80049f1b03e7168f0d1eed405417
4
+ data.tar.gz: e08ddb70bdf7b1be287b2986fc9364aa554db465fc2ef11b193b8f2adbd7af19
5
5
  SHA512:
6
- metadata.gz: 4afaa3d661d2d225dc55d708ba8d263b0a017fe2a5388d77cb881d5765f9e9cc9ef682b52ef541b706cf00fc0cc7d3834f537a7900c30d3783df1821ef7d432e
7
- data.tar.gz: 6c0feea6f531277847d84c76cbf146ca3d3821f9dd9635c8f7bf1fe592b1905f02db2d288027600310609d817d891129b510f5dad377946e735364705d0949cd
6
+ metadata.gz: 8bf35fb1fad7023c10b5b7514281d6db9c57faacfa8fcbaaa5e39a2aefb43508f5ed1c9c4cac7dd7e96052fac5e0c6036de201223ec3df7241051426e67a349c
7
+ data.tar.gz: 36d0ffa632d62be0f2ebe73bb7cace6b4f31eddf6202f26afe8250f27a8bbf25606be9ca8e9b5c89a2596b3acdc0c7124cf039d7b5e9557902a08ae233416293
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ## 0.4.1 (2025-04-23)
2
+
3
+ - Updated OutlierTree to 1.10.0
4
+
5
+ ## 0.4.0 (2024-06-11)
6
+
7
+ - Updated OutlierTree to 1.9.0
8
+ - Dropped support for Ruby < 3.1
9
+
1
10
  ## 0.3.1 (2023-12-19)
2
11
 
3
12
  - Updated OutlierTree to 1.8.2
data/README.md CHANGED
@@ -10,7 +10,7 @@ Price (2.50) looks low given Department is Books and Sale is false
10
10
 
11
11
  :evergreen_tree: Check out [IsoTree](https://github.com/ankane/isotree-ruby) for an alternative approach that uses Isolation Forest
12
12
 
13
- [![Build Status](https://github.com/ankane/outliertree-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/outliertree-ruby/actions)
13
+ [![Build Status](https://github.com/ankane/outliertree-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/outliertree-ruby/actions)
14
14
 
15
15
  ## Installation
16
16
 
@@ -80,7 +80,7 @@ module OutlierTree
80
80
  column: column,
81
81
  value: value,
82
82
  conditions: conditions,
83
- group_statistics: group_statistics,
83
+ group_statistics: group_statistics
84
84
  # leave out for simplicity
85
85
  # score: score,
86
86
  # tree_depth: model_outputs.outlier_depth_final[row],
@@ -128,7 +128,7 @@ module OutlierTree
128
128
  column: cond_col,
129
129
  comparison: colcond,
130
130
  to: condval,
131
- value: colval,
131
+ value: colval
132
132
  # leave out for simplicity
133
133
  # decimals: coldecim
134
134
  }
@@ -1,3 +1,3 @@
1
1
  module OutlierTree
2
- VERSION = "0.3.1"
2
+ VERSION = "0.4.1"
3
3
  end
data/lib/outliertree.rb CHANGED
@@ -5,10 +5,10 @@ require "outliertree/ext"
5
5
  require "etc"
6
6
 
7
7
  # modules
8
- require "outliertree/dataset"
9
- require "outliertree/model"
10
- require "outliertree/result"
11
- require "outliertree/version"
8
+ require_relative "outliertree/dataset"
9
+ require_relative "outliertree/model"
10
+ require_relative "outliertree/result"
11
+ require_relative "outliertree/version"
12
12
 
13
13
  module OutlierTree
14
14
  def self.new(**options)
@@ -11,7 +11,7 @@
11
11
  * arXiv preprint arXiv:2001.00636 (2020).
12
12
  *
13
13
  *
14
- * Copyright 2020 David Cortes.
14
+ * Copyright 2020-2024 David Cortes.
15
15
  *
16
16
  * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
17
  * such as R or Python.
@@ -97,6 +97,10 @@
97
97
  * Model parameter. Default is 2.67.
98
98
  * - z_outlier (in)
99
99
  * Model parameter. Default is 8.0. Must be greater than z_norm.
100
+ * - check_nonneg_outliers (in)
101
+ * Whether to add an extra check for possible outliers defined as having negative values while all
102
+ * the rest have positive values, regardless of how many standard deviations away they are.
103
+ * This is currently only done on the first cluster (no conditions on any variable).
100
104
  *
101
105
  * Returns:
102
106
  * - Whether there were any outliers detected.
@@ -107,7 +111,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
107
111
  size_t cluster_num, size_t tree_num, size_t tree_depth,
108
112
  bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
109
113
  double left_tail, double right_tail, double *restrict orig_x,
110
- double max_perc_outliers, double z_norm, double z_outlier)
114
+ double max_perc_outliers, double z_norm, double z_outlier,
115
+ bool check_nonneg_outliers)
111
116
  {
112
117
 
113
118
  /* TODO: this function could try to determine if the distribution is multimodal, and if so,
@@ -120,6 +125,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
120
125
  /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
121
126
  bool has_low_values = false;
122
127
  bool has_high_values = false;
128
+ bool has_outlier_neg_values = false;
123
129
  long double running_mean = 0;
124
130
  long double running_ssq = 0;
125
131
  long double mean_prev = 0;
@@ -127,14 +133,15 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
127
133
  double mean;
128
134
  double sd;
129
135
  size_t cnt;
130
- size_t tail_size = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
136
+ size_t tot = end - st + 1;
137
+ size_t tail_size = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
131
138
  size_t st_non_tail = st + tail_size;
132
139
  size_t end_non_tail = end - tail_size;
133
140
  size_t st_normals = 0;
134
141
  size_t end_normals = 0;
135
142
  double min_gap = z_outlier - z_norm;
136
143
 
137
- double curr_gap, next_gap, eps, lim_by_orig;
144
+ double curr_gap, next_gap, lim_by_orig;
138
145
 
139
146
  /* Note: there is no good reason and no theory behind these numbers.
140
147
  TODO: find a better way of setting this */
@@ -166,9 +173,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
166
173
  if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
167
174
  sd *= 0.5;
168
175
  }
169
- sd = std::fmax(sd, 1e-15);
170
- while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
171
- sd *= 4;
176
+ sd = std::fmax(sd, std::numeric_limits<double>::epsilon() / std::fmin(min_gap, z_norm));
172
177
  cluster.cluster_mean = mean;
173
178
  cluster.cluster_sd = sd;
174
179
  cnt = end - st + 1;
@@ -212,10 +217,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
212
217
  cluster.display_lim_low = orig_x[ix_arr[row + 1]];
213
218
  cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
214
219
 
215
- eps = 1e-15;
216
- while (cluster.display_lim_low <= cluster.lower_lim) {
217
- cluster.lower_lim -= eps;
218
- eps *= 4;
220
+ if (cluster.display_lim_low <= cluster.lower_lim) {
221
+ cluster.lower_lim = std::nextafter(cluster.display_lim_low, -std::numeric_limits<double>::infinity());
219
222
  }
220
223
  break;
221
224
  }
@@ -225,6 +228,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
225
228
  if (st_normals == 0) {
226
229
  has_low_values = false;
227
230
  } else {
231
+ assign_low_outliers:
228
232
  for (size_t row = st; row < st_normals; row++) {
229
233
 
230
234
  /* assign outlier if it's a better cluster than previously assigned */
@@ -254,7 +258,23 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
254
258
  }
255
259
  }
256
260
  }
257
- if (!has_low_values) {
261
+ /* special type of outliers not based on standard deviations */
262
+ if (check_nonneg_outliers && st_normals == 0 && tot >= 500 && orig_x[ix_arr[st]] < 0. && orig_x[ix_arr[end]] >= 2.) {
263
+ size_t max_neg_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);
264
+ if (orig_x[ix_arr[st + max_neg_outliers]] > 0.) {
265
+ size_t num_neg = 0;
266
+ for (size_t row = st; row < st + max_neg_outliers; row++) {
267
+ num_neg += orig_x[ix_arr[row]] < 0.;
268
+ }
269
+ st_normals = st + num_neg;
270
+ cluster.lower_lim = 0.;
271
+ cluster.display_lim_low = orig_x[ix_arr[st + st_normals]];
272
+ cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
273
+ has_outlier_neg_values = true;
274
+ goto assign_low_outliers;
275
+ }
276
+ }
277
+ if (!has_low_values && !has_outlier_neg_values) {
258
278
  cluster.perc_above = 1.0;
259
279
  if (!is_log_transf && !is_exp_transf) {
260
280
 
@@ -271,10 +291,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
271
291
  }
272
292
 
273
293
  if (cluster.lower_lim > -HUGE_VAL) {
274
- eps = 1e-15;
275
- while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
276
- cluster.lower_lim -= eps;
277
- eps *= 4.;
294
+ if (cluster.lower_lim >= orig_x[ix_arr[st]]) {
295
+ cluster.lower_lim = std::nextafter(orig_x[ix_arr[st]], -std::numeric_limits<double>::infinity());
278
296
  }
279
297
  }
280
298
 
@@ -324,10 +342,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
324
342
  cluster.display_lim_high = orig_x[ix_arr[row - 1]];
325
343
  cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
326
344
 
327
- eps = 1e-15;
328
- while (cluster.display_lim_high >= cluster.upper_lim) {
329
- cluster.upper_lim += eps;
330
- eps *= 4;
345
+ if (cluster.display_lim_high >= cluster.upper_lim) {
346
+ cluster.upper_lim = std::nextafter(cluster.display_lim_high, -std::numeric_limits<double>::infinity());
331
347
  }
332
348
  break;
333
349
  }
@@ -384,10 +400,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
384
400
  }
385
401
 
386
402
  if (cluster.upper_lim < HUGE_VAL) {
387
- eps = 1e-15;
388
- while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
389
- cluster.upper_lim += eps;
390
- eps *= 4.;
403
+ if (cluster.upper_lim <= orig_x[ix_arr[end]]) {
404
+ cluster.upper_lim = std::nextafter(orig_x[ix_arr[end]], std::numeric_limits<double>::infinity());
391
405
  }
392
406
  }
393
407
 
@@ -406,8 +420,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
406
420
  }
407
421
 
408
422
  /* save displayed statistics for cluster */
409
- if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
410
- size_t st_disp = has_low_values? st_normals : st;
423
+ if (has_high_values || has_low_values || is_log_transf || is_exp_transf || has_outlier_neg_values) {
424
+ size_t st_disp = (has_low_values || has_outlier_neg_values)? st_normals : st;
411
425
  size_t end_disp = has_high_values? end_normals : end;
412
426
  running_mean = 0;
413
427
  running_ssq = 0;
@@ -428,7 +442,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
428
442
  }
429
443
 
430
444
  /* report whether outliers were found or not */
431
- return has_low_values || has_high_values;
445
+ return has_low_values || has_high_values || has_outlier_neg_values;
432
446
  }
433
447
 
434
448
 
@@ -11,7 +11,7 @@
11
11
  * arXiv preprint arXiv:2001.00636 (2020).
12
12
  *
13
13
  *
14
- * Copyright 2020 David Cortes.
14
+ * Copyright 2020-2024 David Cortes.
15
15
  *
16
16
  * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
17
  * such as R or Python.
@@ -552,7 +552,8 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
552
552
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
553
553
  workspace.orig_mean, workspace.orig_sd,
554
554
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
555
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
555
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
556
+ true);
556
557
  workspace.tree->back().clusters.push_back(0);
557
558
 
558
559
  /* remove outliers if any were found */
@@ -636,7 +637,8 @@ void recursive_split_numeric(Workspace &workspace,
636
637
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
637
638
  workspace.orig_mean, workspace.orig_sd,
638
639
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
639
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
640
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
641
+ false);
640
642
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
641
643
 
642
644
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -663,7 +665,8 @@ void recursive_split_numeric(Workspace &workspace,
663
665
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
664
666
  workspace.orig_mean, workspace.orig_sd,
665
667
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
666
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
668
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
669
+ false);
667
670
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
668
671
 
669
672
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -687,7 +690,8 @@ void recursive_split_numeric(Workspace &workspace,
687
690
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
688
691
  workspace.orig_mean, workspace.orig_sd,
689
692
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
690
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
693
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
694
+ false);
691
695
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
692
696
 
693
697
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -750,7 +754,8 @@ void recursive_split_numeric(Workspace &workspace,
750
754
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
751
755
  workspace.orig_mean, workspace.orig_sd,
752
756
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
753
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
757
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
758
+ false);
754
759
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
755
760
 
756
761
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -777,7 +782,8 @@ void recursive_split_numeric(Workspace &workspace,
777
782
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
778
783
  workspace.orig_mean, workspace.orig_sd,
779
784
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
780
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
785
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
786
+ false);
781
787
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
782
788
 
783
789
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -805,7 +811,8 @@ void recursive_split_numeric(Workspace &workspace,
805
811
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
806
812
  workspace.orig_mean, workspace.orig_sd,
807
813
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
808
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
814
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
815
+ false);
809
816
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
810
817
 
811
818
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -871,7 +878,8 @@ void recursive_split_numeric(Workspace &workspace,
871
878
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
872
879
  workspace.orig_mean, workspace.orig_sd,
873
880
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
874
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
881
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
882
+ false);
875
883
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
876
884
 
877
885
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -898,7 +906,8 @@ void recursive_split_numeric(Workspace &workspace,
898
906
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
899
907
  workspace.orig_mean, workspace.orig_sd,
900
908
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
901
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
909
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
910
+ false);
902
911
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
903
912
 
904
913
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -928,7 +937,8 @@ void recursive_split_numeric(Workspace &workspace,
928
937
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
929
938
  workspace.orig_mean, workspace.orig_sd,
930
939
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
931
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
940
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
941
+ false);
932
942
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
933
943
 
934
944
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -733,7 +733,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
733
733
  size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
734
734
  bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
735
735
  double left_tail, double right_tail, double *restrict orig_x,
736
- double max_perc_outliers, double z_norm, double z_outlier);
736
+ double max_perc_outliers, double z_norm, double z_outlier,
737
+ bool check_nonneg_outliers);
737
738
  void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
738
739
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
739
740
  size_t *restrict outlier_depth, Cluster &cluster,
metadata CHANGED
@@ -1,14 +1,13 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: outliertree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
9
8
  bindir: bin
10
9
  cert_chain: []
11
- date: 2023-12-20 00:00:00.000000000 Z
10
+ date: 1980-01-02 00:00:00.000000000 Z
12
11
  dependencies:
13
12
  - !ruby/object:Gem::Dependency
14
13
  name: rice
@@ -16,15 +15,14 @@ dependencies:
16
15
  requirements:
17
16
  - - ">="
18
17
  - !ruby/object:Gem::Version
19
- version: 4.0.2
18
+ version: 4.3.3
20
19
  type: :runtime
21
20
  prerelease: false
22
21
  version_requirements: !ruby/object:Gem::Requirement
23
22
  requirements:
24
23
  - - ">="
25
24
  - !ruby/object:Gem::Version
26
- version: 4.0.2
27
- description:
25
+ version: 4.3.3
28
26
  email: andrew@ankane.org
29
27
  executables: []
30
28
  extensions:
@@ -60,7 +58,6 @@ homepage: https://github.com/ankane/outliertree-ruby
60
58
  licenses:
61
59
  - GPL-3.0-or-later
62
60
  metadata: {}
63
- post_install_message:
64
61
  rdoc_options: []
65
62
  require_paths:
66
63
  - lib
@@ -68,15 +65,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
68
65
  requirements:
69
66
  - - ">="
70
67
  - !ruby/object:Gem::Version
71
- version: '2.7'
68
+ version: '3.1'
72
69
  required_rubygems_version: !ruby/object:Gem::Requirement
73
70
  requirements:
74
71
  - - ">="
75
72
  - !ruby/object:Gem::Version
76
73
  version: '0'
77
74
  requirements: []
78
- rubygems_version: 3.4.10
79
- signing_key:
75
+ rubygems_version: 3.6.7
80
76
  specification_version: 4
81
77
  summary: Explainable outlier/anomaly detection for Ruby
82
78
  test_files: []