outliertree 0.3.1 → 0.4.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8ed2332f581bd9cf68d32fe19bdb89c58f268c6ea6feb30e34e8422595920dc3
4
- data.tar.gz: d7cc844317fc023bee7d461838ae3fad3567268845013d2fc4e761f325934534
3
+ metadata.gz: 107a39daf1b8743880c65c0c9bd20f6b2430687a843aa3394e4f57ba38b58766
4
+ data.tar.gz: 81e5e13612dd119624a6ec12652b048002c0c2103ee6389709682fb6bcb27e5e
5
5
  SHA512:
6
- metadata.gz: 4afaa3d661d2d225dc55d708ba8d263b0a017fe2a5388d77cb881d5765f9e9cc9ef682b52ef541b706cf00fc0cc7d3834f537a7900c30d3783df1821ef7d432e
7
- data.tar.gz: 6c0feea6f531277847d84c76cbf146ca3d3821f9dd9635c8f7bf1fe592b1905f02db2d288027600310609d817d891129b510f5dad377946e735364705d0949cd
6
+ metadata.gz: 2a8c6276389a465d548b7b06e7933e64094059960301b4393015bd906dd8deed361887876c152017bc2427fe54b81271e076de24f3e1df801f8f0c330a6c0f76
7
+ data.tar.gz: 27b9eb4c42adc7abf6c905ec3c787f6947aae6475ecb37283c9b00e560ebb49a8a6bd7ebacfce2c636ba289f014b6dd87821d65311cd3a8640700a4dae44464d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.4.0 (2024-06-11)
2
+
3
+ - Updated OutlierTree to 1.9.0
4
+ - Dropped support for Ruby < 3.1
5
+
1
6
  ## 0.3.1 (2023-12-19)
2
7
 
3
8
  - Updated OutlierTree to 1.8.2
data/README.md CHANGED
@@ -10,7 +10,7 @@ Price (2.50) looks low given Department is Books and Sale is false
10
10
 
11
11
  :evergreen_tree: Check out [IsoTree](https://github.com/ankane/isotree-ruby) for an alternative approach that uses Isolation Forest
12
12
 
13
- [![Build Status](https://github.com/ankane/outliertree-ruby/workflows/build/badge.svg?branch=master)](https://github.com/ankane/outliertree-ruby/actions)
13
+ [![Build Status](https://github.com/ankane/outliertree-ruby/actions/workflows/build.yml/badge.svg)](https://github.com/ankane/outliertree-ruby/actions)
14
14
 
15
15
  ## Installation
16
16
 
data/lib/outliertree/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module OutlierTree
2
- VERSION = "0.3.1"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/outliertree.rb CHANGED
@@ -5,10 +5,10 @@ require "outliertree/ext"
5
5
  require "etc"
6
6
 
7
7
  # modules
8
- require "outliertree/dataset"
9
- require "outliertree/model"
10
- require "outliertree/result"
11
- require "outliertree/version"
8
+ require_relative "outliertree/dataset"
9
+ require_relative "outliertree/model"
10
+ require_relative "outliertree/result"
11
+ require_relative "outliertree/version"
12
12
 
13
13
  module OutlierTree
14
14
  def self.new(**options)
@@ -11,7 +11,7 @@
11
11
  * arXiv preprint arXiv:2001.00636 (2020).
12
12
  *
13
13
  *
14
- * Copyright 2020 David Cortes.
14
+ * Copyright 2020-2024 David Cortes.
15
15
  *
16
16
  * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
17
  * such as R or Python.
@@ -97,6 +97,10 @@
97
97
  * Model parameter. Default is 2.67.
98
98
  * - z_outlier (in)
99
99
  * Model parameter. Default is 8.0. Must be greater than z_norm.
100
+ * - check_nonneg_outliers (in)
101
+ * Whether to add an extra check for possible outliers defined as having negative values while all
102
+ * the rest have positive values, regardless of how many standard deviations away they are.
103
+ * This is currently only done on the first cluster (no conditions on any variable).
100
104
  *
101
105
  * Returns:
102
106
  * - Whether there were any outliers detected.
@@ -107,7 +111,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
107
111
  size_t cluster_num, size_t tree_num, size_t tree_depth,
108
112
  bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
109
113
  double left_tail, double right_tail, double *restrict orig_x,
110
- double max_perc_outliers, double z_norm, double z_outlier)
114
+ double max_perc_outliers, double z_norm, double z_outlier,
115
+ bool check_nonneg_outliers)
111
116
  {
112
117
 
113
118
  /* TODO: this function could try to determine if the distribution is multimodal, and if so,
@@ -120,6 +125,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
120
125
  /* NAs and Inf should have already been removed, and outliers with fewer conditionals already discarded */
121
126
  bool has_low_values = false;
122
127
  bool has_high_values = false;
128
+ bool has_outlier_neg_values = false;
123
129
  long double running_mean = 0;
124
130
  long double running_ssq = 0;
125
131
  long double mean_prev = 0;
@@ -127,14 +133,15 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
127
133
  double mean;
128
134
  double sd;
129
135
  size_t cnt;
130
- size_t tail_size = (size_t) calculate_max_outliers((long double)(end - st + 1), max_perc_outliers);
136
+ size_t tot = end - st + 1;
137
+ size_t tail_size = (size_t) calculate_max_outliers((long double)tot, max_perc_outliers);
131
138
  size_t st_non_tail = st + tail_size;
132
139
  size_t end_non_tail = end - tail_size;
133
140
  size_t st_normals = 0;
134
141
  size_t end_normals = 0;
135
142
  double min_gap = z_outlier - z_norm;
136
143
 
137
- double curr_gap, next_gap, eps, lim_by_orig;
144
+ double curr_gap, next_gap, lim_by_orig;
138
145
 
139
146
  /* Note: there is no good reason and no theory behind these numbers.
140
147
  TODO: find a better way of setting this */
@@ -166,9 +173,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
166
173
  if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
167
174
  sd *= 0.5;
168
175
  }
169
- sd = std::fmax(sd, 1e-15);
170
176
  while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
171
- sd *= 4;
177
+ sd = std::nextafter(sd, std::numeric_limits<double>::infinity());
172
178
  cluster.cluster_mean = mean;
173
179
  cluster.cluster_sd = sd;
174
180
  cnt = end - st + 1;
@@ -212,10 +218,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
212
218
  cluster.display_lim_low = orig_x[ix_arr[row + 1]];
213
219
  cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
214
220
 
215
- eps = 1e-15;
216
221
  while (cluster.display_lim_low <= cluster.lower_lim) {
217
- cluster.lower_lim -= eps;
218
- eps *= 4;
222
+ cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
219
223
  }
220
224
  break;
221
225
  }
@@ -225,6 +229,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
225
229
  if (st_normals == 0) {
226
230
  has_low_values = false;
227
231
  } else {
232
+ assign_low_outliers:
228
233
  for (size_t row = st; row < st_normals; row++) {
229
234
 
230
235
  /* assign outlier if it's a better cluster than previously assigned */
@@ -254,7 +259,23 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
254
259
  }
255
260
  }
256
261
  }
257
- if (!has_low_values) {
262
+ /* special type of outliers not based on standard deviations */
263
+ if (check_nonneg_outliers && st_normals == 0 && tot >= 500 && orig_x[ix_arr[st]] < 0. && orig_x[ix_arr[end]] >= 2.) {
264
+ size_t max_neg_outliers = (tot < 10000)? 1 : ((tot < 100000)? 2 : 3);
265
+ if (orig_x[ix_arr[st + max_neg_outliers]] > 0.) {
266
+ size_t num_neg = 0;
267
+ for (size_t row = st; row < st + max_neg_outliers; row++) {
268
+ num_neg += orig_x[ix_arr[row]] < 0.;
269
+ }
270
+ st_normals = st + num_neg;
271
+ cluster.lower_lim = 0.;
272
+ cluster.display_lim_low = orig_x[ix_arr[st + st_normals]];
273
+ cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
274
+ has_outlier_neg_values = true;
275
+ goto assign_low_outliers;
276
+ }
277
+ }
278
+ if (!has_low_values && !has_outlier_neg_values) {
258
279
  cluster.perc_above = 1.0;
259
280
  if (!is_log_transf && !is_exp_transf) {
260
281
 
@@ -271,10 +292,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
271
292
  }
272
293
 
273
294
  if (cluster.lower_lim > -HUGE_VAL) {
274
- eps = 1e-15;
275
295
  while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
276
- cluster.lower_lim -= eps;
277
- eps *= 4.;
296
+ cluster.lower_lim = std::nextafter(cluster.lower_lim, -std::numeric_limits<double>::infinity());
278
297
  }
279
298
  }
280
299
 
@@ -324,10 +343,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
324
343
  cluster.display_lim_high = orig_x[ix_arr[row - 1]];
325
344
  cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
326
345
 
327
- eps = 1e-15;
328
346
  while (cluster.display_lim_high >= cluster.upper_lim) {
329
- cluster.upper_lim += eps;
330
- eps *= 4;
347
+ cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
331
348
  }
332
349
  break;
333
350
  }
@@ -384,10 +401,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
384
401
  }
385
402
 
386
403
  if (cluster.upper_lim < HUGE_VAL) {
387
- eps = 1e-15;
388
404
  while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
389
- cluster.upper_lim += eps;
390
- eps *= 4.;
405
+ cluster.upper_lim = std::nextafter(cluster.upper_lim, std::numeric_limits<double>::infinity());
391
406
  }
392
407
  }
393
408
 
@@ -406,8 +421,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
406
421
  }
407
422
 
408
423
  /* save displayed statistics for cluster */
409
- if (has_high_values || has_low_values || is_log_transf || is_exp_transf) {
410
- size_t st_disp = has_low_values? st_normals : st;
424
+ if (has_high_values || has_low_values || is_log_transf || is_exp_transf || has_outlier_neg_values) {
425
+ size_t st_disp = (has_low_values || has_outlier_neg_values)? st_normals : st;
411
426
  size_t end_disp = has_high_values? end_normals : end;
412
427
  running_mean = 0;
413
428
  running_ssq = 0;
@@ -428,7 +443,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
428
443
  }
429
444
 
430
445
  /* report whether outliers were found or not */
431
- return has_low_values || has_high_values;
446
+ return has_low_values || has_high_values || has_outlier_neg_values;
432
447
  }
433
448
 
434
449
 
@@ -11,7 +11,7 @@
11
11
  * arXiv preprint arXiv:2001.00636 (2020).
12
12
  *
13
13
  *
14
- * Copyright 2020 David Cortes.
14
+ * Copyright 2020-2024 David Cortes.
15
15
  *
16
16
  * Written for C++11 standard and OpenMP 2.0 or later. Code is meant to be wrapped into scripting languages
17
17
  * such as R or Python.
@@ -552,7 +552,8 @@ void process_numeric_col(std::vector<Cluster> &cluster_root,
552
552
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
553
553
  workspace.orig_mean, workspace.orig_sd,
554
554
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
555
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
555
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
556
+ true);
556
557
  workspace.tree->back().clusters.push_back(0);
557
558
 
558
559
  /* remove outliers if any were found */
@@ -636,7 +637,8 @@ void recursive_split_numeric(Workspace &workspace,
636
637
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
637
638
  workspace.orig_mean, workspace.orig_sd,
638
639
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
639
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
640
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
641
+ false);
640
642
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
641
643
 
642
644
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -663,7 +665,8 @@ void recursive_split_numeric(Workspace &workspace,
663
665
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
664
666
  workspace.orig_mean, workspace.orig_sd,
665
667
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
666
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
668
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
669
+ false);
667
670
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
668
671
 
669
672
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -687,7 +690,8 @@ void recursive_split_numeric(Workspace &workspace,
687
690
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
688
691
  workspace.orig_mean, workspace.orig_sd,
689
692
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
690
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
693
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
694
+ false);
691
695
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
692
696
 
693
697
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -750,7 +754,8 @@ void recursive_split_numeric(Workspace &workspace,
750
754
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
751
755
  workspace.orig_mean, workspace.orig_sd,
752
756
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
753
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
757
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
758
+ false);
754
759
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
755
760
 
756
761
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -777,7 +782,8 @@ void recursive_split_numeric(Workspace &workspace,
777
782
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
778
783
  workspace.orig_mean, workspace.orig_sd,
779
784
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
780
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
785
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
786
+ false);
781
787
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
782
788
 
783
789
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -805,7 +811,8 @@ void recursive_split_numeric(Workspace &workspace,
805
811
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
806
812
  workspace.orig_mean, workspace.orig_sd,
807
813
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
808
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
814
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
815
+ false);
809
816
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
810
817
 
811
818
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -871,7 +878,8 @@ void recursive_split_numeric(Workspace &workspace,
871
878
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
872
879
  workspace.orig_mean, workspace.orig_sd,
873
880
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
874
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
881
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
882
+ false);
875
883
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
876
884
 
877
885
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -898,7 +906,8 @@ void recursive_split_numeric(Workspace &workspace,
898
906
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
899
907
  workspace.orig_mean, workspace.orig_sd,
900
908
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
901
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
909
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
910
+ false);
902
911
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
903
912
 
904
913
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -928,7 +937,8 @@ void recursive_split_numeric(Workspace &workspace,
928
937
  workspace.log_transf, workspace.log_minval, workspace.exp_transf,
929
938
  workspace.orig_mean, workspace.orig_sd,
930
939
  workspace.left_tail, workspace.right_tail, workspace.orig_target_col,
931
- model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier);
940
+ model_params.max_perc_outliers, model_params.z_norm, model_params.z_outlier,
941
+ false);
932
942
  workspace.lev_has_outliers = workspace.has_outliers? true : workspace.lev_has_outliers;
933
943
 
934
944
  if (model_params.follow_all && ((curr_depth + 1) < model_params.max_depth)) {
@@ -733,7 +733,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
733
733
  size_t *restrict outlier_depth, Cluster &cluster, std::vector<Cluster> &clusters, size_t cluster_num, size_t tree_num, size_t tree_depth,
734
734
  bool is_log_transf, double log_minval, bool is_exp_transf, double orig_mean, double orig_sd,
735
735
  double left_tail, double right_tail, double *restrict orig_x,
736
- double max_perc_outliers, double z_norm, double z_outlier);
736
+ double max_perc_outliers, double z_norm, double z_outlier,
737
+ bool check_nonneg_outliers);
737
738
  void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
738
739
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
739
740
  size_t *restrict outlier_depth, Cluster &cluster,
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: outliertree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.1
4
+ version: 0.4.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-12-20 00:00:00.000000000 Z
11
+ date: 2024-06-12 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -16,14 +16,14 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: 4.0.2
19
+ version: '4.3'
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: 4.0.2
26
+ version: '4.3'
27
27
  description:
28
28
  email: andrew@ankane.org
29
29
  executables: []
@@ -68,14 +68,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
68
68
  requirements:
69
69
  - - ">="
70
70
  - !ruby/object:Gem::Version
71
- version: '2.7'
71
+ version: '3.1'
72
72
  required_rubygems_version: !ruby/object:Gem::Requirement
73
73
  requirements:
74
74
  - - ">="
75
75
  - !ruby/object:Gem::Version
76
76
  version: '0'
77
77
  requirements: []
78
- rubygems_version: 3.4.10
78
+ rubygems_version: 3.5.9
79
79
  signing_key:
80
80
  specification_version: 4
81
81
  summary: Explainable outlier/anomaly detection for Ruby