outliertree 0.2.1 → 0.3.0

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -121,8 +121,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
121
121
  bool has_low_values = false;
122
122
  bool has_high_values = false;
123
123
  long double running_mean = 0;
124
- long double mean_prev = 0;
125
124
  long double running_ssq = 0;
125
+ long double mean_prev = 0;
126
126
  double xval;
127
127
  double mean;
128
128
  double sd;
@@ -134,6 +134,14 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
134
134
  size_t end_normals = 0;
135
135
  double min_gap = z_outlier - z_norm;
136
136
 
137
+ double curr_gap, next_gap, eps, lim_by_orig;
138
+
139
+ /* Note: there is no good reason and no theory behind these numbers.
140
+ TODO: find a better way of setting this */
141
+ double min_gap_orig_scale = log(sqrtl((long double)(end - st + 1))) / 2.;
142
+ min_gap_orig_scale = std::fmax(1.1, min_gap_orig_scale);
143
+ min_gap_orig_scale = std::fmin(2.5, min_gap_orig_scale);
144
+
137
145
  /* TODO: here it's not necessary to sort the whole data, only top/bottom N */
138
146
 
139
147
  /* sort the data */
@@ -141,6 +149,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
141
149
 
142
150
  /* calculate statistics with tails and previous outliers excluded */
143
151
  cnt = end_non_tail - st_non_tail + 1;
152
+ mean_prev = x[ ix_arr[st_non_tail] ];
144
153
  for (size_t row = st_non_tail; row <= end_non_tail; row++) {
145
154
  xval = x[ ix_arr[row] ];
146
155
  running_mean += (xval - running_mean) / (long double)(row - st_non_tail + 1);
@@ -157,10 +166,16 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
157
166
  if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
158
167
  sd *= 0.5;
159
168
  }
169
+ sd = std::fmax(sd, 1e-15);
170
+ while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
171
+ sd *= 4;
160
172
  cluster.cluster_mean = mean;
161
173
  cluster.cluster_sd = sd;
162
174
  cnt = end - st + 1;
163
175
 
176
+ /* TODO: review how to better set this limit */
177
+ tail_size = std::min(tail_size, log2ceil(end - st + 1));
178
+
164
179
  /* see if the minimum and/or maximum values qualify for outliers */
165
180
  if (-z_score(x[ix_arr[st]], mean, sd) >= z_outlier && x[ix_arr[st]] > left_tail) has_low_values = true;
166
181
  if ( z_score(x[ix_arr[end]], mean, sd) >= z_outlier && x[ix_arr[end]] < right_tail) has_high_values = true;
@@ -170,6 +185,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
170
185
  for (size_t row = st; row < st + tail_size; row++) {
171
186
 
172
187
  if (( z_score(x[ix_arr[row + 1]], mean, sd) - z_score(x[ix_arr[row]], mean, sd) ) >= min_gap) {
188
+
189
+ /* if the variable was transformed, check that the gap is still wide in the original scale */
190
+ if (is_exp_transf || is_log_transf) {
191
+ curr_gap = orig_x[ix_arr[row + 1]] - orig_x[ix_arr[row]];
192
+ next_gap = 0;
193
+ for (size_t rr = row + 1; rr < end; rr++) {
194
+ if (orig_x[ix_arr[rr+1]] > orig_x[ix_arr[rr]]) {
195
+ next_gap = orig_x[ix_arr[rr+1]] - orig_x[ix_arr[rr]];
196
+ break;
197
+ }
198
+ }
199
+
200
+ if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
201
+ continue;
202
+ }
203
+
173
204
  st_normals = row + 1;
174
205
  if (is_exp_transf) {
175
206
  cluster.lower_lim = log(x[ix_arr[row + 1]] - min_gap * sd) * orig_sd + orig_mean;
@@ -180,6 +211,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
180
211
  }
181
212
  cluster.display_lim_low = orig_x[ix_arr[row + 1]];
182
213
  cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
214
+
215
+ eps = 1e-15;
216
+ while (cluster.display_lim_low <= cluster.lower_lim) {
217
+ cluster.lower_lim -= eps;
218
+ eps *= 4;
219
+ }
183
220
  break;
184
221
  }
185
222
  if (z_score(x[ix_arr[row]], mean, sd) > -z_outlier) break;
@@ -233,6 +270,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
233
270
  cluster.lower_lim = exp(x[ix_arr[st]] - min_gap * sd) + log_minval;
234
271
  }
235
272
 
273
+ if (cluster.lower_lim > -HUGE_VAL) {
274
+ eps = 1e-15;
275
+ while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
276
+ cluster.lower_lim -= eps;
277
+ eps *= 4.;
278
+ }
279
+ }
280
+
281
+ if (is_exp_transf || is_log_transf) {
282
+ for (size_t row = st; row < end; row++) {
283
+ if (orig_x[ix_arr[row+1]] > orig_x[ix_arr[row]]) {
284
+ curr_gap = orig_x[ix_arr[row+1]] - orig_x[ix_arr[row]];
285
+ lim_by_orig = orig_x[ix_arr[st]] - min_gap_orig_scale * curr_gap;
286
+ cluster.lower_lim = std::fmin(cluster.lower_lim, lim_by_orig);
287
+ break;
288
+ }
289
+ }
290
+ }
291
+
236
292
  cluster.display_lim_low = orig_x[ix_arr[st]];
237
293
 
238
294
  }
@@ -241,6 +297,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
241
297
  for (size_t row = end; row > (end - tail_size); row--) {
242
298
 
243
299
  if (( z_score(x[ix_arr[row]], mean, sd) - z_score(x[ix_arr[row - 1]], mean, sd) ) >= min_gap) {
300
+
301
+ /* if the variable was transformed, check that the gap is still wide in the original scale */
302
+ if (is_exp_transf || is_log_transf) {
303
+ curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row - 1]];
304
+ next_gap = 0;
305
+ for (size_t rr = row-1; rr > st; rr--) {
306
+ if (orig_x[ix_arr[rr]] > orig_x[ix_arr[rr-1]]) {
307
+ next_gap = orig_x[ix_arr[rr]] - orig_x[ix_arr[rr-1]];
308
+ break;
309
+ }
310
+ }
311
+
312
+ if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
313
+ continue;
314
+ }
315
+
244
316
  end_normals = row - 1;
245
317
  if (is_exp_transf) {
246
318
  cluster.upper_lim = log(x[ix_arr[row - 1]] + min_gap * sd) * orig_sd + orig_mean;
@@ -251,6 +323,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
251
323
  }
252
324
  cluster.display_lim_high = orig_x[ix_arr[row - 1]];
253
325
  cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
326
+
327
+ eps = 1e-15;
328
+ while (cluster.display_lim_high >= cluster.upper_lim) {
329
+ cluster.upper_lim += eps;
330
+ eps *= 4;
331
+ }
254
332
  break;
255
333
  }
256
334
  if (z_score(x[ix_arr[row]], mean, sd) < z_outlier) break;
@@ -305,6 +383,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
305
383
  cluster.upper_lim = exp(x[ix_arr[end]] + min_gap * sd) + log_minval;
306
384
  }
307
385
 
386
+ if (cluster.upper_lim < HUGE_VAL) {
387
+ eps = 1e-15;
388
+ while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
389
+ cluster.upper_lim += eps;
390
+ eps *= 4.;
391
+ }
392
+ }
393
+
394
+ if (is_exp_transf || is_log_transf) {
395
+ for (size_t row = end; row < st; row--) {
396
+ if (orig_x[ix_arr[row]] > orig_x[ix_arr[row-1]]) {
397
+ curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row-1]];
398
+ lim_by_orig = orig_x[ix_arr[end]] + min_gap_orig_scale * curr_gap;
399
+ cluster.upper_lim = std::fmax(cluster.upper_lim, lim_by_orig);
400
+ break;
401
+ }
402
+ }
403
+ }
404
+
308
405
  cluster.display_lim_high = orig_x[ix_arr[end]];
309
406
  }
310
407
 
@@ -313,8 +410,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
313
410
  size_t st_disp = has_low_values? st_normals : st;
314
411
  size_t end_disp = has_high_values? end_normals : end;
315
412
  running_mean = 0;
316
- mean_prev = 0;
317
413
  running_ssq = 0;
414
+ mean_prev = orig_x[ix_arr[st_disp]];
318
415
  for (size_t row = st_disp; row <= end_disp; row++) {
319
416
  xval = orig_x[ix_arr[row]];
320
417
  running_mean += (xval - running_mean) / (long double)(row - st_disp + 1);
@@ -372,7 +469,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
372
469
  void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
373
470
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
374
471
  size_t *restrict outlier_depth, Cluster &cluster,
375
- size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm)
472
+ size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm)
376
473
  {
377
474
  size_t cnt_common = end - st + 1;
378
475
  cluster.cluster_size = cnt_common;
@@ -474,7 +571,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
474
571
  double max_perc_outliers, double z_norm, double z_outlier,
475
572
  long double *restrict perc_threshold, long double *restrict prop_prior,
476
573
  size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
477
- size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
574
+ size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
478
575
  bool *restrict drop_cluster)
479
576
  {
480
577
  bool found_outliers, new_is_outlier;
@@ -567,7 +664,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
567
664
 
568
665
  cluster.perc_in_subset = (long double) buffer_categ_counts[cluster.categ_maj] / tot_dbl;
569
666
  for (size_t cat = 0; cat < ncateg; cat++) {
570
- if (cat == cluster.categ_maj)
667
+ if ((int)cat == cluster.categ_maj)
571
668
  continue;
572
669
  if (cluster.subset_common[cat] != 0) {
573
670
  cluster.score_categ[cat] = (long double)(tot - buffer_categ_counts[cluster.categ_maj] + 1)
@@ -626,10 +723,10 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
626
723
  } else {
627
724
 
628
725
  size_subset_excl = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
629
- [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
726
+ [](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
630
727
  if (size_subset_excl > 0) continue;
631
728
  size_subset = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
632
- [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
729
+ [](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
633
730
  if (size_subset == 1) {
634
731
 
635
732
  do {col_equal++;} while (clusters[clust].split_subset[col_equal] <= 0);
@@ -681,6 +778,7 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
681
778
  break;
682
779
  }
683
780
 
781
+ default: {}
684
782
  }
685
783
 
686
784
  }
@@ -711,7 +809,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
711
809
  case Categorical:
712
810
  {
713
811
  size_subset_excl = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
714
- [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
812
+ [](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
715
813
  if (size_subset_excl > 0) continue;
716
814
 
717
815
  col_equal = -1;
@@ -747,6 +845,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
747
845
  trees[tree].split_this_branch = Equal;
748
846
  break;
749
847
  }
848
+
849
+ default: {}
750
850
  }
751
851
  }
752
852
 
@@ -755,7 +855,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
755
855
  else {
756
856
 
757
857
  size_subset = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
758
- [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
858
+ [](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
759
859
  if (size_subset == 1) {
760
860
 
761
861
  do {col_equal++;} while (trees[tree].split_subset[col_equal] <= 0);
@@ -778,6 +878,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
778
878
  trees[tree].split_this_branch = Equal;
779
879
  break;
780
880
  }
881
+
882
+ default: {}
781
883
  }
782
884
  }
783
885
 
@@ -801,6 +903,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
801
903
  trees[tree].split_this_branch = Equal;
802
904
  break;
803
905
  }
906
+
907
+ default: {}
804
908
  }
805
909
  }
806
910
 
@@ -846,6 +950,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
846
950
  break;
847
951
  }
848
952
 
953
+ default: {}
849
954
  }
850
955
 
851
956
  }