outliertree 0.2.1 → 0.3.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -121,8 +121,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
121
121
  bool has_low_values = false;
122
122
  bool has_high_values = false;
123
123
  long double running_mean = 0;
124
- long double mean_prev = 0;
125
124
  long double running_ssq = 0;
125
+ long double mean_prev = 0;
126
126
  double xval;
127
127
  double mean;
128
128
  double sd;
@@ -134,6 +134,14 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
134
134
  size_t end_normals = 0;
135
135
  double min_gap = z_outlier - z_norm;
136
136
 
137
+ double curr_gap, next_gap, eps, lim_by_orig;
138
+
139
+ /* Note: there is no good reason and no theory behind these numbers.
140
+ TODO: find a better way of setting this */
141
+ double min_gap_orig_scale = log(sqrtl((long double)(end - st + 1))) / 2.;
142
+ min_gap_orig_scale = std::fmax(1.1, min_gap_orig_scale);
143
+ min_gap_orig_scale = std::fmin(2.5, min_gap_orig_scale);
144
+
137
145
  /* TODO: here it's not necessary to sort the whole data, only top/bottom N */
138
146
 
139
147
  /* sort the data */
@@ -141,6 +149,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
141
149
 
142
150
  /* calculate statistics with tails and previous outliers excluded */
143
151
  cnt = end_non_tail - st_non_tail + 1;
152
+ mean_prev = x[ ix_arr[st_non_tail] ];
144
153
  for (size_t row = st_non_tail; row <= end_non_tail; row++) {
145
154
  xval = x[ ix_arr[row] ];
146
155
  running_mean += (xval - running_mean) / (long double)(row - st_non_tail + 1);
@@ -157,10 +166,16 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
157
166
  if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
158
167
  sd *= 0.5;
159
168
  }
169
+ sd = std::fmax(sd, 1e-15);
170
+ while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
171
+ sd *= 4;
160
172
  cluster.cluster_mean = mean;
161
173
  cluster.cluster_sd = sd;
162
174
  cnt = end - st + 1;
163
175
 
176
+ /* TODO: review how to better set this limit */
177
+ tail_size = std::min(tail_size, log2ceil(end - st + 1));
178
+
164
179
  /* see if the minimum and/or maximum values qualify for outliers */
165
180
  if (-z_score(x[ix_arr[st]], mean, sd) >= z_outlier && x[ix_arr[st]] > left_tail) has_low_values = true;
166
181
  if ( z_score(x[ix_arr[end]], mean, sd) >= z_outlier && x[ix_arr[end]] < right_tail) has_high_values = true;
@@ -170,6 +185,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
170
185
  for (size_t row = st; row < st + tail_size; row++) {
171
186
 
172
187
  if (( z_score(x[ix_arr[row + 1]], mean, sd) - z_score(x[ix_arr[row]], mean, sd) ) >= min_gap) {
188
+
189
+ /* if the variable was transformed, check that the gap is still wide in the original scale */
190
+ if (is_exp_transf || is_log_transf) {
191
+ curr_gap = orig_x[ix_arr[row + 1]] - orig_x[ix_arr[row]];
192
+ next_gap = 0;
193
+ for (size_t rr = row + 1; rr < end; rr++) {
194
+ if (orig_x[ix_arr[rr+1]] > orig_x[ix_arr[rr]]) {
195
+ next_gap = orig_x[ix_arr[rr+1]] - orig_x[ix_arr[rr]];
196
+ break;
197
+ }
198
+ }
199
+
200
+ if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
201
+ continue;
202
+ }
203
+
173
204
  st_normals = row + 1;
174
205
  if (is_exp_transf) {
175
206
  cluster.lower_lim = log(x[ix_arr[row + 1]] - min_gap * sd) * orig_sd + orig_mean;
@@ -180,6 +211,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
180
211
  }
181
212
  cluster.display_lim_low = orig_x[ix_arr[row + 1]];
182
213
  cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
214
+
215
+ eps = 1e-15;
216
+ while (cluster.display_lim_low <= cluster.lower_lim) {
217
+ cluster.lower_lim -= eps;
218
+ eps *= 4;
219
+ }
183
220
  break;
184
221
  }
185
222
  if (z_score(x[ix_arr[row]], mean, sd) > -z_outlier) break;
@@ -233,6 +270,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
233
270
  cluster.lower_lim = exp(x[ix_arr[st]] - min_gap * sd) + log_minval;
234
271
  }
235
272
 
273
+ if (cluster.lower_lim > -HUGE_VAL) {
274
+ eps = 1e-15;
275
+ while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
276
+ cluster.lower_lim -= eps;
277
+ eps *= 4.;
278
+ }
279
+ }
280
+
281
+ if (is_exp_transf || is_log_transf) {
282
+ for (size_t row = st; row < end; row++) {
283
+ if (orig_x[ix_arr[row+1]] > orig_x[ix_arr[row]]) {
284
+ curr_gap = orig_x[ix_arr[row+1]] - orig_x[ix_arr[row]];
285
+ lim_by_orig = orig_x[ix_arr[st]] - min_gap_orig_scale * curr_gap;
286
+ cluster.lower_lim = std::fmin(cluster.lower_lim, lim_by_orig);
287
+ break;
288
+ }
289
+ }
290
+ }
291
+
236
292
  cluster.display_lim_low = orig_x[ix_arr[st]];
237
293
 
238
294
  }
@@ -241,6 +297,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
241
297
  for (size_t row = end; row > (end - tail_size); row--) {
242
298
 
243
299
  if (( z_score(x[ix_arr[row]], mean, sd) - z_score(x[ix_arr[row - 1]], mean, sd) ) >= min_gap) {
300
+
301
+ /* if the variable was transformed, check that the gap is still wide in the original scale */
302
+ if (is_exp_transf || is_log_transf) {
303
+ curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row - 1]];
304
+ next_gap = 0;
305
+ for (size_t rr = row-1; rr > st; rr--) {
306
+ if (orig_x[ix_arr[rr]] > orig_x[ix_arr[rr-1]]) {
307
+ next_gap = orig_x[ix_arr[rr]] - orig_x[ix_arr[rr-1]];
308
+ break;
309
+ }
310
+ }
311
+
312
+ if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
313
+ continue;
314
+ }
315
+
244
316
  end_normals = row - 1;
245
317
  if (is_exp_transf) {
246
318
  cluster.upper_lim = log(x[ix_arr[row - 1]] + min_gap * sd) * orig_sd + orig_mean;
@@ -251,6 +323,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
251
323
  }
252
324
  cluster.display_lim_high = orig_x[ix_arr[row - 1]];
253
325
  cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
326
+
327
+ eps = 1e-15;
328
+ while (cluster.display_lim_high >= cluster.upper_lim) {
329
+ cluster.upper_lim += eps;
330
+ eps *= 4;
331
+ }
254
332
  break;
255
333
  }
256
334
  if (z_score(x[ix_arr[row]], mean, sd) < z_outlier) break;
@@ -305,6 +383,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
305
383
  cluster.upper_lim = exp(x[ix_arr[end]] + min_gap * sd) + log_minval;
306
384
  }
307
385
 
386
+ if (cluster.upper_lim < HUGE_VAL) {
387
+ eps = 1e-15;
388
+ while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
389
+ cluster.upper_lim += eps;
390
+ eps *= 4.;
391
+ }
392
+ }
393
+
394
+ if (is_exp_transf || is_log_transf) { /* transformed variable: also bound the upper limit by a gap measured in the original scale */
395
+ for (size_t row = end; row > st; row--) { /* scan downwards from the maximum; 'row > st' keeps row-1 inside [st, end] (a 'row < st' test never iterates when end >= st) */
396
+ if (orig_x[ix_arr[row]] > orig_x[ix_arr[row-1]]) { /* first strictly positive gap between consecutive sorted values */
397
+ curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row-1]];
398
+ lim_by_orig = orig_x[ix_arr[end]] + min_gap_orig_scale * curr_gap; /* maximum plus the scaled gap, mirroring the lower-limit branch */
399
+ cluster.upper_lim = std::fmax(cluster.upper_lim, lim_by_orig); /* only ever widens the upper limit */
400
+ break;
401
+ }
402
+ }
403
+ }
404
+
308
405
  cluster.display_lim_high = orig_x[ix_arr[end]];
309
406
  }
310
407
 
@@ -313,8 +410,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
313
410
  size_t st_disp = has_low_values? st_normals : st;
314
411
  size_t end_disp = has_high_values? end_normals : end;
315
412
  running_mean = 0;
316
- mean_prev = 0;
317
413
  running_ssq = 0;
414
+ mean_prev = orig_x[ix_arr[st_disp]];
318
415
  for (size_t row = st_disp; row <= end_disp; row++) {
319
416
  xval = orig_x[ix_arr[row]];
320
417
  running_mean += (xval - running_mean) / (long double)(row - st_disp + 1);
@@ -372,7 +469,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
372
469
  void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
373
470
  double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
374
471
  size_t *restrict outlier_depth, Cluster &cluster,
375
- size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm)
472
+ size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm)
376
473
  {
377
474
  size_t cnt_common = end - st + 1;
378
475
  cluster.cluster_size = cnt_common;
@@ -474,7 +571,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
474
571
  double max_perc_outliers, double z_norm, double z_outlier,
475
572
  long double *restrict perc_threshold, long double *restrict prop_prior,
476
573
  size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
477
- size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
574
+ size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
478
575
  bool *restrict drop_cluster)
479
576
  {
480
577
  bool found_outliers, new_is_outlier;
@@ -567,7 +664,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
567
664
 
568
665
  cluster.perc_in_subset = (long double) buffer_categ_counts[cluster.categ_maj] / tot_dbl;
569
666
  for (size_t cat = 0; cat < ncateg; cat++) {
570
- if (cat == cluster.categ_maj)
667
+ if ((int)cat == cluster.categ_maj)
571
668
  continue;
572
669
  if (cluster.subset_common[cat] != 0) {
573
670
  cluster.score_categ[cat] = (long double)(tot - buffer_categ_counts[cluster.categ_maj] + 1)
@@ -626,10 +723,10 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
626
723
  } else {
627
724
 
628
725
  size_subset_excl = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
629
- [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
726
+ [](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
630
727
  if (size_subset_excl > 0) continue;
631
728
  size_subset = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
632
- [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
729
+ [](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
633
730
  if (size_subset == 1) {
634
731
 
635
732
  do {col_equal++;} while (clusters[clust].split_subset[col_equal] <= 0);
@@ -681,6 +778,7 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
681
778
  break;
682
779
  }
683
780
 
781
+ default: {}
684
782
  }
685
783
 
686
784
  }
@@ -711,7 +809,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
711
809
  case Categorical:
712
810
  {
713
811
  size_subset_excl = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
714
- [](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
812
+ [](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
715
813
  if (size_subset_excl > 0) continue;
716
814
 
717
815
  col_equal = -1;
@@ -747,6 +845,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
747
845
  trees[tree].split_this_branch = Equal;
748
846
  break;
749
847
  }
848
+
849
+ default: {}
750
850
  }
751
851
  }
752
852
 
@@ -755,7 +855,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
755
855
  else {
756
856
 
757
857
  size_subset = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
758
- [](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
858
+ [](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
759
859
  if (size_subset == 1) {
760
860
 
761
861
  do {col_equal++;} while (trees[tree].split_subset[col_equal] <= 0);
@@ -778,6 +878,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
778
878
  trees[tree].split_this_branch = Equal;
779
879
  break;
780
880
  }
881
+
882
+ default: {}
781
883
  }
782
884
  }
783
885
 
@@ -801,6 +903,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
801
903
  trees[tree].split_this_branch = Equal;
802
904
  break;
803
905
  }
906
+
907
+ default: {}
804
908
  }
805
909
  }
806
910
 
@@ -846,6 +950,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
846
950
  break;
847
951
  }
848
952
 
953
+ default: {}
849
954
  }
850
955
 
851
956
  }