outliertree 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -121,8 +121,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
121
121
|
bool has_low_values = false;
|
122
122
|
bool has_high_values = false;
|
123
123
|
long double running_mean = 0;
|
124
|
-
long double mean_prev = 0;
|
125
124
|
long double running_ssq = 0;
|
125
|
+
long double mean_prev = 0;
|
126
126
|
double xval;
|
127
127
|
double mean;
|
128
128
|
double sd;
|
@@ -134,6 +134,14 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
134
134
|
size_t end_normals = 0;
|
135
135
|
double min_gap = z_outlier - z_norm;
|
136
136
|
|
137
|
+
double curr_gap, next_gap, eps, lim_by_orig;
|
138
|
+
|
139
|
+
/* Note: there is no good reason and no theory behind these numbers.
|
140
|
+
TODO: find a better way of setting this */
|
141
|
+
double min_gap_orig_scale = log(sqrtl((long double)(end - st + 1))) / 2.;
|
142
|
+
min_gap_orig_scale = std::fmax(1.1, min_gap_orig_scale);
|
143
|
+
min_gap_orig_scale = std::fmin(2.5, min_gap_orig_scale);
|
144
|
+
|
137
145
|
/* TODO: here it's not necessary to sort the whole data, only top/bottom N */
|
138
146
|
|
139
147
|
/* sort the data */
|
@@ -141,6 +149,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
141
149
|
|
142
150
|
/* calculate statistics with tails and previous outliers excluded */
|
143
151
|
cnt = end_non_tail - st_non_tail + 1;
|
152
|
+
mean_prev = x[ ix_arr[st_non_tail] ];
|
144
153
|
for (size_t row = st_non_tail; row <= end_non_tail; row++) {
|
145
154
|
xval = x[ ix_arr[row] ];
|
146
155
|
running_mean += (xval - running_mean) / (long double)(row - st_non_tail + 1);
|
@@ -157,10 +166,16 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
157
166
|
if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
|
158
167
|
sd *= 0.5;
|
159
168
|
}
|
169
|
+
sd = std::fmax(sd, 1e-15);
|
170
|
+
while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
|
171
|
+
sd *= 4;
|
160
172
|
cluster.cluster_mean = mean;
|
161
173
|
cluster.cluster_sd = sd;
|
162
174
|
cnt = end - st + 1;
|
163
175
|
|
176
|
+
/* TODO: review how to better set this limit */
|
177
|
+
tail_size = std::min(tail_size, log2ceil(end - st + 1));
|
178
|
+
|
164
179
|
/* see if the minimum and/or maximum values qualify for outliers */
|
165
180
|
if (-z_score(x[ix_arr[st]], mean, sd) >= z_outlier && x[ix_arr[st]] > left_tail) has_low_values = true;
|
166
181
|
if ( z_score(x[ix_arr[end]], mean, sd) >= z_outlier && x[ix_arr[end]] < right_tail) has_high_values = true;
|
@@ -170,6 +185,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
170
185
|
for (size_t row = st; row < st + tail_size; row++) {
|
171
186
|
|
172
187
|
if (( z_score(x[ix_arr[row + 1]], mean, sd) - z_score(x[ix_arr[row]], mean, sd) ) >= min_gap) {
|
188
|
+
|
189
|
+
/* if the variable was transformed, check that the gap is still wide in the original scale */
|
190
|
+
if (is_exp_transf || is_log_transf) {
|
191
|
+
curr_gap = orig_x[ix_arr[row + 1]] - orig_x[ix_arr[row]];
|
192
|
+
next_gap = 0;
|
193
|
+
for (size_t rr = row + 1; rr < end; rr++) {
|
194
|
+
if (orig_x[ix_arr[rr+1]] > orig_x[ix_arr[rr]]) {
|
195
|
+
next_gap = orig_x[ix_arr[rr+1]] - orig_x[ix_arr[rr]];
|
196
|
+
break;
|
197
|
+
}
|
198
|
+
}
|
199
|
+
|
200
|
+
if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
|
201
|
+
continue;
|
202
|
+
}
|
203
|
+
|
173
204
|
st_normals = row + 1;
|
174
205
|
if (is_exp_transf) {
|
175
206
|
cluster.lower_lim = log(x[ix_arr[row + 1]] - min_gap * sd) * orig_sd + orig_mean;
|
@@ -180,6 +211,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
180
211
|
}
|
181
212
|
cluster.display_lim_low = orig_x[ix_arr[row + 1]];
|
182
213
|
cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
|
214
|
+
|
215
|
+
eps = 1e-15;
|
216
|
+
while (cluster.display_lim_low <= cluster.lower_lim) {
|
217
|
+
cluster.lower_lim -= eps;
|
218
|
+
eps *= 4;
|
219
|
+
}
|
183
220
|
break;
|
184
221
|
}
|
185
222
|
if (z_score(x[ix_arr[row]], mean, sd) > -z_outlier) break;
|
@@ -233,6 +270,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
233
270
|
cluster.lower_lim = exp(x[ix_arr[st]] - min_gap * sd) + log_minval;
|
234
271
|
}
|
235
272
|
|
273
|
+
if (cluster.lower_lim > -HUGE_VAL) {
|
274
|
+
eps = 1e-15;
|
275
|
+
while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
|
276
|
+
cluster.lower_lim -= eps;
|
277
|
+
eps *= 4.;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
|
281
|
+
if (is_exp_transf || is_log_transf) {
|
282
|
+
for (size_t row = st; row < end; row++) {
|
283
|
+
if (orig_x[ix_arr[row+1]] > orig_x[ix_arr[row]]) {
|
284
|
+
curr_gap = orig_x[ix_arr[row+1]] - orig_x[ix_arr[row]];
|
285
|
+
lim_by_orig = orig_x[ix_arr[st]] - min_gap_orig_scale * curr_gap;
|
286
|
+
cluster.lower_lim = std::fmin(cluster.lower_lim, lim_by_orig);
|
287
|
+
break;
|
288
|
+
}
|
289
|
+
}
|
290
|
+
}
|
291
|
+
|
236
292
|
cluster.display_lim_low = orig_x[ix_arr[st]];
|
237
293
|
|
238
294
|
}
|
@@ -241,6 +297,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
241
297
|
for (size_t row = end; row > (end - tail_size); row--) {
|
242
298
|
|
243
299
|
if (( z_score(x[ix_arr[row]], mean, sd) - z_score(x[ix_arr[row - 1]], mean, sd) ) >= min_gap) {
|
300
|
+
|
301
|
+
/* if the variable was transformed, check that the gap is still wide in the original scale */
|
302
|
+
if (is_exp_transf || is_log_transf) {
|
303
|
+
curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row - 1]];
|
304
|
+
next_gap = 0;
|
305
|
+
for (size_t rr = row-1; rr > st; rr--) {
|
306
|
+
if (orig_x[ix_arr[rr]] > orig_x[ix_arr[rr-1]]) {
|
307
|
+
next_gap = orig_x[ix_arr[rr]] - orig_x[ix_arr[rr-1]];
|
308
|
+
break;
|
309
|
+
}
|
310
|
+
}
|
311
|
+
|
312
|
+
if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
|
313
|
+
continue;
|
314
|
+
}
|
315
|
+
|
244
316
|
end_normals = row - 1;
|
245
317
|
if (is_exp_transf) {
|
246
318
|
cluster.upper_lim = log(x[ix_arr[row - 1]] + min_gap * sd) * orig_sd + orig_mean;
|
@@ -251,6 +323,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
251
323
|
}
|
252
324
|
cluster.display_lim_high = orig_x[ix_arr[row - 1]];
|
253
325
|
cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
|
326
|
+
|
327
|
+
eps = 1e-15;
|
328
|
+
while (cluster.display_lim_high >= cluster.upper_lim) {
|
329
|
+
cluster.upper_lim += eps;
|
330
|
+
eps *= 4;
|
331
|
+
}
|
254
332
|
break;
|
255
333
|
}
|
256
334
|
if (z_score(x[ix_arr[row]], mean, sd) < z_outlier) break;
|
@@ -305,6 +383,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
305
383
|
cluster.upper_lim = exp(x[ix_arr[end]] + min_gap * sd) + log_minval;
|
306
384
|
}
|
307
385
|
|
386
|
+
if (cluster.upper_lim < HUGE_VAL) {
|
387
|
+
eps = 1e-15;
|
388
|
+
while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
|
389
|
+
cluster.upper_lim += eps;
|
390
|
+
eps *= 4.;
|
391
|
+
}
|
392
|
+
}
|
393
|
+
|
394
|
+
if (is_exp_transf || is_log_transf) {
|
395
|
+
for (size_t row = end; row < st; row--) {
|
396
|
+
if (orig_x[ix_arr[row]] > orig_x[ix_arr[row-1]]) {
|
397
|
+
curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row-1]];
|
398
|
+
lim_by_orig = orig_x[ix_arr[end]] + min_gap_orig_scale * curr_gap;
|
399
|
+
cluster.upper_lim = std::fmax(cluster.upper_lim, lim_by_orig);
|
400
|
+
break;
|
401
|
+
}
|
402
|
+
}
|
403
|
+
}
|
404
|
+
|
308
405
|
cluster.display_lim_high = orig_x[ix_arr[end]];
|
309
406
|
}
|
310
407
|
|
@@ -313,8 +410,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
313
410
|
size_t st_disp = has_low_values? st_normals : st;
|
314
411
|
size_t end_disp = has_high_values? end_normals : end;
|
315
412
|
running_mean = 0;
|
316
|
-
mean_prev = 0;
|
317
413
|
running_ssq = 0;
|
414
|
+
mean_prev = orig_x[ix_arr[st_disp]];
|
318
415
|
for (size_t row = st_disp; row <= end_disp; row++) {
|
319
416
|
xval = orig_x[ix_arr[row]];
|
320
417
|
running_mean += (xval - running_mean) / (long double)(row - st_disp + 1);
|
@@ -372,7 +469,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
372
469
|
void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
|
373
470
|
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
374
471
|
size_t *restrict outlier_depth, Cluster &cluster,
|
375
|
-
size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm)
|
472
|
+
size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm)
|
376
473
|
{
|
377
474
|
size_t cnt_common = end - st + 1;
|
378
475
|
cluster.cluster_size = cnt_common;
|
@@ -474,7 +571,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
|
|
474
571
|
double max_perc_outliers, double z_norm, double z_outlier,
|
475
572
|
long double *restrict perc_threshold, long double *restrict prop_prior,
|
476
573
|
size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
|
477
|
-
size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
|
574
|
+
size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
|
478
575
|
bool *restrict drop_cluster)
|
479
576
|
{
|
480
577
|
bool found_outliers, new_is_outlier;
|
@@ -567,7 +664,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
|
|
567
664
|
|
568
665
|
cluster.perc_in_subset = (long double) buffer_categ_counts[cluster.categ_maj] / tot_dbl;
|
569
666
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
570
|
-
if (cat == cluster.categ_maj)
|
667
|
+
if ((int)cat == cluster.categ_maj)
|
571
668
|
continue;
|
572
669
|
if (cluster.subset_common[cat] != 0) {
|
573
670
|
cluster.score_categ[cat] = (long double)(tot - buffer_categ_counts[cluster.categ_maj] + 1)
|
@@ -626,10 +723,10 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
|
|
626
723
|
} else {
|
627
724
|
|
628
725
|
size_subset_excl = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
|
629
|
-
[](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
|
726
|
+
[](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
|
630
727
|
if (size_subset_excl > 0) continue;
|
631
728
|
size_subset = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
|
632
|
-
[](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
|
729
|
+
[](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
|
633
730
|
if (size_subset == 1) {
|
634
731
|
|
635
732
|
do {col_equal++;} while (clusters[clust].split_subset[col_equal] <= 0);
|
@@ -681,6 +778,7 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
|
|
681
778
|
break;
|
682
779
|
}
|
683
780
|
|
781
|
+
default: {}
|
684
782
|
}
|
685
783
|
|
686
784
|
}
|
@@ -711,7 +809,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
711
809
|
case Categorical:
|
712
810
|
{
|
713
811
|
size_subset_excl = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
|
714
|
-
[](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
|
812
|
+
[](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
|
715
813
|
if (size_subset_excl > 0) continue;
|
716
814
|
|
717
815
|
col_equal = -1;
|
@@ -747,6 +845,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
747
845
|
trees[tree].split_this_branch = Equal;
|
748
846
|
break;
|
749
847
|
}
|
848
|
+
|
849
|
+
default: {}
|
750
850
|
}
|
751
851
|
}
|
752
852
|
|
@@ -755,7 +855,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
755
855
|
else {
|
756
856
|
|
757
857
|
size_subset = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
|
758
|
-
[](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
|
858
|
+
[](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
|
759
859
|
if (size_subset == 1) {
|
760
860
|
|
761
861
|
do {col_equal++;} while (trees[tree].split_subset[col_equal] <= 0);
|
@@ -778,6 +878,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
778
878
|
trees[tree].split_this_branch = Equal;
|
779
879
|
break;
|
780
880
|
}
|
881
|
+
|
882
|
+
default: {}
|
781
883
|
}
|
782
884
|
}
|
783
885
|
|
@@ -801,6 +903,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
801
903
|
trees[tree].split_this_branch = Equal;
|
802
904
|
break;
|
803
905
|
}
|
906
|
+
|
907
|
+
default: {}
|
804
908
|
}
|
805
909
|
}
|
806
910
|
|
@@ -846,6 +950,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
846
950
|
break;
|
847
951
|
}
|
848
952
|
|
953
|
+
default: {}
|
849
954
|
}
|
850
955
|
|
851
956
|
}
|