outliertree 0.1.2 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +104 -105
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +77 -40
- data/vendor/outliertree/src/Makevars.in +4 -0
- data/vendor/outliertree/src/Makevars.win +4 -0
- data/vendor/outliertree/src/RcppExports.cpp +20 -9
- data/vendor/outliertree/src/Rwrapper.cpp +256 -57
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +505 -308
- data/vendor/outliertree/src/misc.cpp +165 -4
- data/vendor/outliertree/src/outlier_tree.hpp +159 -51
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +10 -8
- data/vendor/outliertree/src/Makevars +0 -3
@@ -121,8 +121,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
121
121
|
bool has_low_values = false;
|
122
122
|
bool has_high_values = false;
|
123
123
|
long double running_mean = 0;
|
124
|
-
long double mean_prev = 0;
|
125
124
|
long double running_ssq = 0;
|
125
|
+
long double mean_prev = 0;
|
126
126
|
double xval;
|
127
127
|
double mean;
|
128
128
|
double sd;
|
@@ -134,6 +134,14 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
134
134
|
size_t end_normals = 0;
|
135
135
|
double min_gap = z_outlier - z_norm;
|
136
136
|
|
137
|
+
double curr_gap, next_gap, eps, lim_by_orig;
|
138
|
+
|
139
|
+
/* Note: there is no good reason and no theory behind these numbers.
|
140
|
+
TODO: find a better way of setting this */
|
141
|
+
double min_gap_orig_scale = log(sqrtl((long double)(end - st + 1))) / 2.;
|
142
|
+
min_gap_orig_scale = std::fmax(1.1, min_gap_orig_scale);
|
143
|
+
min_gap_orig_scale = std::fmin(2.5, min_gap_orig_scale);
|
144
|
+
|
137
145
|
/* TODO: here it's not necessary to sort the whole data, only top/bottom N */
|
138
146
|
|
139
147
|
/* sort the data */
|
@@ -141,6 +149,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
141
149
|
|
142
150
|
/* calculate statistics with tails and previous outliers excluded */
|
143
151
|
cnt = end_non_tail - st_non_tail + 1;
|
152
|
+
mean_prev = x[ ix_arr[st_non_tail] ];
|
144
153
|
for (size_t row = st_non_tail; row <= end_non_tail; row++) {
|
145
154
|
xval = x[ ix_arr[row] ];
|
146
155
|
running_mean += (xval - running_mean) / (long double)(row - st_non_tail + 1);
|
@@ -157,10 +166,16 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
157
166
|
if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
|
158
167
|
sd *= 0.5;
|
159
168
|
}
|
169
|
+
sd = std::fmax(sd, 1e-15);
|
170
|
+
while (std::numeric_limits<double>::epsilon() > sd*std::fmin(min_gap, z_norm))
|
171
|
+
sd *= 4;
|
160
172
|
cluster.cluster_mean = mean;
|
161
173
|
cluster.cluster_sd = sd;
|
162
174
|
cnt = end - st + 1;
|
163
175
|
|
176
|
+
/* TODO: review how to better set this limit */
|
177
|
+
tail_size = std::min(tail_size, log2ceil(end - st + 1));
|
178
|
+
|
164
179
|
/* see if the minimum and/or maximum values qualify for outliers */
|
165
180
|
if (-z_score(x[ix_arr[st]], mean, sd) >= z_outlier && x[ix_arr[st]] > left_tail) has_low_values = true;
|
166
181
|
if ( z_score(x[ix_arr[end]], mean, sd) >= z_outlier && x[ix_arr[end]] < right_tail) has_high_values = true;
|
@@ -170,6 +185,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
170
185
|
for (size_t row = st; row < st + tail_size; row++) {
|
171
186
|
|
172
187
|
if (( z_score(x[ix_arr[row + 1]], mean, sd) - z_score(x[ix_arr[row]], mean, sd) ) >= min_gap) {
|
188
|
+
|
189
|
+
/* if the variable was transformed, check that the gap is still wide in the original scale */
|
190
|
+
if (is_exp_transf || is_log_transf) {
|
191
|
+
curr_gap = orig_x[ix_arr[row + 1]] - orig_x[ix_arr[row]];
|
192
|
+
next_gap = 0;
|
193
|
+
for (size_t rr = row + 1; rr < end; rr++) {
|
194
|
+
if (orig_x[ix_arr[rr+1]] > orig_x[ix_arr[rr]]) {
|
195
|
+
next_gap = orig_x[ix_arr[rr+1]] - orig_x[ix_arr[rr]];
|
196
|
+
break;
|
197
|
+
}
|
198
|
+
}
|
199
|
+
|
200
|
+
if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
|
201
|
+
continue;
|
202
|
+
}
|
203
|
+
|
173
204
|
st_normals = row + 1;
|
174
205
|
if (is_exp_transf) {
|
175
206
|
cluster.lower_lim = log(x[ix_arr[row + 1]] - min_gap * sd) * orig_sd + orig_mean;
|
@@ -180,6 +211,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
180
211
|
}
|
181
212
|
cluster.display_lim_low = orig_x[ix_arr[row + 1]];
|
182
213
|
cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
|
214
|
+
|
215
|
+
eps = 1e-15;
|
216
|
+
while (cluster.display_lim_low <= cluster.lower_lim) {
|
217
|
+
cluster.lower_lim -= eps;
|
218
|
+
eps *= 4;
|
219
|
+
}
|
183
220
|
break;
|
184
221
|
}
|
185
222
|
if (z_score(x[ix_arr[row]], mean, sd) > -z_outlier) break;
|
@@ -233,6 +270,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
233
270
|
cluster.lower_lim = exp(x[ix_arr[st]] - min_gap * sd) + log_minval;
|
234
271
|
}
|
235
272
|
|
273
|
+
if (cluster.lower_lim > -HUGE_VAL) {
|
274
|
+
eps = 1e-15;
|
275
|
+
while (cluster.lower_lim >= orig_x[ix_arr[st]]) {
|
276
|
+
cluster.lower_lim -= eps;
|
277
|
+
eps *= 4.;
|
278
|
+
}
|
279
|
+
}
|
280
|
+
|
281
|
+
if (is_exp_transf || is_log_transf) {
|
282
|
+
for (size_t row = st; row < end; row++) {
|
283
|
+
if (orig_x[ix_arr[row+1]] > orig_x[ix_arr[row]]) {
|
284
|
+
curr_gap = orig_x[ix_arr[row+1]] - orig_x[ix_arr[row]];
|
285
|
+
lim_by_orig = orig_x[ix_arr[st]] - min_gap_orig_scale * curr_gap;
|
286
|
+
cluster.lower_lim = std::fmin(cluster.lower_lim, lim_by_orig);
|
287
|
+
break;
|
288
|
+
}
|
289
|
+
}
|
290
|
+
}
|
291
|
+
|
236
292
|
cluster.display_lim_low = orig_x[ix_arr[st]];
|
237
293
|
|
238
294
|
}
|
@@ -241,6 +297,22 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
241
297
|
for (size_t row = end; row > (end - tail_size); row--) {
|
242
298
|
|
243
299
|
if (( z_score(x[ix_arr[row]], mean, sd) - z_score(x[ix_arr[row - 1]], mean, sd) ) >= min_gap) {
|
300
|
+
|
301
|
+
/* if the variable was transformed, check that the gap is still wide in the original scale */
|
302
|
+
if (is_exp_transf || is_log_transf) {
|
303
|
+
curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row - 1]];
|
304
|
+
next_gap = 0;
|
305
|
+
for (size_t rr = row-1; rr > st; rr--) {
|
306
|
+
if (orig_x[ix_arr[rr]] > orig_x[ix_arr[rr-1]]) {
|
307
|
+
next_gap = orig_x[ix_arr[rr]] - orig_x[ix_arr[rr-1]];
|
308
|
+
break;
|
309
|
+
}
|
310
|
+
}
|
311
|
+
|
312
|
+
if (next_gap > 0 && curr_gap/next_gap < min_gap_orig_scale)
|
313
|
+
continue;
|
314
|
+
}
|
315
|
+
|
244
316
|
end_normals = row - 1;
|
245
317
|
if (is_exp_transf) {
|
246
318
|
cluster.upper_lim = log(x[ix_arr[row - 1]] + min_gap * sd) * orig_sd + orig_mean;
|
@@ -251,6 +323,12 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
251
323
|
}
|
252
324
|
cluster.display_lim_high = orig_x[ix_arr[row - 1]];
|
253
325
|
cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
|
326
|
+
|
327
|
+
eps = 1e-15;
|
328
|
+
while (cluster.display_lim_high >= cluster.upper_lim) {
|
329
|
+
cluster.upper_lim += eps;
|
330
|
+
eps *= 4;
|
331
|
+
}
|
254
332
|
break;
|
255
333
|
}
|
256
334
|
if (z_score(x[ix_arr[row]], mean, sd) < z_outlier) break;
|
@@ -305,6 +383,25 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
305
383
|
cluster.upper_lim = exp(x[ix_arr[end]] + min_gap * sd) + log_minval;
|
306
384
|
}
|
307
385
|
|
386
|
+
if (cluster.upper_lim < HUGE_VAL) {
|
387
|
+
eps = 1e-15;
|
388
|
+
while (cluster.upper_lim <= orig_x[ix_arr[end]]) {
|
389
|
+
cluster.upper_lim += eps;
|
390
|
+
eps *= 4.;
|
391
|
+
}
|
392
|
+
}
|
393
|
+
|
394
|
+
if (is_exp_transf || is_log_transf) {
|
395
|
+
for (size_t row = end; row > st; row--) { /* scan downward from the largest value; stop above st because row-1 is read below (the previous 'row < st' test was false on entry, so this widening never ran) */
|
396
|
+
if (orig_x[ix_arr[row]] > orig_x[ix_arr[row-1]]) {
|
397
|
+
curr_gap = orig_x[ix_arr[row]] - orig_x[ix_arr[row-1]];
|
398
|
+
lim_by_orig = orig_x[ix_arr[end]] + min_gap_orig_scale * curr_gap;
|
399
|
+
cluster.upper_lim = std::fmax(cluster.upper_lim, lim_by_orig);
|
400
|
+
break;
|
401
|
+
}
|
402
|
+
}
|
403
|
+
}
|
404
|
+
|
308
405
|
cluster.display_lim_high = orig_x[ix_arr[end]];
|
309
406
|
}
|
310
407
|
|
@@ -313,8 +410,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
313
410
|
size_t st_disp = has_low_values? st_normals : st;
|
314
411
|
size_t end_disp = has_high_values? end_normals : end;
|
315
412
|
running_mean = 0;
|
316
|
-
mean_prev = 0;
|
317
413
|
running_ssq = 0;
|
414
|
+
mean_prev = orig_x[ix_arr[st_disp]];
|
318
415
|
for (size_t row = st_disp; row <= end_disp; row++) {
|
319
416
|
xval = orig_x[ix_arr[row]];
|
320
417
|
running_mean += (xval - running_mean) / (long double)(row - st_disp + 1);
|
@@ -372,7 +469,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
372
469
|
void define_categ_cluster_no_cond(int *restrict x, size_t *restrict ix_arr, size_t st, size_t end, size_t ncateg,
|
373
470
|
double *restrict outlier_scores, size_t *restrict outlier_clusters, size_t *restrict outlier_trees,
|
374
471
|
size_t *restrict outlier_depth, Cluster &cluster,
|
375
|
-
size_t *restrict categ_counts, char *restrict is_outlier, double perc_next_most_comm)
|
472
|
+
size_t *restrict categ_counts, signed char *restrict is_outlier, double perc_next_most_comm)
|
376
473
|
{
|
377
474
|
size_t cnt_common = end - st + 1;
|
378
475
|
cluster.cluster_size = cnt_common;
|
@@ -474,7 +571,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
|
|
474
571
|
double max_perc_outliers, double z_norm, double z_outlier,
|
475
572
|
long double *restrict perc_threshold, long double *restrict prop_prior,
|
476
573
|
size_t *restrict buffer_categ_counts, long double *restrict buffer_categ_pct,
|
477
|
-
size_t *restrict buffer_categ_ix, char *restrict buffer_outliers,
|
574
|
+
size_t *restrict buffer_categ_ix, signed char *restrict buffer_outliers,
|
478
575
|
bool *restrict drop_cluster)
|
479
576
|
{
|
480
577
|
bool found_outliers, new_is_outlier;
|
@@ -567,7 +664,7 @@ bool define_categ_cluster(int *restrict x, size_t *restrict ix_arr, size_t st, s
|
|
567
664
|
|
568
665
|
cluster.perc_in_subset = (long double) buffer_categ_counts[cluster.categ_maj] / tot_dbl;
|
569
666
|
for (size_t cat = 0; cat < ncateg; cat++) {
|
570
|
-
if (cat == cluster.categ_maj)
|
667
|
+
if ((int)cat == cluster.categ_maj)
|
571
668
|
continue;
|
572
669
|
if (cluster.subset_common[cat] != 0) {
|
573
670
|
cluster.score_categ[cat] = (long double)(tot - buffer_categ_counts[cluster.categ_maj] + 1)
|
@@ -626,10 +723,10 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
|
|
626
723
|
} else {
|
627
724
|
|
628
725
|
size_subset_excl = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
|
629
|
-
[](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
|
726
|
+
[](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
|
630
727
|
if (size_subset_excl > 0) continue;
|
631
728
|
size_subset = std::accumulate(clusters[clust].split_subset.begin(), clusters[clust].split_subset.end(), (size_t)0,
|
632
|
-
[](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
|
729
|
+
[](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
|
633
730
|
if (size_subset == 1) {
|
634
731
|
|
635
732
|
do {col_equal++;} while (clusters[clust].split_subset[col_equal] <= 0);
|
@@ -681,6 +778,7 @@ void simplify_when_equal_cond(std::vector<Cluster> &clusters, int ncat_ord[])
|
|
681
778
|
break;
|
682
779
|
}
|
683
780
|
|
781
|
+
default: {}
|
684
782
|
}
|
685
783
|
|
686
784
|
}
|
@@ -711,7 +809,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
711
809
|
case Categorical:
|
712
810
|
{
|
713
811
|
size_subset_excl = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
|
714
|
-
[](const size_t a, const char b){return a + ((b < 0)? 1 : 0);});
|
812
|
+
[](const size_t a, const signed char b){return a + ((b < 0)? 1 : 0);});
|
715
813
|
if (size_subset_excl > 0) continue;
|
716
814
|
|
717
815
|
col_equal = -1;
|
@@ -747,6 +845,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
747
845
|
trees[tree].split_this_branch = Equal;
|
748
846
|
break;
|
749
847
|
}
|
848
|
+
|
849
|
+
default: {}
|
750
850
|
}
|
751
851
|
}
|
752
852
|
|
@@ -755,7 +855,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
755
855
|
else {
|
756
856
|
|
757
857
|
size_subset = std::accumulate(trees[tree].split_subset.begin(), trees[tree].split_subset.end(), (size_t)0,
|
758
|
-
[](const size_t a, const char b){return a + ((b > 0)? 1 : 0);});
|
858
|
+
[](const size_t a, const signed char b){return a + ((b > 0)? 1 : 0);});
|
759
859
|
if (size_subset == 1) {
|
760
860
|
|
761
861
|
do {col_equal++;} while (trees[tree].split_subset[col_equal] <= 0);
|
@@ -778,6 +878,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
778
878
|
trees[tree].split_this_branch = Equal;
|
779
879
|
break;
|
780
880
|
}
|
881
|
+
|
882
|
+
default: {}
|
781
883
|
}
|
782
884
|
}
|
783
885
|
|
@@ -801,6 +903,8 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
801
903
|
trees[tree].split_this_branch = Equal;
|
802
904
|
break;
|
803
905
|
}
|
906
|
+
|
907
|
+
default: {}
|
804
908
|
}
|
805
909
|
}
|
806
910
|
|
@@ -846,6 +950,7 @@ void simplify_when_equal_cond(std::vector<ClusterTree> &trees, int ncat_ord[])
|
|
846
950
|
break;
|
847
951
|
}
|
848
952
|
|
953
|
+
default: {}
|
849
954
|
}
|
850
955
|
|
851
956
|
}
|