outliertree 0.2.1 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/NOTICE.txt +1 -1
- data/README.md +11 -10
- data/ext/outliertree/ext.cpp +23 -0
- data/ext/outliertree/extconf.rb +1 -1
- data/lib/outliertree/result.rb +3 -3
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/README.md +83 -41
- data/vendor/outliertree/src/Makevars.in +3 -0
- data/vendor/outliertree/src/Makevars.win +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +17 -27
- data/vendor/outliertree/src/Rwrapper.cpp +354 -62
- data/vendor/outliertree/src/cat_outlier.cpp +6 -6
- data/vendor/outliertree/src/clusters.cpp +114 -9
- data/vendor/outliertree/src/fit_model.cpp +525 -331
- data/vendor/outliertree/src/misc.cpp +166 -17
- data/vendor/outliertree/src/outlier_tree.hpp +164 -56
- data/vendor/outliertree/src/outliertree-win.def +3 -0
- data/vendor/outliertree/src/predict.cpp +33 -0
- data/vendor/outliertree/src/split.cpp +124 -20
- metadata +8 -6
- data/vendor/outliertree/src/Makevars +0 -3
@@ -36,12 +36,15 @@
|
|
36
36
|
|
37
37
|
/* TODO: don't divide the gains by tot at every calculation as it makes it slower */
|
38
38
|
|
39
|
-
/* TODO: sorting here is the slowest thing, so it could be improved by using radix sort for categorical/ordinal
|
39
|
+
/* TODO: sorting here is the slowest thing, so it could be improved by using radix sort for categorical/ordinal */
|
40
40
|
|
41
41
|
/* TODO: columns that split by numeric should output the sum/sum_sq to pass it to the cluster functions, instead of recalculating them later */
|
42
42
|
|
43
|
+
/* TODO: the calculations of standard deviations when splitting a numeric column by a categorical column are
|
44
|
+
highly imprecise and might throw negative variances. Should switch to a more robust procedure. */
|
43
45
|
|
44
|
-
|
46
|
+
|
47
|
+
void subset_to_onehot(size_t ix_arr[], size_t n_true, size_t n_tot, signed char onehot[])
|
45
48
|
{
|
46
49
|
memset(onehot, 0, sizeof(bool) * n_tot);
|
47
50
|
for (size_t i = 0; i <= n_true; i++) onehot[ix_arr[i]] = 1;
|
@@ -62,7 +65,7 @@ size_t move_zero_count_to_front(size_t *restrict cat_sorted, size_t *restrict ca
|
|
62
65
|
return st_cat;
|
63
66
|
}
|
64
67
|
|
65
|
-
void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x)
|
68
|
+
void flag_zero_counts(signed char split_subset[], size_t buffer_cat_cnt[], size_t ncat_x)
|
66
69
|
{
|
67
70
|
for (size_t cat = 0; cat < ncat_x; cat++)
|
68
71
|
if (buffer_cat_cnt[cat] == 0) split_subset[cat] = -1;
|
@@ -71,20 +74,20 @@ void flag_zero_counts(char split_subset[], size_t buffer_cat_cnt[], size_t ncat_
|
|
71
74
|
long double calc_sd(size_t cnt, long double sum, long double sum_sq)
|
72
75
|
{
|
73
76
|
if (cnt < 3) return 0;
|
74
|
-
return
|
77
|
+
return std::sqrt( (sum_sq - (square(sum) / (long double) cnt) + SD_REG) / (long double) (cnt - 1) );
|
75
78
|
}
|
76
79
|
|
77
80
|
long double calc_sd(NumericBranch &branch)
|
78
81
|
{
|
79
82
|
if (branch.cnt < 3) return 0;
|
80
|
-
return
|
83
|
+
return std::sqrt((branch.sum_sq - (square(branch.sum) / (long double) branch.cnt) + SD_REG) / (long double) (branch.cnt - 1));
|
81
84
|
}
|
82
85
|
|
83
86
|
long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end, double *restrict mean)
|
84
87
|
{
|
85
88
|
long double running_mean = 0;
|
86
|
-
long double mean_prev = 0;
|
87
89
|
long double running_ssq = 0;
|
90
|
+
long double mean_prev = x[ix_arr[st]];
|
88
91
|
double xval;
|
89
92
|
for (size_t row = st; row <= end; row++) {
|
90
93
|
xval = x[ix_arr[row]];
|
@@ -93,7 +96,7 @@ long double calc_sd(size_t ix_arr[], double *restrict x, size_t st, size_t end,
|
|
93
96
|
mean_prev = running_mean;
|
94
97
|
}
|
95
98
|
*mean = (double) running_mean;
|
96
|
-
return
|
99
|
+
return std::sqrt(running_ssq / (long double)(end - st));
|
97
100
|
|
98
101
|
}
|
99
102
|
|
@@ -242,11 +245,13 @@ long double categ_gain_from_split(size_t *restrict ix_arr, int *restrict x, size
|
|
242
245
|
* - split_left (out)
|
243
246
|
* Index at which the data is split between the two branches (includes last from left branch).
|
244
247
|
* - split_NA (out)
|
245
|
-
* Index at which the NA data is separated from the other branches
|
248
|
+
* Index at which the NA data is separated from the other branches.
|
249
|
+
* - has_zero_variance (out)
|
250
|
+
* Whether the 'x' column has zero variance (contains only one unique value).
|
246
251
|
*/
|
247
252
|
void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, double *restrict y,
|
248
253
|
long double sd_y, bool has_na, size_t min_size, bool take_mid, long double *restrict buffer_sd,
|
249
|
-
long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA)
|
254
|
+
long double *restrict gain, double *restrict split_point, size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance)
|
250
255
|
{
|
251
256
|
|
252
257
|
*gain = -HUGE_VAL;
|
@@ -255,11 +260,12 @@ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, dou
|
|
255
260
|
long double this_gain;
|
256
261
|
long double cnt_dbl = (long double)(end - st + 1);
|
257
262
|
long double running_mean = 0;
|
258
|
-
long double mean_prev = 0;
|
259
263
|
long double running_ssq = 0;
|
264
|
+
long double mean_prev = 0;
|
260
265
|
double xval;
|
261
266
|
long double info_left;
|
262
267
|
long double info_NA = 0;
|
268
|
+
*has_zero_variance = false;
|
263
269
|
|
264
270
|
/* check that there are enough observations for a split */
|
265
271
|
if ((end - st + 1) < (2 * min_size)) return;
|
@@ -281,8 +287,13 @@ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, dou
|
|
281
287
|
|
282
288
|
/* sort the remaining non-NA values in ascending order */
|
283
289
|
std::sort(ix_arr + st_non_na, ix_arr + end + 1, [&x](const size_t a, const size_t b){return x[a] < x[b];});
|
290
|
+
if (x[ix_arr[st_non_na]] == x[ix_arr[end]]) {
|
291
|
+
*has_zero_variance = true;
|
292
|
+
return;
|
293
|
+
}
|
284
294
|
|
285
295
|
/* calculate SD*N backwards first, then forwards */
|
296
|
+
mean_prev = y[ix_arr[end]];
|
286
297
|
for (size_t i = end; i >= st_non_na; i--) {
|
287
298
|
xval = y[ix_arr[i]];
|
288
299
|
running_mean += (xval - running_mean) / (long double)(end - i + 1);
|
@@ -297,7 +308,7 @@ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, dou
|
|
297
308
|
/* look for the best split point, by moving one observation at a time to the left branch*/
|
298
309
|
running_mean = 0;
|
299
310
|
running_ssq = 0;
|
300
|
-
mean_prev =
|
311
|
+
mean_prev = y[ix_arr[st_non_na]];
|
301
312
|
for (size_t i = st_non_na; i <= (end - min_size); i++) {
|
302
313
|
xval = y[ix_arr[i]];
|
303
314
|
running_mean += (xval - running_mean) / (long double)(i - st_non_na + 1);
|
@@ -366,12 +377,16 @@ void split_numericx_numericy(size_t *restrict ix_arr, size_t st, size_t end, dou
|
|
366
377
|
* Array that will indicate which categories go into the left branch in the chosen split.
|
367
378
|
* (value of 1 means it's on the left branch, 0 in the right branch, -1 not applicable)
|
368
379
|
* - split_point (out)
|
369
|
-
* Split level for ordinal X variables (left branch is <= this)
|
380
|
+
* Split level for ordinal X variables (left branch is <= this).
|
381
|
+
* - has_zero_variance (out)
|
382
|
+
* Whether the 'x' column has zero variance (contains only one unique value).
|
383
|
+
* - binary_split (out)
|
384
|
+
* Whether the produced split is binary (single category at each branch).
|
370
385
|
*/
|
371
386
|
void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, double *restrict y, long double sd_y, double ymean,
|
372
387
|
bool x_is_ordinal, size_t ncat_x, size_t *restrict buffer_cat_cnt, long double *restrict buffer_cat_sum,
|
373
388
|
long double *restrict buffer_cat_sum_sq, size_t *restrict buffer_cat_sorted,
|
374
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset, int *restrict split_point)
|
389
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset, int *restrict split_point, bool *restrict has_zero_variance, bool *restrict binary_split)
|
375
390
|
{
|
376
391
|
|
377
392
|
/* output parameters and variables to use */
|
@@ -380,9 +395,11 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
|
|
380
395
|
NumericSplit split_info;
|
381
396
|
size_t st_cat = 0;
|
382
397
|
double sd_y_d = (double) sd_y;
|
398
|
+
*has_zero_variance = false;
|
399
|
+
*binary_split = false;
|
383
400
|
|
384
401
|
/* reset the buffers */
|
385
|
-
memset(split_subset, 0, sizeof(char) * ncat_x);
|
402
|
+
memset(split_subset, 0, sizeof(signed char) * ncat_x);
|
386
403
|
memset(buffer_cat_cnt, 0, sizeof(size_t) * (ncat_x + 1));
|
387
404
|
memset(buffer_cat_sum, 0, sizeof(long double) * (ncat_x + 1));
|
388
405
|
memset(buffer_cat_sum_sq, 0, sizeof(long double) * (ncat_x + 1));
|
@@ -415,6 +432,16 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
|
|
415
432
|
|
416
433
|
}
|
417
434
|
|
435
|
+
int n_unique_cat = 0;
|
436
|
+
for (size_t cat = 0; cat < ncat_x; cat++) {
|
437
|
+
n_unique_cat += buffer_cat_sum_sq[cat] > 0;
|
438
|
+
if (n_unique_cat >= 2) break;
|
439
|
+
}
|
440
|
+
if (n_unique_cat <= 1) {
|
441
|
+
*has_zero_variance = true;
|
442
|
+
return;
|
443
|
+
}
|
444
|
+
|
418
445
|
/* set NAs to their own branch */
|
419
446
|
if (buffer_cat_cnt[ncat_x] > 0) {
|
420
447
|
split_info.NA_branch = {buffer_cat_cnt[ncat_x], buffer_cat_sum[ncat_x], buffer_cat_sum_sq[ncat_x]};
|
@@ -430,6 +457,8 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
|
|
430
457
|
split_info.right_branch = {buffer_cat_cnt[1], buffer_cat_sum[1], buffer_cat_sum_sq[1]};
|
431
458
|
*gain = numeric_gain(split_info, 1.0) * sd_y;
|
432
459
|
split_subset[0] = 1;
|
460
|
+
|
461
|
+
*binary_split = true;
|
433
462
|
}
|
434
463
|
|
435
464
|
/* subset and ordinal splits */
|
@@ -443,7 +472,7 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
|
|
443
472
|
}
|
444
473
|
|
445
474
|
/* if it's an ordinal variable, must respect the order */
|
446
|
-
|
475
|
+
std::iota(buffer_cat_sorted, buffer_cat_sorted + ncat_x, (size_t)0);
|
447
476
|
|
448
477
|
if (!x_is_ordinal) {
|
449
478
|
/* otherwise, sort the categories according to their mean of y */
|
@@ -458,6 +487,10 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
|
|
458
487
|
return (buffer_cat_sum[a] / (long double) buffer_cat_cnt[a]) >
|
459
488
|
(buffer_cat_sum[b] / (long double) buffer_cat_cnt[b]);
|
460
489
|
});
|
490
|
+
|
491
|
+
if (ncat_x - st_cat == 2) {
|
492
|
+
*binary_split = true;
|
493
|
+
}
|
461
494
|
}
|
462
495
|
|
463
496
|
/* try moving each category to the left branch in the given order */
|
@@ -530,11 +563,13 @@ void split_categx_numericy(size_t *restrict ix_arr, size_t st, size_t end, int *
|
|
530
563
|
* Index at which the data is split between the two branches (includes last from left branch).
|
531
564
|
* - split_NA (out)
|
532
565
|
* Index at which the NA data is separated from the other branches
|
566
|
+
* - has_zero_variance (out)
|
567
|
+
* Whether the 'x' column has zero variance (contains only one unique value).
|
533
568
|
*/
|
534
569
|
void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, double *restrict x, int *restrict y,
|
535
570
|
size_t ncat_y, long double base_info, size_t *restrict buffer_cat_cnt,
|
536
571
|
bool has_na, size_t min_size, bool take_mid, long double *restrict gain, double *restrict split_point,
|
537
|
-
size_t *restrict split_left, size_t *restrict split_NA)
|
572
|
+
size_t *restrict split_left, size_t *restrict split_NA, bool *restrict has_zero_variance)
|
538
573
|
{
|
539
574
|
*gain = -HUGE_VAL;
|
540
575
|
*split_point = -HUGE_VAL;
|
@@ -543,6 +578,7 @@ void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, doubl
|
|
543
578
|
CategSplit split_info;
|
544
579
|
split_info.ncat = ncat_y;
|
545
580
|
split_info.tot = end - st + 1;
|
581
|
+
*has_zero_variance = false;
|
546
582
|
|
547
583
|
/* check that there are enough observations for a split */
|
548
584
|
if ((end - st + 1) < (2 * min_size)) return;
|
@@ -571,6 +607,10 @@ void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, doubl
|
|
571
607
|
|
572
608
|
/* sort the remaining non-NA values in ascending order */
|
573
609
|
std::sort(ix_arr + st_non_na, ix_arr + end + 1, [&x](const size_t a, const size_t b){return x[a] < x[b];});
|
610
|
+
if (x[ix_arr[st_non_na]] == x[ix_arr[end]]) {
|
611
|
+
*has_zero_variance = true;
|
612
|
+
return;
|
613
|
+
}
|
574
614
|
|
575
615
|
/* put all observations on the right branch */
|
576
616
|
for (size_t i = st_non_na; i <= end; i++) split_info.right_branch[ y[ix_arr[i]] ]++;
|
@@ -638,11 +678,16 @@ void split_numericx_categy(size_t *restrict ix_arr, size_t st, size_t end, doubl
|
|
638
678
|
* Gain calculated on the best split found. If no split is possible, will return -Inf.
|
639
679
|
* - split_point (out)
|
640
680
|
* Threshold for splitting on values of 'x'. If no split is possible, will return -1.
|
681
|
+
* - has_zero_variance (out)
|
682
|
+
* Whether the 'x' column has zero variance (contains only one unique value).
|
683
|
+
* - binary_split (out)
|
684
|
+
* Whether the produced split is binary (single category at each branch).
|
641
685
|
*/
|
642
686
|
void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
643
687
|
size_t ncat_y, size_t ncat_x, long double base_info,
|
644
688
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_ord_cnt,
|
645
|
-
bool has_na, size_t min_size, long double *gain, int *split_point
|
689
|
+
bool has_na, size_t min_size, long double *gain, int *split_point,
|
690
|
+
bool *restrict has_zero_variance, bool *restrict binary_split)
|
646
691
|
{
|
647
692
|
*gain = -HUGE_VAL;
|
648
693
|
*split_point = -1;
|
@@ -651,6 +696,8 @@ void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *rest
|
|
651
696
|
CategSplit split_info;
|
652
697
|
split_info.ncat = ncat_y;
|
653
698
|
split_info.tot = end - st + 1;
|
699
|
+
*has_zero_variance = false;
|
700
|
+
*binary_split = false;
|
654
701
|
|
655
702
|
/* check that there are enough observations for a split */
|
656
703
|
if ((end - st + 1) < (2 * min_size)) return;
|
@@ -687,6 +734,19 @@ void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *rest
|
|
687
734
|
split_info.size_right = end - st_non_na + 1;
|
688
735
|
split_info.size_left = 0;
|
689
736
|
|
737
|
+
int n_unique_cat = 0;
|
738
|
+
for (size_t cat = 0; cat < ncat_x; cat++) {
|
739
|
+
n_unique_cat += buffer_ord_cnt[cat] > 0;
|
740
|
+
if (n_unique_cat >= 3) break;
|
741
|
+
}
|
742
|
+
if (n_unique_cat <= 1) {
|
743
|
+
*has_zero_variance = true;
|
744
|
+
return;
|
745
|
+
}
|
746
|
+
if (n_unique_cat == 2) {
|
747
|
+
*binary_split = true;
|
748
|
+
}
|
749
|
+
|
690
750
|
/* look for the best split point, by moving one observation at a time to the left branch*/
|
691
751
|
for (size_t ord_cat = 0; ord_cat < (ncat_x - 1); ord_cat++) {
|
692
752
|
|
@@ -749,11 +809,16 @@ void split_ordx_categy(size_t *restrict ix_arr, size_t st, size_t end, int *rest
|
|
749
809
|
* - split_subset[ncat_x] (out)
|
750
810
|
* Array that will indicate which categories go into the left branch in the chosen split.
|
751
811
|
* (value of 1 means it's on the left branch, 0 in the right branch, -1 not applicable)
|
812
|
+
* - has_zero_variance (out)
|
813
|
+
* Whether the 'x' column has zero variance (contains only one unique value).
|
814
|
+
* - binary_split (out)
|
815
|
+
* Whether the produced split is binary (single category at each branch).
|
752
816
|
*/
|
753
817
|
void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
754
818
|
size_t ncat_x, long double base_info,
|
755
819
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_cat_sorted,
|
756
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset
|
820
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
|
821
|
+
bool *restrict has_zero_variance, bool *restrict binary_split)
|
757
822
|
{
|
758
823
|
*gain = -HUGE_VAL;
|
759
824
|
size_t st_non_na;
|
@@ -763,6 +828,8 @@ void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *rest
|
|
763
828
|
size_t st_cat;
|
764
829
|
split_info.ncat = 2;
|
765
830
|
split_info.tot = end - st + 1;
|
831
|
+
*has_zero_variance = false;
|
832
|
+
*binary_split = false;
|
766
833
|
|
767
834
|
/* check that there are enough observations for a split */
|
768
835
|
if ((end - st + 1) < (2 * min_size)) return;
|
@@ -798,8 +865,18 @@ void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *rest
|
|
798
865
|
split_info.size_right = end - st_non_na + 1;
|
799
866
|
split_info.size_left = 0;
|
800
867
|
|
868
|
+
int n_unique_cat = 0;
|
869
|
+
for (size_t cat = 0; cat < ncat_x; cat++) {
|
870
|
+
n_unique_cat += buffer_cat_cnt[cat] > 0;
|
871
|
+
if (n_unique_cat >= 2) break;
|
872
|
+
}
|
873
|
+
if (n_unique_cat <= 1) {
|
874
|
+
*has_zero_variance = true;
|
875
|
+
return;
|
876
|
+
}
|
877
|
+
|
801
878
|
/* sort the categories according to their mean of y */
|
802
|
-
|
879
|
+
std::iota(buffer_cat_sorted, buffer_cat_sorted + ncat_x, (size_t)0);
|
803
880
|
st_cat = move_zero_count_to_front(buffer_cat_sorted, buffer_cat_cnt, ncat_x);
|
804
881
|
std::sort(buffer_cat_sorted + st_cat, buffer_cat_sorted + ncat_x,
|
805
882
|
[&buffer_crosstab, &buffer_cat_cnt](const size_t a, const size_t b)
|
@@ -807,6 +884,9 @@ void split_categx_biny(size_t *restrict ix_arr, size_t st, size_t end, int *rest
|
|
807
884
|
return ((long double) buffer_crosstab[2 * a] / (long double) buffer_cat_cnt[a]) >
|
808
885
|
((long double) buffer_crosstab[2 * b] / (long double) buffer_cat_cnt[b]);
|
809
886
|
});
|
887
|
+
if (ncat_x - st_cat == 2) {
|
888
|
+
*binary_split = true;
|
889
|
+
}
|
810
890
|
|
811
891
|
/* look for the best split subset, by moving one category at a time to the left branch*/
|
812
892
|
for (size_t cat = st_cat; cat < (ncat_x - 1); cat++) {
|
@@ -954,11 +1034,16 @@ void split_categx_categy_separate(size_t *restrict ix_arr, size_t st, size_t end
|
|
954
1034
|
* - split_subset[ncat_x] (out)
|
955
1035
|
* Array that will indicate which categories go into the left branch in the chosen split.
|
956
1036
|
* (value of 1 means it's on the left branch, 0 in the right branch, -1 not applicable)
|
1037
|
+
* - has_zero_variance (out)
|
1038
|
+
* Whether the 'x' column has zero variance (contains only one unique value).
|
1039
|
+
* - binary_split (out)
|
1040
|
+
* Whether the produced split is binary (single category at each branch).
|
957
1041
|
*/
|
958
1042
|
void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end, int *restrict x, int *restrict y,
|
959
1043
|
size_t ncat_x, size_t ncat_y, long double base_info,
|
960
1044
|
size_t *restrict buffer_cat_cnt, size_t *restrict buffer_crosstab, size_t *restrict buffer_split,
|
961
|
-
bool has_na, size_t min_size, long double *gain, char *restrict split_subset
|
1045
|
+
bool has_na, size_t min_size, long double *gain, signed char *restrict split_subset,
|
1046
|
+
bool *restrict has_zero_variance, bool *restrict binary_split)
|
962
1047
|
{
|
963
1048
|
*gain = -HUGE_VAL;
|
964
1049
|
long double this_gain;
|
@@ -967,6 +1052,8 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
|
|
967
1052
|
split_info.tot = end - st + 1;
|
968
1053
|
split_info.ncat = ncat_y;
|
969
1054
|
size_t st_non_na;
|
1055
|
+
*has_zero_variance = false;
|
1056
|
+
*binary_split = false;
|
970
1057
|
|
971
1058
|
/* will divide into 3 branches: NA, within subset, outside subset */
|
972
1059
|
memset(buffer_split, 0, 3 * ncat_y * sizeof(size_t));
|
@@ -993,6 +1080,19 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
|
|
993
1080
|
}
|
994
1081
|
}
|
995
1082
|
|
1083
|
+
int n_unique_cat = 0;
|
1084
|
+
for (size_t cat = 0; cat < ncat_x; cat++) {
|
1085
|
+
n_unique_cat += buffer_cat_cnt[cat] > 0;
|
1086
|
+
if (n_unique_cat >= 3) break;
|
1087
|
+
}
|
1088
|
+
if (n_unique_cat <= 1) {
|
1089
|
+
*has_zero_variance = true;
|
1090
|
+
return;
|
1091
|
+
}
|
1092
|
+
if (n_unique_cat == 2) {
|
1093
|
+
*binary_split = true;
|
1094
|
+
}
|
1095
|
+
|
996
1096
|
/* put all categories on the right branch */
|
997
1097
|
memset(split_info.left_branch, 0, ncat_y * sizeof(size_t));
|
998
1098
|
memset(split_info.right_branch, 0, ncat_y * sizeof(size_t));
|
@@ -1012,6 +1112,10 @@ void split_categx_categy_subset(size_t *restrict ix_arr, size_t st, size_t end,
|
|
1012
1112
|
size_t last_bit;
|
1013
1113
|
size_t ncomb = pow2(ncat_x) - 1;
|
1014
1114
|
|
1115
|
+
/* TODO: this is highly inefficient:
|
1116
|
+
- categories with zero count can be discarded beforehand.
|
1117
|
+
- could use C++ next_permutation instead. */
|
1118
|
+
|
1015
1119
|
/* iteration is done by putting a category in the left branch if the bit at its
|
1016
1120
|
position in the binary representation of the combination number is a 1 */
|
1017
1121
|
/* TODO: this would be faster with a depth-first search routine */
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: outliertree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.3.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2023-12-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -44,7 +44,8 @@ files:
|
|
44
44
|
- lib/outliertree/version.rb
|
45
45
|
- vendor/outliertree/LICENSE
|
46
46
|
- vendor/outliertree/README.md
|
47
|
-
- vendor/outliertree/src/Makevars
|
47
|
+
- vendor/outliertree/src/Makevars.in
|
48
|
+
- vendor/outliertree/src/Makevars.win
|
48
49
|
- vendor/outliertree/src/RcppExports.cpp
|
49
50
|
- vendor/outliertree/src/Rwrapper.cpp
|
50
51
|
- vendor/outliertree/src/cat_outlier.cpp
|
@@ -52,9 +53,10 @@ files:
|
|
52
53
|
- vendor/outliertree/src/fit_model.cpp
|
53
54
|
- vendor/outliertree/src/misc.cpp
|
54
55
|
- vendor/outliertree/src/outlier_tree.hpp
|
56
|
+
- vendor/outliertree/src/outliertree-win.def
|
55
57
|
- vendor/outliertree/src/predict.cpp
|
56
58
|
- vendor/outliertree/src/split.cpp
|
57
|
-
homepage: https://github.com/ankane/outliertree
|
59
|
+
homepage: https://github.com/ankane/outliertree-ruby
|
58
60
|
licenses:
|
59
61
|
- GPL-3.0-or-later
|
60
62
|
metadata: {}
|
@@ -66,14 +68,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
66
68
|
requirements:
|
67
69
|
- - ">="
|
68
70
|
- !ruby/object:Gem::Version
|
69
|
-
version: '2.
|
71
|
+
version: '2.7'
|
70
72
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
71
73
|
requirements:
|
72
74
|
- - ">="
|
73
75
|
- !ruby/object:Gem::Version
|
74
76
|
version: '0'
|
75
77
|
requirements: []
|
76
|
-
rubygems_version: 3.
|
78
|
+
rubygems_version: 3.4.10
|
77
79
|
signing_key:
|
78
80
|
specification_version: 4
|
79
81
|
summary: Explainable outlier/anomaly detection for Ruby
|