outliertree 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/outliertree/ext.cpp +56 -41
- data/lib/outliertree/model.rb +13 -4
- data/lib/outliertree/result.rb +2 -2
- data/lib/outliertree/version.rb +1 -1
- data/vendor/outliertree/src/clusters.cpp +9 -10
- metadata +5 -9
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fd25a8154076c68420d1908a6dcb29cf0b9d1fec972cbc24063c3c4cf6de63e4
|
|
4
|
+
data.tar.gz: a834dcb5791ee8083d7ae8721cb22abfb468e123ae78940143a33df0d2ef0f98
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 0ab3d03adf97e689d188a91ceb86b77fbe69bbc45ca64426f84ae93360df2cdd395f5c1b66d7d8f371d1212c4a615c2466bdb8d041ef60d200d092967bbd3341
|
|
7
|
+
data.tar.gz: eb60aee1c0bb479db304491c804209e9c3ea51a5d089c5c05b90f1490c0190339b7a8f7c8b64c20b82ffafa7853f8f3cce2493647d95f0d86cbc890df4226987
|
data/CHANGELOG.md
CHANGED
data/ext/outliertree/ext.cpp
CHANGED
|
@@ -1,73 +1,83 @@
|
|
|
1
|
+
#include <complex>
|
|
2
|
+
#include <vector>
|
|
3
|
+
|
|
1
4
|
// outliertree
|
|
2
5
|
#include <outlier_tree.hpp>
|
|
3
6
|
|
|
7
|
+
// fix warning
|
|
8
|
+
#undef restrict
|
|
9
|
+
|
|
4
10
|
// rice
|
|
5
11
|
#include <rice/rice.hpp>
|
|
6
12
|
#include <rice/stl.hpp>
|
|
7
13
|
|
|
8
14
|
using Rice::Array;
|
|
9
15
|
using Rice::Hash;
|
|
10
|
-
using Rice::Module;
|
|
11
16
|
using Rice::Object;
|
|
12
17
|
using Rice::String;
|
|
13
18
|
using Rice::Symbol;
|
|
14
|
-
using Rice::define_class_under;
|
|
15
|
-
using Rice::define_module;
|
|
16
19
|
|
|
17
|
-
namespace Rice::detail
|
|
18
|
-
{
|
|
20
|
+
namespace Rice::detail {
|
|
19
21
|
template<typename T>
|
|
20
|
-
class To_Ruby<std::vector<T>>
|
|
21
|
-
{
|
|
22
|
+
class To_Ruby<std::vector<T>> {
|
|
22
23
|
public:
|
|
23
|
-
|
|
24
|
-
|
|
24
|
+
To_Ruby() = default;
|
|
25
|
+
|
|
26
|
+
explicit To_Ruby(Arg* arg) : arg_(arg) { }
|
|
27
|
+
|
|
28
|
+
VALUE convert(std::vector<T> const & x) {
|
|
25
29
|
auto a = rb_ary_new2(x.size());
|
|
26
30
|
for (const auto& v : x) {
|
|
27
31
|
rb_ary_push(a, To_Ruby<T>().convert(v));
|
|
28
32
|
}
|
|
29
33
|
return a;
|
|
30
34
|
}
|
|
35
|
+
|
|
36
|
+
private:
|
|
37
|
+
Arg* arg_ = nullptr;
|
|
31
38
|
};
|
|
32
39
|
|
|
33
40
|
template<>
|
|
34
|
-
class To_Ruby<std::vector<signed char>>
|
|
35
|
-
{
|
|
41
|
+
class To_Ruby<std::vector<signed char>> {
|
|
36
42
|
public:
|
|
37
|
-
|
|
38
|
-
|
|
43
|
+
To_Ruby() = default;
|
|
44
|
+
|
|
45
|
+
explicit To_Ruby(Arg* arg) : arg_(arg) { }
|
|
46
|
+
|
|
47
|
+
VALUE convert(std::vector<signed char> const & x) {
|
|
39
48
|
auto a = rb_ary_new2(x.size());
|
|
40
49
|
for (const auto& v : x) {
|
|
41
50
|
rb_ary_push(a, To_Ruby<signed char>().convert(v));
|
|
42
51
|
}
|
|
43
52
|
return a;
|
|
44
53
|
}
|
|
54
|
+
|
|
55
|
+
private:
|
|
56
|
+
Arg* arg_ = nullptr;
|
|
45
57
|
};
|
|
46
58
|
|
|
47
59
|
template<>
|
|
48
|
-
struct Type<std::vector<signed char>>
|
|
49
|
-
|
|
50
|
-
static bool verify()
|
|
51
|
-
{
|
|
60
|
+
struct Type<std::vector<signed char>> {
|
|
61
|
+
static bool verify() {
|
|
52
62
|
return true;
|
|
53
63
|
}
|
|
54
64
|
};
|
|
55
65
|
|
|
56
66
|
template<>
|
|
57
|
-
struct Type<ColType>
|
|
58
|
-
|
|
59
|
-
static bool verify()
|
|
60
|
-
{
|
|
67
|
+
struct Type<ColType> {
|
|
68
|
+
static bool verify() {
|
|
61
69
|
return true;
|
|
62
70
|
}
|
|
63
71
|
};
|
|
64
72
|
|
|
65
73
|
template<>
|
|
66
|
-
class To_Ruby<ColType>
|
|
67
|
-
{
|
|
74
|
+
class To_Ruby<ColType> {
|
|
68
75
|
public:
|
|
69
|
-
|
|
70
|
-
|
|
76
|
+
To_Ruby() = default;
|
|
77
|
+
|
|
78
|
+
explicit To_Ruby(Arg* arg) : arg_(arg) { }
|
|
79
|
+
|
|
80
|
+
VALUE convert(ColType const & x) {
|
|
71
81
|
switch (x) {
|
|
72
82
|
case Numeric: return Symbol("numeric");
|
|
73
83
|
case Categorical: return Symbol("categorical");
|
|
@@ -76,23 +86,26 @@ namespace Rice::detail
|
|
|
76
86
|
}
|
|
77
87
|
throw std::runtime_error("Unknown column type");
|
|
78
88
|
}
|
|
89
|
+
|
|
90
|
+
private:
|
|
91
|
+
Arg* arg_ = nullptr;
|
|
79
92
|
};
|
|
80
93
|
|
|
81
94
|
template<>
|
|
82
|
-
struct Type<SplitType>
|
|
83
|
-
|
|
84
|
-
static bool verify()
|
|
85
|
-
{
|
|
95
|
+
struct Type<SplitType> {
|
|
96
|
+
static bool verify() {
|
|
86
97
|
return true;
|
|
87
98
|
}
|
|
88
99
|
};
|
|
89
100
|
|
|
90
101
|
template<>
|
|
91
|
-
class To_Ruby<SplitType>
|
|
92
|
-
{
|
|
102
|
+
class To_Ruby<SplitType> {
|
|
93
103
|
public:
|
|
94
|
-
|
|
95
|
-
|
|
104
|
+
To_Ruby() = default;
|
|
105
|
+
|
|
106
|
+
explicit To_Ruby(Arg* arg) : arg_(arg) { }
|
|
107
|
+
|
|
108
|
+
VALUE convert(SplitType const & x) {
|
|
96
109
|
switch (x) {
|
|
97
110
|
case LessOrEqual: return Symbol("less_or_equal");
|
|
98
111
|
case Greater: return Symbol("greater");
|
|
@@ -107,16 +120,18 @@ namespace Rice::detail
|
|
|
107
120
|
}
|
|
108
121
|
throw std::runtime_error("Unknown split type");
|
|
109
122
|
}
|
|
123
|
+
|
|
124
|
+
private:
|
|
125
|
+
Arg* arg_ = nullptr;
|
|
110
126
|
};
|
|
111
|
-
}
|
|
127
|
+
} // namespace Rice::detail
|
|
112
128
|
|
|
113
129
|
extern "C"
|
|
114
|
-
void Init_ext()
|
|
115
|
-
|
|
116
|
-
Module
|
|
117
|
-
Module rb_mExt = define_module_under(rb_mOutlierTree, "Ext");
|
|
130
|
+
void Init_ext() {
|
|
131
|
+
Rice::Module rb_mOutlierTree = Rice::define_module("OutlierTree");
|
|
132
|
+
Rice::Module rb_mExt = Rice::define_module_under(rb_mOutlierTree, "Ext");
|
|
118
133
|
|
|
119
|
-
define_class_under<Cluster>(rb_mExt, "Cluster")
|
|
134
|
+
Rice::define_class_under<Cluster>(rb_mExt, "Cluster")
|
|
120
135
|
.define_method("upper_lim", [](Cluster& self) { return self.upper_lim; })
|
|
121
136
|
.define_method("display_lim_high", [](Cluster& self) { return self.display_lim_high; })
|
|
122
137
|
.define_method("perc_below", [](Cluster& self) { return self.perc_below; })
|
|
@@ -133,7 +148,7 @@ void Init_ext()
|
|
|
133
148
|
.define_method("has_na_branch", [](Cluster& self) { return self.has_NA_branch; })
|
|
134
149
|
.define_method("col_num", [](Cluster& self) { return self.col_num; });
|
|
135
150
|
|
|
136
|
-
define_class_under<ClusterTree>(rb_mExt, "ClusterTree")
|
|
151
|
+
Rice::define_class_under<ClusterTree>(rb_mExt, "ClusterTree")
|
|
137
152
|
.define_method("parent_branch", [](ClusterTree& self) { return self.parent_branch; })
|
|
138
153
|
.define_method("parent", [](ClusterTree& self) { return self.parent; })
|
|
139
154
|
.define_method("all_branches", [](ClusterTree& self) { return self.all_branches; })
|
|
@@ -143,7 +158,7 @@ void Init_ext()
|
|
|
143
158
|
.define_method("split_subset", [](ClusterTree& self) { return self.split_subset; })
|
|
144
159
|
.define_method("split_lev", [](ClusterTree& self) { return self.split_lev; });
|
|
145
160
|
|
|
146
|
-
define_class_under<ModelOutputs>(rb_mExt, "ModelOutputs")
|
|
161
|
+
Rice::define_class_under<ModelOutputs>(rb_mExt, "ModelOutputs")
|
|
147
162
|
.define_method("outlier_scores_final", [](ModelOutputs& self) { return self.outlier_scores_final; })
|
|
148
163
|
.define_method("outlier_columns_final", [](ModelOutputs& self) { return self.outlier_columns_final; })
|
|
149
164
|
.define_method("outlier_clusters_final", [](ModelOutputs& self) { return self.outlier_clusters_final; })
|
data/lib/outliertree/model.rb
CHANGED
|
@@ -1,11 +1,20 @@
|
|
|
1
1
|
module OutlierTree
|
|
2
2
|
class Model
|
|
3
3
|
def initialize(
|
|
4
|
-
max_depth: 4,
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
max_depth: 4,
|
|
5
|
+
min_gain: 0.01,
|
|
6
|
+
z_norm: 2.67,
|
|
7
|
+
z_outlier: 8.0,
|
|
8
|
+
pct_outliers: 0.01,
|
|
9
|
+
min_size_numeric: 25,
|
|
10
|
+
min_size_categ: 50,
|
|
11
|
+
categ_split: "binarize",
|
|
12
|
+
categ_outliers: "tail",
|
|
13
|
+
numeric_split: "raw",
|
|
14
|
+
follow_all: false,
|
|
15
|
+
gain_as_pct: true,
|
|
16
|
+
nthreads: -1
|
|
7
17
|
)
|
|
8
|
-
|
|
9
18
|
# TODO validate values
|
|
10
19
|
@max_depth = max_depth
|
|
11
20
|
@min_gain = min_gain
|
data/lib/outliertree/result.rb
CHANGED
|
@@ -80,7 +80,7 @@ module OutlierTree
|
|
|
80
80
|
column: column,
|
|
81
81
|
value: value,
|
|
82
82
|
conditions: conditions,
|
|
83
|
-
group_statistics: group_statistics
|
|
83
|
+
group_statistics: group_statistics
|
|
84
84
|
# leave out for simplicity
|
|
85
85
|
# score: score,
|
|
86
86
|
# tree_depth: model_outputs.outlier_depth_final[row],
|
|
@@ -128,7 +128,7 @@ module OutlierTree
|
|
|
128
128
|
column: cond_col,
|
|
129
129
|
comparison: colcond,
|
|
130
130
|
to: condval,
|
|
131
|
-
value: colval
|
|
131
|
+
value: colval
|
|
132
132
|
# leave out for simplicity
|
|
133
133
|
# decimals: coldecim
|
|
134
134
|
}
|
data/lib/outliertree/version.rb
CHANGED
|
@@ -173,8 +173,7 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
|
173
173
|
if ((!isinf(left_tail) || !isinf(right_tail)) && !is_log_transf && !is_exp_transf) {
|
|
174
174
|
sd *= 0.5;
|
|
175
175
|
}
|
|
176
|
-
|
|
177
|
-
sd = std::nextafter(sd, std::numeric_limits<double>::infinity());
|
|
176
|
+
sd = std::fmax(sd, std::numeric_limits<double>::epsilon() / std::fmin(min_gap, z_norm));
|
|
178
177
|
cluster.cluster_mean = mean;
|
|
179
178
|
cluster.cluster_sd = sd;
|
|
180
179
|
cnt = end - st + 1;
|
|
@@ -218,8 +217,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
|
218
217
|
cluster.display_lim_low = orig_x[ix_arr[row + 1]];
|
|
219
218
|
cluster.perc_above = (long double)(end - st_normals + 1) / (long double)(end - st + 1);
|
|
220
219
|
|
|
221
|
-
|
|
222
|
-
cluster.lower_lim = std::nextafter(cluster.
|
|
220
|
+
if (cluster.display_lim_low <= cluster.lower_lim) {
|
|
221
|
+
cluster.lower_lim = std::nextafter(cluster.display_lim_low, -std::numeric_limits<double>::infinity());
|
|
223
222
|
}
|
|
224
223
|
break;
|
|
225
224
|
}
|
|
@@ -292,8 +291,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
|
292
291
|
}
|
|
293
292
|
|
|
294
293
|
if (cluster.lower_lim > -HUGE_VAL) {
|
|
295
|
-
|
|
296
|
-
cluster.lower_lim = std::nextafter(
|
|
294
|
+
if (cluster.lower_lim >= orig_x[ix_arr[st]]) {
|
|
295
|
+
cluster.lower_lim = std::nextafter(orig_x[ix_arr[st]], -std::numeric_limits<double>::infinity());
|
|
297
296
|
}
|
|
298
297
|
}
|
|
299
298
|
|
|
@@ -343,8 +342,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
|
343
342
|
cluster.display_lim_high = orig_x[ix_arr[row - 1]];
|
|
344
343
|
cluster.perc_below = (long double)(end_normals - st + 1) / (long double)(end - st + 1);
|
|
345
344
|
|
|
346
|
-
|
|
347
|
-
cluster.upper_lim = std::nextafter(cluster.
|
|
345
|
+
if (cluster.display_lim_high >= cluster.upper_lim) {
|
|
346
|
+
cluster.upper_lim = std::nextafter(cluster.display_lim_high, -std::numeric_limits<double>::infinity());
|
|
348
347
|
}
|
|
349
348
|
break;
|
|
350
349
|
}
|
|
@@ -401,8 +400,8 @@ bool define_numerical_cluster(double *restrict x, size_t *restrict ix_arr, size_
|
|
|
401
400
|
}
|
|
402
401
|
|
|
403
402
|
if (cluster.upper_lim < HUGE_VAL) {
|
|
404
|
-
|
|
405
|
-
cluster.upper_lim = std::nextafter(
|
|
403
|
+
if (cluster.upper_lim <= orig_x[ix_arr[end]]) {
|
|
404
|
+
cluster.upper_lim = std::nextafter(orig_x[ix_arr[end]], std::numeric_limits<double>::infinity());
|
|
406
405
|
}
|
|
407
406
|
}
|
|
408
407
|
|
metadata
CHANGED
|
@@ -1,14 +1,13 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: outliertree
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.
|
|
4
|
+
version: 0.4.2
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
|
-
autorequire:
|
|
9
8
|
bindir: bin
|
|
10
9
|
cert_chain: []
|
|
11
|
-
date:
|
|
10
|
+
date: 1980-01-02 00:00:00.000000000 Z
|
|
12
11
|
dependencies:
|
|
13
12
|
- !ruby/object:Gem::Dependency
|
|
14
13
|
name: rice
|
|
@@ -16,15 +15,14 @@ dependencies:
|
|
|
16
15
|
requirements:
|
|
17
16
|
- - ">="
|
|
18
17
|
- !ruby/object:Gem::Version
|
|
19
|
-
version:
|
|
18
|
+
version: 4.3.3
|
|
20
19
|
type: :runtime
|
|
21
20
|
prerelease: false
|
|
22
21
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
22
|
requirements:
|
|
24
23
|
- - ">="
|
|
25
24
|
- !ruby/object:Gem::Version
|
|
26
|
-
version:
|
|
27
|
-
description:
|
|
25
|
+
version: 4.3.3
|
|
28
26
|
email: andrew@ankane.org
|
|
29
27
|
executables: []
|
|
30
28
|
extensions:
|
|
@@ -60,7 +58,6 @@ homepage: https://github.com/ankane/outliertree-ruby
|
|
|
60
58
|
licenses:
|
|
61
59
|
- GPL-3.0-or-later
|
|
62
60
|
metadata: {}
|
|
63
|
-
post_install_message:
|
|
64
61
|
rdoc_options: []
|
|
65
62
|
require_paths:
|
|
66
63
|
- lib
|
|
@@ -75,8 +72,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
75
72
|
- !ruby/object:Gem::Version
|
|
76
73
|
version: '0'
|
|
77
74
|
requirements: []
|
|
78
|
-
rubygems_version: 3.
|
|
79
|
-
signing_key:
|
|
75
|
+
rubygems_version: 3.6.9
|
|
80
76
|
specification_version: 4
|
|
81
77
|
summary: Explainable outlier/anomaly detection for Ruby
|
|
82
78
|
test_files: []
|