outliertree 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/NOTICE.txt +1 -1
- data/ext/outliertree/ext.cpp +89 -102
- data/lib/outliertree/model.rb +13 -4
- data/lib/outliertree/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ef5a7a99c49ab6a263bfac8293d85a914fb21794566245737640001315d34cc8
|
|
4
|
+
data.tar.gz: 29a5aec79b8e24912b422c279fed5ca58de62775296ee1aaad6af2b0fdbbad7a
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 4b67ff777a77cbd3b01617583deed6c2a033e28b9d7f57721d201e1adc5d72a3ec5ccaa6e4f41337dd607f594dd3a9a1de8ace4ffd55726bcf92b18631bc70cd
|
|
7
|
+
data.tar.gz: 020d5831ea8abedbcc25c130ea7ed2b009d9126ca64e61ecc27cf051cbe451ec5bd6ec01e0b606c7ff834d252119a19da80a199ecab1fab549caa5958f00ae88
|
data/CHANGELOG.md
CHANGED
data/NOTICE.txt
CHANGED
data/ext/outliertree/ext.cpp
CHANGED
|
@@ -1,73 +1,55 @@
|
|
|
1
|
+
#include <cstddef>
|
|
2
|
+
#include <stdexcept>
|
|
3
|
+
#include <vector>
|
|
4
|
+
|
|
1
5
|
// outliertree
|
|
2
6
|
#include <outlier_tree.hpp>
|
|
3
7
|
|
|
8
|
+
// fix warning
|
|
9
|
+
#undef restrict
|
|
10
|
+
|
|
4
11
|
// rice
|
|
5
12
|
#include <rice/rice.hpp>
|
|
6
|
-
#include <rice/stl.hpp>
|
|
7
13
|
|
|
8
|
-
using Rice::Array;
|
|
9
14
|
using Rice::Hash;
|
|
10
|
-
using Rice::Module;
|
|
11
|
-
using Rice::Object;
|
|
12
15
|
using Rice::String;
|
|
13
16
|
using Rice::Symbol;
|
|
14
|
-
using Rice::define_class_under;
|
|
15
|
-
using Rice::define_module;
|
|
16
17
|
|
|
17
|
-
namespace Rice::detail
|
|
18
|
-
{
|
|
18
|
+
namespace Rice::detail {
|
|
19
19
|
template<typename T>
|
|
20
|
-
class To_Ruby<std::vector<T>>
|
|
21
|
-
{
|
|
20
|
+
class To_Ruby<std::vector<T>> {
|
|
22
21
|
public:
|
|
23
|
-
|
|
24
|
-
{
|
|
25
|
-
auto a = rb_ary_new2(x.size());
|
|
26
|
-
for (const auto& v : x) {
|
|
27
|
-
rb_ary_push(a, To_Ruby<T>().convert(v));
|
|
28
|
-
}
|
|
29
|
-
return a;
|
|
30
|
-
}
|
|
31
|
-
};
|
|
22
|
+
To_Ruby() = default;
|
|
32
23
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
VALUE convert(std::vector<signed char> const & x)
|
|
38
|
-
{
|
|
39
|
-
auto a = rb_ary_new2(x.size());
|
|
24
|
+
explicit To_Ruby(Arg* arg) : arg_(arg) { }
|
|
25
|
+
|
|
26
|
+
VALUE convert(const std::vector<T>& x) {
|
|
27
|
+
auto a = detail::protect(rb_ary_new2, x.size());
|
|
40
28
|
for (const auto& v : x) {
|
|
41
|
-
rb_ary_push
|
|
29
|
+
detail::protect(rb_ary_push, a, To_Ruby<T>().convert(v));
|
|
42
30
|
}
|
|
43
31
|
return a;
|
|
44
32
|
}
|
|
45
|
-
};
|
|
46
33
|
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
{
|
|
50
|
-
static bool verify()
|
|
51
|
-
{
|
|
52
|
-
return true;
|
|
53
|
-
}
|
|
34
|
+
private:
|
|
35
|
+
Arg* arg_ = nullptr;
|
|
54
36
|
};
|
|
55
37
|
|
|
56
38
|
template<>
|
|
57
|
-
struct Type<ColType>
|
|
58
|
-
|
|
59
|
-
static bool verify()
|
|
60
|
-
{
|
|
39
|
+
struct Type<ColType> {
|
|
40
|
+
static bool verify() {
|
|
61
41
|
return true;
|
|
62
42
|
}
|
|
63
43
|
};
|
|
64
44
|
|
|
65
45
|
template<>
|
|
66
|
-
class To_Ruby<ColType>
|
|
67
|
-
{
|
|
46
|
+
class To_Ruby<ColType> {
|
|
68
47
|
public:
|
|
69
|
-
|
|
70
|
-
|
|
48
|
+
To_Ruby() = default;
|
|
49
|
+
|
|
50
|
+
explicit To_Ruby(Arg* arg) : arg_(arg) { }
|
|
51
|
+
|
|
52
|
+
VALUE convert(ColType const & x) {
|
|
71
53
|
switch (x) {
|
|
72
54
|
case Numeric: return Symbol("numeric");
|
|
73
55
|
case Categorical: return Symbol("categorical");
|
|
@@ -76,23 +58,26 @@ namespace Rice::detail
|
|
|
76
58
|
}
|
|
77
59
|
throw std::runtime_error("Unknown column type");
|
|
78
60
|
}
|
|
61
|
+
|
|
62
|
+
private:
|
|
63
|
+
Arg* arg_ = nullptr;
|
|
79
64
|
};
|
|
80
65
|
|
|
81
66
|
template<>
|
|
82
|
-
struct Type<SplitType>
|
|
83
|
-
|
|
84
|
-
static bool verify()
|
|
85
|
-
{
|
|
67
|
+
struct Type<SplitType> {
|
|
68
|
+
static bool verify() {
|
|
86
69
|
return true;
|
|
87
70
|
}
|
|
88
71
|
};
|
|
89
72
|
|
|
90
73
|
template<>
|
|
91
|
-
class To_Ruby<SplitType>
|
|
92
|
-
{
|
|
74
|
+
class To_Ruby<SplitType> {
|
|
93
75
|
public:
|
|
94
|
-
|
|
95
|
-
|
|
76
|
+
To_Ruby() = default;
|
|
77
|
+
|
|
78
|
+
explicit To_Ruby(Arg* arg) : arg_(arg) { }
|
|
79
|
+
|
|
80
|
+
VALUE convert(SplitType const & x) {
|
|
96
81
|
switch (x) {
|
|
97
82
|
case LessOrEqual: return Symbol("less_or_equal");
|
|
98
83
|
case Greater: return Symbol("greater");
|
|
@@ -107,16 +92,18 @@ namespace Rice::detail
|
|
|
107
92
|
}
|
|
108
93
|
throw std::runtime_error("Unknown split type");
|
|
109
94
|
}
|
|
95
|
+
|
|
96
|
+
private:
|
|
97
|
+
Arg* arg_ = nullptr;
|
|
110
98
|
};
|
|
111
|
-
}
|
|
99
|
+
} // namespace Rice::detail
|
|
112
100
|
|
|
113
101
|
extern "C"
|
|
114
|
-
void Init_ext()
|
|
115
|
-
|
|
116
|
-
Module
|
|
117
|
-
Module rb_mExt = define_module_under(rb_mOutlierTree, "Ext");
|
|
102
|
+
void Init_ext() {
|
|
103
|
+
Rice::Module rb_mOutlierTree = Rice::define_module("OutlierTree");
|
|
104
|
+
Rice::Module rb_mExt = Rice::define_module_under(rb_mOutlierTree, "Ext");
|
|
118
105
|
|
|
119
|
-
define_class_under<Cluster>(rb_mExt, "Cluster")
|
|
106
|
+
Rice::define_class_under<Cluster>(rb_mExt, "Cluster")
|
|
120
107
|
.define_method("upper_lim", [](Cluster& self) { return self.upper_lim; })
|
|
121
108
|
.define_method("display_lim_high", [](Cluster& self) { return self.display_lim_high; })
|
|
122
109
|
.define_method("perc_below", [](Cluster& self) { return self.perc_below; })
|
|
@@ -133,7 +120,7 @@ void Init_ext()
|
|
|
133
120
|
.define_method("has_na_branch", [](Cluster& self) { return self.has_NA_branch; })
|
|
134
121
|
.define_method("col_num", [](Cluster& self) { return self.col_num; });
|
|
135
122
|
|
|
136
|
-
define_class_under<ClusterTree>(rb_mExt, "ClusterTree")
|
|
123
|
+
Rice::define_class_under<ClusterTree>(rb_mExt, "ClusterTree")
|
|
137
124
|
.define_method("parent_branch", [](ClusterTree& self) { return self.parent_branch; })
|
|
138
125
|
.define_method("parent", [](ClusterTree& self) { return self.parent; })
|
|
139
126
|
.define_method("all_branches", [](ClusterTree& self) { return self.all_branches; })
|
|
@@ -143,7 +130,7 @@ void Init_ext()
|
|
|
143
130
|
.define_method("split_subset", [](ClusterTree& self) { return self.split_subset; })
|
|
144
131
|
.define_method("split_lev", [](ClusterTree& self) { return self.split_lev; });
|
|
145
132
|
|
|
146
|
-
define_class_under<ModelOutputs>(rb_mExt, "ModelOutputs")
|
|
133
|
+
Rice::define_class_under<ModelOutputs>(rb_mExt, "ModelOutputs")
|
|
147
134
|
.define_method("outlier_scores_final", [](ModelOutputs& self) { return self.outlier_scores_final; })
|
|
148
135
|
.define_method("outlier_columns_final", [](ModelOutputs& self) { return self.outlier_columns_final; })
|
|
149
136
|
.define_method("outlier_clusters_final", [](ModelOutputs& self) { return self.outlier_clusters_final; })
|
|
@@ -154,12 +141,12 @@ void Init_ext()
|
|
|
154
141
|
.define_method(
|
|
155
142
|
"all_clusters",
|
|
156
143
|
[](ModelOutputs& self, size_t i, size_t j) {
|
|
157
|
-
return self.all_clusters
|
|
144
|
+
return self.all_clusters.at(i).at(j);
|
|
158
145
|
})
|
|
159
146
|
.define_method(
|
|
160
147
|
"all_trees",
|
|
161
148
|
[](ModelOutputs& self, size_t i, size_t j) {
|
|
162
|
-
return self.all_trees
|
|
149
|
+
return self.all_trees.at(i).at(j);
|
|
163
150
|
});
|
|
164
151
|
|
|
165
152
|
rb_mExt
|
|
@@ -169,47 +156,47 @@ void Init_ext()
|
|
|
169
156
|
ModelOutputs model_outputs;
|
|
170
157
|
|
|
171
158
|
// data
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
159
|
+
auto nrows = options.get<size_t, Symbol>("nrows");
|
|
160
|
+
auto ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
|
161
|
+
auto ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
|
162
|
+
auto ncols_ord = options.get<size_t, Symbol>("ncols_ord");
|
|
176
163
|
|
|
177
|
-
double
|
|
164
|
+
double* numeric_data = nullptr;
|
|
178
165
|
if (ncols_numeric > 0) {
|
|
179
|
-
numeric_data = (
|
|
166
|
+
numeric_data = reinterpret_cast<double*>(const_cast<char*>(options.get<String, Symbol>("numeric_data").c_str()));
|
|
180
167
|
}
|
|
181
168
|
|
|
182
|
-
int
|
|
183
|
-
int
|
|
169
|
+
int* categorical_data = nullptr;
|
|
170
|
+
int* ncat = nullptr;
|
|
184
171
|
if (ncols_categ > 0) {
|
|
185
|
-
categorical_data = (
|
|
186
|
-
ncat = (
|
|
172
|
+
categorical_data = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("categorical_data").c_str()));
|
|
173
|
+
ncat = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("ncat").c_str()));
|
|
187
174
|
}
|
|
188
175
|
|
|
189
|
-
int
|
|
190
|
-
int
|
|
176
|
+
int* ordinal_data = nullptr;
|
|
177
|
+
int* ncat_ord = nullptr;
|
|
191
178
|
if (ncols_ord > 0) {
|
|
192
|
-
ordinal_data = (
|
|
193
|
-
ncat_ord = (
|
|
179
|
+
ordinal_data = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("ordinal_data").c_str()));
|
|
180
|
+
ncat_ord = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("ncat_ord").c_str()));
|
|
194
181
|
}
|
|
195
182
|
|
|
196
183
|
// options
|
|
197
|
-
char
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
184
|
+
char* cols_ignore = nullptr;
|
|
185
|
+
auto nthreads = options.get<int, Symbol>("nthreads");
|
|
186
|
+
auto categ_as_bin = options.get<bool, Symbol>("categ_as_bin");
|
|
187
|
+
auto ord_as_bin = options.get<bool, Symbol>("ord_as_bin");
|
|
188
|
+
auto cat_bruteforce_subset = options.get<bool, Symbol>("cat_bruteforce_subset");
|
|
189
|
+
auto categ_from_maj = options.get<bool, Symbol>("categ_from_maj");
|
|
190
|
+
auto take_mid = options.get<bool, Symbol>("take_mid");
|
|
191
|
+
auto max_depth = options.get<size_t, Symbol>("max_depth");
|
|
192
|
+
auto max_perc_outliers = options.get<double, Symbol>("pct_outliers");
|
|
193
|
+
auto min_size_numeric = options.get<size_t, Symbol>("min_size_numeric");
|
|
194
|
+
auto min_size_categ = options.get<size_t, Symbol>("min_size_categ");
|
|
195
|
+
auto min_gain = options.get<double, Symbol>("min_gain");
|
|
196
|
+
auto gain_as_pct = options.get<bool, Symbol>("gain_as_pct");
|
|
197
|
+
auto follow_all = options.get<bool, Symbol>("follow_all");
|
|
198
|
+
auto z_norm = options.get<double, Symbol>("z_norm");
|
|
199
|
+
auto z_outlier = options.get<double, Symbol>("z_outlier");
|
|
213
200
|
|
|
214
201
|
fit_outliers_models(
|
|
215
202
|
model_outputs,
|
|
@@ -245,28 +232,28 @@ void Init_ext()
|
|
|
245
232
|
"find_new_outliers",
|
|
246
233
|
[](ModelOutputs& model_outputs, Hash options) {
|
|
247
234
|
// data
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
235
|
+
auto nrows = options.get<size_t, Symbol>("nrows");
|
|
236
|
+
auto ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
|
237
|
+
auto ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
|
238
|
+
auto ncols_ord = options.get<size_t, Symbol>("ncols_ord");
|
|
252
239
|
|
|
253
|
-
double
|
|
240
|
+
double* numeric_data = nullptr;
|
|
254
241
|
if (ncols_numeric > 0) {
|
|
255
|
-
numeric_data = (
|
|
242
|
+
numeric_data = reinterpret_cast<double*>(const_cast<char*>(options.get<String, Symbol>("numeric_data").c_str()));
|
|
256
243
|
}
|
|
257
244
|
|
|
258
|
-
int
|
|
245
|
+
int* categorical_data = nullptr;
|
|
259
246
|
if (ncols_categ > 0) {
|
|
260
|
-
categorical_data = (
|
|
247
|
+
categorical_data = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("categorical_data").c_str()));
|
|
261
248
|
}
|
|
262
249
|
|
|
263
|
-
int
|
|
250
|
+
int* ordinal_data = nullptr;
|
|
264
251
|
if (ncols_ord > 0) {
|
|
265
|
-
ordinal_data = (
|
|
252
|
+
ordinal_data = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("ordinal_data").c_str()));
|
|
266
253
|
}
|
|
267
254
|
|
|
268
255
|
// options
|
|
269
|
-
|
|
256
|
+
auto nthreads = options.get<int, Symbol>("nthreads");
|
|
270
257
|
|
|
271
258
|
find_new_outliers(
|
|
272
259
|
numeric_data,
|
data/lib/outliertree/model.rb
CHANGED
|
@@ -1,11 +1,20 @@
|
|
|
1
1
|
module OutlierTree
|
|
2
2
|
class Model
|
|
3
3
|
def initialize(
|
|
4
|
-
max_depth: 4,
|
|
5
|
-
|
|
6
|
-
|
|
4
|
+
max_depth: 4,
|
|
5
|
+
min_gain: 0.01,
|
|
6
|
+
z_norm: 2.67,
|
|
7
|
+
z_outlier: 8.0,
|
|
8
|
+
pct_outliers: 0.01,
|
|
9
|
+
min_size_numeric: 25,
|
|
10
|
+
min_size_categ: 50,
|
|
11
|
+
categ_split: "binarize",
|
|
12
|
+
categ_outliers: "tail",
|
|
13
|
+
numeric_split: "raw",
|
|
14
|
+
follow_all: false,
|
|
15
|
+
gain_as_pct: true,
|
|
16
|
+
nthreads: -1
|
|
7
17
|
)
|
|
8
|
-
|
|
9
18
|
# TODO validate values
|
|
10
19
|
@max_depth = max_depth
|
|
11
20
|
@min_gain = min_gain
|
data/lib/outliertree/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: outliertree
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
@@ -65,14 +65,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
65
65
|
requirements:
|
|
66
66
|
- - ">="
|
|
67
67
|
- !ruby/object:Gem::Version
|
|
68
|
-
version: '3.
|
|
68
|
+
version: '3.3'
|
|
69
69
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
70
70
|
requirements:
|
|
71
71
|
- - ">="
|
|
72
72
|
- !ruby/object:Gem::Version
|
|
73
73
|
version: '0'
|
|
74
74
|
requirements: []
|
|
75
|
-
rubygems_version:
|
|
75
|
+
rubygems_version: 4.0.6
|
|
76
76
|
specification_version: 4
|
|
77
77
|
summary: Explainable outlier/anomaly detection for Ruby
|
|
78
78
|
test_files: []
|