outliertree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +674 -0
- data/NOTICE.txt +14 -0
- data/README.md +107 -0
- data/ext/outliertree/ext.cpp +260 -0
- data/ext/outliertree/extconf.rb +21 -0
- data/lib/outliertree.rb +17 -0
- data/lib/outliertree/dataset.rb +35 -0
- data/lib/outliertree/model.rb +128 -0
- data/lib/outliertree/result.rb +190 -0
- data/lib/outliertree/version.rb +3 -0
- data/vendor/outliertree/LICENSE +674 -0
- data/vendor/outliertree/README.md +155 -0
- data/vendor/outliertree/src/Makevars +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +123 -0
- data/vendor/outliertree/src/Rwrapper.cpp +1225 -0
- data/vendor/outliertree/src/cat_outlier.cpp +328 -0
- data/vendor/outliertree/src/clusters.cpp +972 -0
- data/vendor/outliertree/src/fit_model.cpp +1932 -0
- data/vendor/outliertree/src/misc.cpp +685 -0
- data/vendor/outliertree/src/outlier_tree.hpp +758 -0
- data/vendor/outliertree/src/predict.cpp +706 -0
- data/vendor/outliertree/src/split.cpp +1098 -0
- metadata +150 -0
data/NOTICE.txt
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Copyright (C) 2020 Andrew Kane
|
2
|
+
|
3
|
+
This program is free software: you can redistribute it and/or modify
|
4
|
+
it under the terms of the GNU General Public License as published by
|
5
|
+
the Free Software Foundation, either version 3 of the License, or
|
6
|
+
(at your option) any later version.
|
7
|
+
|
8
|
+
This program is distributed in the hope that it will be useful,
|
9
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
GNU General Public License for more details.
|
12
|
+
|
13
|
+
You should have received a copy of the GNU General Public License
|
14
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
data/README.md
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# OutlierTree
|
2
|
+
|
3
|
+
:evergreen_tree: [OutlierTree](https://github.com/david-cortes/outliertree) - explainable outlier/anomaly detection - for Ruby
|
4
|
+
|
5
|
+
Produces human-readable explanations for why values are detected as outliers
|
6
|
+
|
7
|
+
```txt
|
8
|
+
Price (2.50) looks low given Department is Books and Sale is false
|
9
|
+
```
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
Add this line to your application’s Gemfile:
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
gem 'outliertree'
|
17
|
+
```
|
18
|
+
|
19
|
+
## Getting Started
|
20
|
+
|
21
|
+
Prep your data
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
data = [
|
25
|
+
{department: "Books", sale: false, price: 2.50},
|
26
|
+
{department: "Books", sale: true, price: 3.00},
|
27
|
+
{department: "Movies", sale: false, price: 5.00}
|
28
|
+
]
|
29
|
+
```
|
30
|
+
|
31
|
+
Train a model
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
model = OutlierTree.new
|
35
|
+
model.fit(data)
|
36
|
+
```
|
37
|
+
|
38
|
+
Get outliers
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
model.outliers(data)
|
42
|
+
```
|
43
|
+
|
44
|
+
## Parameters
|
45
|
+
|
46
|
+
Pass parameters - default values below
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
OutlierTree.new(
|
50
|
+
max_depth: 4,
|
51
|
+
min_gain: 0.01,
|
52
|
+
z_norm: 2.67,
|
53
|
+
z_outlier: 8.0,
|
54
|
+
pct_outliers: 0.01,
|
55
|
+
min_size_numeric: 25,
|
56
|
+
min_size_categ: 50,
|
57
|
+
categ_split: "binarize",
|
58
|
+
categ_outliers: "tail",
|
59
|
+
numeric_split: "raw",
|
60
|
+
follow_all: false,
|
61
|
+
gain_as_pct: true,
|
62
|
+
nthreads: -1
|
63
|
+
)
|
64
|
+
```
|
65
|
+
|
66
|
+
See a [detailed explanation](https://outliertree.readthedocs.io/en/latest/#outliertree.OutlierTree)
|
67
|
+
|
68
|
+
## Data
|
69
|
+
|
70
|
+
Data can be an array of hashes
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
[
|
74
|
+
{department: "Books", sale: false, price: 2.50},
|
75
|
+
{department: "Books", sale: true, price: 3.00},
|
76
|
+
{department: "Movies", sale: false, price: 5.00}
|
77
|
+
]
|
78
|
+
```
|
79
|
+
|
80
|
+
Or a Rover data frame
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
Rover.read_csv("data.csv")
|
84
|
+
```
|
85
|
+
|
86
|
+
## History
|
87
|
+
|
88
|
+
View the [changelog](https://github.com/ankane/outliertree/blob/master/CHANGELOG.md)
|
89
|
+
|
90
|
+
## Contributing
|
91
|
+
|
92
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
93
|
+
|
94
|
+
- [Report bugs](https://github.com/ankane/outliertree/issues)
|
95
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/outliertree/pulls)
|
96
|
+
- Write, clarify, or fix documentation
|
97
|
+
- Suggest or add new features
|
98
|
+
|
99
|
+
To get started with development:
|
100
|
+
|
101
|
+
```sh
|
102
|
+
git clone --recursive https://github.com/ankane/outliertree.git
|
103
|
+
cd outliertree
|
104
|
+
bundle install
|
105
|
+
bundle exec rake compile
|
106
|
+
bundle exec rake test
|
107
|
+
```
|
data/ext/outliertree/ext.cpp
ADDED
@@ -0,0 +1,260 @@
|
|
1
|
+
// outliertree
|
2
|
+
#include <outlier_tree.hpp>
|
3
|
+
|
4
|
+
// rice
|
5
|
+
#include <rice/Array.hpp>
|
6
|
+
#include <rice/Hash.hpp>
|
7
|
+
#include <rice/Module.hpp>
|
8
|
+
#include <rice/Object.hpp>
|
9
|
+
#include <rice/String.hpp>
|
10
|
+
#include <rice/Symbol.hpp>
|
11
|
+
|
12
|
+
using Rice::Array;
|
13
|
+
using Rice::Hash;
|
14
|
+
using Rice::Module;
|
15
|
+
using Rice::Object;
|
16
|
+
using Rice::String;
|
17
|
+
using Rice::Symbol;
|
18
|
+
using Rice::define_class_under;
|
19
|
+
using Rice::define_module;
|
20
|
+
|
21
|
+
template<>
|
22
|
+
Object to_ruby<std::vector<char>>(std::vector<char> const & x)
|
23
|
+
{
|
24
|
+
Array a;
|
25
|
+
for (size_t i = 0; i < x.size(); i++) {
|
26
|
+
a.push(x[i]);
|
27
|
+
}
|
28
|
+
return a;
|
29
|
+
}
|
30
|
+
|
31
|
+
template<>
|
32
|
+
Object to_ruby<std::vector<int>>(std::vector<int> const & x)
|
33
|
+
{
|
34
|
+
Array a;
|
35
|
+
for (size_t i = 0; i < x.size(); i++) {
|
36
|
+
a.push(x[i]);
|
37
|
+
}
|
38
|
+
return a;
|
39
|
+
}
|
40
|
+
|
41
|
+
template<>
|
42
|
+
Object to_ruby<std::vector<unsigned long>>(std::vector<unsigned long> const & x)
|
43
|
+
{
|
44
|
+
Array a;
|
45
|
+
for (size_t i = 0; i < x.size(); i++) {
|
46
|
+
a.push(x[i]);
|
47
|
+
}
|
48
|
+
return a;
|
49
|
+
}
|
50
|
+
|
51
|
+
template<>
|
52
|
+
Object to_ruby<std::vector<double>>(std::vector<double> const & x)
|
53
|
+
{
|
54
|
+
Array a;
|
55
|
+
for (size_t i = 0; i < x.size(); i++) {
|
56
|
+
a.push(x[i]);
|
57
|
+
}
|
58
|
+
return a;
|
59
|
+
}
|
60
|
+
|
61
|
+
template<>
|
62
|
+
Object to_ruby<ColType>(ColType const & x)
|
63
|
+
{
|
64
|
+
switch (x) {
|
65
|
+
case Numeric: return Symbol("numeric");
|
66
|
+
case Categorical: return Symbol("categorical");
|
67
|
+
case Ordinal: return Symbol("ordinal");
|
68
|
+
case NoType: return Symbol("no_type");
|
69
|
+
}
|
70
|
+
throw std::runtime_error("Unknown column type");
|
71
|
+
}
|
72
|
+
|
73
|
+
template<>
|
74
|
+
Object to_ruby<SplitType>(SplitType const & x)
|
75
|
+
{
|
76
|
+
switch (x) {
|
77
|
+
case LessOrEqual: return Symbol("less_or_equal");
|
78
|
+
case Greater: return Symbol("greater");
|
79
|
+
case Equal: return Symbol("equal");
|
80
|
+
case NotEqual: return Symbol("not_equal");
|
81
|
+
case InSubset: return Symbol("in_subset");
|
82
|
+
case NotInSubset: return Symbol("not_in_subset");
|
83
|
+
case SingleCateg: return Symbol("single_categ");
|
84
|
+
case SubTrees: return Symbol("sub_trees");
|
85
|
+
case IsNa: return Symbol("is_na");
|
86
|
+
case Root: return Symbol("root");
|
87
|
+
}
|
88
|
+
throw std::runtime_error("Unknown split type");
|
89
|
+
}
|
90
|
+
|
91
|
+
extern "C"
|
92
|
+
void Init_ext()
|
93
|
+
{
|
94
|
+
Module rb_mOutlierTree = define_module("OutlierTree");
|
95
|
+
Module rb_mExt = define_module_under(rb_mOutlierTree, "Ext");
|
96
|
+
|
97
|
+
define_class_under<Cluster>(rb_mExt, "Cluster")
|
98
|
+
.define_method("upper_lim", *[](Cluster& self) { return self.upper_lim; })
|
99
|
+
.define_method("display_lim_high", *[](Cluster& self) { return self.display_lim_high; })
|
100
|
+
.define_method("perc_below", *[](Cluster& self) { return self.perc_below; })
|
101
|
+
.define_method("display_lim_low", *[](Cluster& self) { return self.display_lim_low; })
|
102
|
+
.define_method("perc_above", *[](Cluster& self) { return self.perc_above; })
|
103
|
+
.define_method("display_mean", *[](Cluster& self) { return self.display_mean; })
|
104
|
+
.define_method("display_sd", *[](Cluster& self) { return self.display_sd; })
|
105
|
+
.define_method("cluster_size", *[](Cluster& self) { return self.cluster_size; })
|
106
|
+
.define_method("split_point", *[](Cluster& self) { return self.split_point; })
|
107
|
+
.define_method("split_subset", *[](Cluster& self) { return self.split_subset; })
|
108
|
+
.define_method("split_lev", *[](Cluster& self) { return self.split_lev; })
|
109
|
+
.define_method("split_type", *[](Cluster& self) { return self.split_type; })
|
110
|
+
.define_method("column_type", *[](Cluster& self) { return self.column_type; })
|
111
|
+
.define_method("has_na_branch", *[](Cluster& self) { return self.has_NA_branch; })
|
112
|
+
.define_method("col_num", *[](Cluster& self) { return self.col_num; });
|
113
|
+
|
114
|
+
define_class_under<ClusterTree>(rb_mExt, "ClusterTree")
|
115
|
+
.define_method("parent_branch", *[](ClusterTree& self) { return self.parent_branch; })
|
116
|
+
.define_method("parent", *[](ClusterTree& self) { return self.parent; })
|
117
|
+
.define_method("all_branches", *[](ClusterTree& self) { return self.all_branches; })
|
118
|
+
.define_method("column_type", *[](ClusterTree& self) { return self.column_type; })
|
119
|
+
.define_method("col_num", *[](ClusterTree& self) { return self.col_num; })
|
120
|
+
.define_method("split_point", *[](ClusterTree& self) { return self.split_point; })
|
121
|
+
.define_method("split_subset", *[](ClusterTree& self) { return self.split_subset; })
|
122
|
+
.define_method("split_lev", *[](ClusterTree& self) { return self.split_lev; });
|
123
|
+
|
124
|
+
define_class_under<ModelOutputs>(rb_mExt, "ModelOutputs")
|
125
|
+
.define_method("outlier_scores_final", *[](ModelOutputs& self) { return self.outlier_scores_final; })
|
126
|
+
.define_method("outlier_columns_final", *[](ModelOutputs& self) { return self.outlier_columns_final; })
|
127
|
+
.define_method("outlier_clusters_final", *[](ModelOutputs& self) { return self.outlier_clusters_final; })
|
128
|
+
.define_method("outlier_trees_final", *[](ModelOutputs& self) { return self.outlier_trees_final; })
|
129
|
+
.define_method("outlier_depth_final", *[](ModelOutputs& self) { return self.outlier_depth_final; })
|
130
|
+
.define_method("outlier_decimals_distr", *[](ModelOutputs& self) { return self.outlier_decimals_distr; })
|
131
|
+
.define_method("min_decimals_col", *[](ModelOutputs& self) { return self.min_decimals_col; })
|
132
|
+
.define_method(
|
133
|
+
"all_clusters",
|
134
|
+
*[](ModelOutputs& self, size_t i, size_t j) {
|
135
|
+
return self.all_clusters[i][j];
|
136
|
+
})
|
137
|
+
.define_method(
|
138
|
+
"all_trees",
|
139
|
+
*[](ModelOutputs& self, size_t i, size_t j) {
|
140
|
+
return self.all_trees[i][j];
|
141
|
+
});
|
142
|
+
|
143
|
+
rb_mExt
|
144
|
+
.define_singleton_method(
|
145
|
+
"fit_outliers_models",
|
146
|
+
*[](Hash options) {
|
147
|
+
ModelOutputs model_outputs;
|
148
|
+
|
149
|
+
// data
|
150
|
+
size_t nrows = options.get<size_t, Symbol>("nrows");
|
151
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
152
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
153
|
+
size_t ncols_ord = options.get<size_t, Symbol>("ncols_ord");
|
154
|
+
|
155
|
+
double *restrict numeric_data = NULL;
|
156
|
+
if (ncols_numeric > 0) {
|
157
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
158
|
+
}
|
159
|
+
|
160
|
+
int *restrict categorical_data = NULL;
|
161
|
+
int *restrict ncat = NULL;
|
162
|
+
if (ncols_categ > 0) {
|
163
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
164
|
+
ncat = (int*) options.get<String, Symbol>("ncat").c_str();
|
165
|
+
}
|
166
|
+
|
167
|
+
int *restrict ordinal_data = NULL;
|
168
|
+
int *restrict ncat_ord = NULL;
|
169
|
+
if (ncols_ord > 0) {
|
170
|
+
ordinal_data = (int*) options.get<String, Symbol>("ordinal_data").c_str();
|
171
|
+
ncat_ord = (int*) options.get<String, Symbol>("ncat_ord").c_str();
|
172
|
+
}
|
173
|
+
|
174
|
+
// options
|
175
|
+
char *restrict cols_ignore = NULL;
|
176
|
+
int nthreads = options.get<int, Symbol>("nthreads");
|
177
|
+
bool categ_as_bin = options.get<bool, Symbol>("categ_as_bin");
|
178
|
+
bool ord_as_bin = options.get<bool, Symbol>("ord_as_bin");
|
179
|
+
bool cat_bruteforce_subset = options.get<bool, Symbol>("cat_bruteforce_subset");
|
180
|
+
bool categ_from_maj = options.get<bool, Symbol>("categ_from_maj");
|
181
|
+
bool take_mid = options.get<bool, Symbol>("take_mid");
|
182
|
+
size_t max_depth = options.get<size_t, Symbol>("max_depth");
|
183
|
+
double max_perc_outliers = options.get<double, Symbol>("pct_outliers");
|
184
|
+
size_t min_size_numeric = options.get<size_t, Symbol>("min_size_numeric");
|
185
|
+
size_t min_size_categ = options.get<size_t, Symbol>("min_size_categ");
|
186
|
+
double min_gain = options.get<double, Symbol>("min_gain");
|
187
|
+
bool gain_as_pct = options.get<bool, Symbol>("gain_as_pct");
|
188
|
+
bool follow_all = options.get<bool, Symbol>("follow_all");
|
189
|
+
double z_norm = options.get<double, Symbol>("z_norm");
|
190
|
+
double z_outlier = options.get<double, Symbol>("z_outlier");
|
191
|
+
|
192
|
+
fit_outliers_models(
|
193
|
+
model_outputs,
|
194
|
+
numeric_data,
|
195
|
+
ncols_numeric,
|
196
|
+
categorical_data,
|
197
|
+
ncols_categ,
|
198
|
+
ncat,
|
199
|
+
ordinal_data,
|
200
|
+
ncols_ord,
|
201
|
+
ncat_ord,
|
202
|
+
nrows,
|
203
|
+
cols_ignore,
|
204
|
+
nthreads,
|
205
|
+
categ_as_bin,
|
206
|
+
ord_as_bin,
|
207
|
+
cat_bruteforce_subset,
|
208
|
+
categ_from_maj,
|
209
|
+
take_mid,
|
210
|
+
max_depth,
|
211
|
+
max_perc_outliers,
|
212
|
+
min_size_numeric,
|
213
|
+
min_size_categ,
|
214
|
+
min_gain,
|
215
|
+
gain_as_pct,
|
216
|
+
follow_all,
|
217
|
+
z_norm,
|
218
|
+
z_outlier
|
219
|
+
);
|
220
|
+
return model_outputs;
|
221
|
+
})
|
222
|
+
.define_singleton_method(
|
223
|
+
"find_new_outliers",
|
224
|
+
*[](ModelOutputs& model_outputs, Hash options) {
|
225
|
+
// data
|
226
|
+
size_t nrows = options.get<size_t, Symbol>("nrows");
|
227
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
228
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
229
|
+
size_t ncols_ord = options.get<size_t, Symbol>("ncols_ord");
|
230
|
+
|
231
|
+
double *restrict numeric_data = NULL;
|
232
|
+
if (ncols_numeric > 0) {
|
233
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
234
|
+
}
|
235
|
+
|
236
|
+
int *restrict categorical_data = NULL;
|
237
|
+
if (ncols_categ > 0) {
|
238
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
239
|
+
}
|
240
|
+
|
241
|
+
int *restrict ordinal_data = NULL;
|
242
|
+
if (ncols_ord > 0) {
|
243
|
+
ordinal_data = (int*) options.get<String, Symbol>("ordinal_data").c_str();
|
244
|
+
}
|
245
|
+
|
246
|
+
// options
|
247
|
+
int nthreads = options.get<int, Symbol>("nthreads");
|
248
|
+
|
249
|
+
find_new_outliers(
|
250
|
+
numeric_data,
|
251
|
+
categorical_data,
|
252
|
+
ordinal_data,
|
253
|
+
nrows,
|
254
|
+
nthreads,
|
255
|
+
model_outputs
|
256
|
+
);
|
257
|
+
|
258
|
+
return model_outputs;
|
259
|
+
});
|
260
|
+
}
|
data/ext/outliertree/extconf.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require "mkmf-rice"
|
2
|
+
|
3
|
+
# Build configuration for the native extension: compiles the binding together
# with the vendored outliertree sources.
$CXXFLAGS += " -std=c++11"

# Apple clang needs -Xclang in front of -fopenmp.
apple_clang = /apple clang/i.match?(RbConfig::CONFIG["CC_VERSION_MESSAGE"].to_s)

# check omp first
if have_library("omp") || have_library("gomp")
  $CXXFLAGS += " -Xclang" if apple_clang
  $CXXFLAGS += " -fopenmp"
end

ext_dir = File.expand_path(".", __dir__)
vendor_src = File.expand_path("../../vendor/outliertree/src", __dir__)

# The R-specific wrappers in the vendored tree are left out of the build.
r_only_sources = %w(Rwrapper.cpp RcppExports.cpp)
candidates = Dir["{#{ext_dir},#{vendor_src}}/*.{cc,cpp}"]
$srcs = candidates.reject do |path|
  r_only_sources.include?(File.basename(path))
end
$INCFLAGS += " -I#{vendor_src}"
$VPATH << vendor_src

create_makefile("outliertree/ext")
|
data/lib/outliertree.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# ext
|
2
|
+
require "outliertree/ext"
|
3
|
+
|
4
|
+
# stdlib
|
5
|
+
require "etc"
|
6
|
+
|
7
|
+
# modules
|
8
|
+
require "outliertree/dataset"
|
9
|
+
require "outliertree/model"
|
10
|
+
require "outliertree/result"
|
11
|
+
require "outliertree/version"
|
12
|
+
|
13
|
+
module OutlierTree
  class << self
    # Convenience constructor: OutlierTree.new(**options) builds a Model,
    # forwarding every keyword argument unchanged.
    def new(**options)
      Model.new(**options)
    end
  end
end
|
data/lib/outliertree/dataset.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
module OutlierTree
  # Column-oriented view over the input data.
  #
  # Accepts either a Rover::DataFrame (when Rover is loaded) or an
  # enumerable of hashes (one hash per row), and splits the columns into
  # numeric and categorical ones.
  class Dataset
    # Column keys classified as numeric / categorical, in first-seen order.
    attr_reader :numeric_columns, :categorical_columns

    # @param data [Rover::DataFrame, Array<Hash>] the rows to wrap
    # @raise [ArgumentError] if data is neither a data frame nor an
    #   enumerable, or if any row is not a Hash
    def initialize(data)
      @data = data

      if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
        @vectors = data.vectors
        # Data frame columns typed :object or :bool are categorical;
        # every other type is treated as numeric.
        @numeric_columns, @categorical_columns =
          data.keys.partition { |k| ![:object, :bool].include?(data[k].type) }
      else
        # Fail with a clear ArgumentError (not NoMethodError) for nil,
        # strings and other non-collections.
        unless data.is_a?(Enumerable)
          raise ArgumentError, "Data must be an array of hashes or a Rover data frame"
        end
        raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }

        # Union of all keys; rows missing a key contribute nil to that column.
        keys = data.flat_map(&:keys).uniq
        @vectors = keys.each_with_object({}) do |k, vectors|
          vectors[k] = data.map { |d| d[k] }
        end

        # A column is numeric when every value is nil or a Numeric;
        # anything else (strings, booleans, ...) makes it categorical.
        @numeric_columns, @categorical_columns =
          keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
      end
    end

    # @param k [Object] column key
    # @return the column's values, or nil for an unknown key
    def [](k)
      @vectors[k]
    end

    # @return [Integer] length of the first column, or 0 when there are no
    #   columns
    def size
      @vectors.any? ? @vectors.values.first.size : 0
    end
  end
end
|