outliertree 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +674 -0
- data/NOTICE.txt +14 -0
- data/README.md +107 -0
- data/ext/outliertree/ext.cpp +260 -0
- data/ext/outliertree/extconf.rb +21 -0
- data/lib/outliertree.rb +17 -0
- data/lib/outliertree/dataset.rb +35 -0
- data/lib/outliertree/model.rb +128 -0
- data/lib/outliertree/result.rb +190 -0
- data/lib/outliertree/version.rb +3 -0
- data/vendor/outliertree/LICENSE +674 -0
- data/vendor/outliertree/README.md +155 -0
- data/vendor/outliertree/src/Makevars +3 -0
- data/vendor/outliertree/src/RcppExports.cpp +123 -0
- data/vendor/outliertree/src/Rwrapper.cpp +1225 -0
- data/vendor/outliertree/src/cat_outlier.cpp +328 -0
- data/vendor/outliertree/src/clusters.cpp +972 -0
- data/vendor/outliertree/src/fit_model.cpp +1932 -0
- data/vendor/outliertree/src/misc.cpp +685 -0
- data/vendor/outliertree/src/outlier_tree.hpp +758 -0
- data/vendor/outliertree/src/predict.cpp +706 -0
- data/vendor/outliertree/src/split.cpp +1098 -0
- metadata +150 -0
data/NOTICE.txt
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
Copyright (C) 2020 Andrew Kane
|
2
|
+
|
3
|
+
This program is free software: you can redistribute it and/or modify
|
4
|
+
it under the terms of the GNU General Public License as published by
|
5
|
+
the Free Software Foundation, either version 3 of the License, or
|
6
|
+
(at your option) any later version.
|
7
|
+
|
8
|
+
This program is distributed in the hope that it will be useful,
|
9
|
+
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
10
|
+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
11
|
+
GNU General Public License for more details.
|
12
|
+
|
13
|
+
You should have received a copy of the GNU General Public License
|
14
|
+
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
data/README.md
ADDED
@@ -0,0 +1,107 @@
|
|
1
|
+
# OutlierTree
|
2
|
+
|
3
|
+
:evergreen_tree: [OutlierTree](https://github.com/david-cortes/outliertree) - explainable outlier/anomaly detection - for Ruby
|
4
|
+
|
5
|
+
Produces human-readable explanations for why values are detected as outliers
|
6
|
+
|
7
|
+
```txt
|
8
|
+
Price (2.50) looks low given Department is Books and Sale is false
|
9
|
+
```
|
10
|
+
|
11
|
+
## Installation
|
12
|
+
|
13
|
+
Add this line to your application’s Gemfile:
|
14
|
+
|
15
|
+
```ruby
|
16
|
+
gem 'outliertree'
|
17
|
+
```
|
18
|
+
|
19
|
+
## Getting Started
|
20
|
+
|
21
|
+
Prep your data
|
22
|
+
|
23
|
+
```ruby
|
24
|
+
data = [
|
25
|
+
{department: "Books", sale: false, price: 2.50},
|
26
|
+
{department: "Books", sale: true, price: 3.00},
|
27
|
+
{department: "Movies", sale: false, price: 5.00}
|
28
|
+
]
|
29
|
+
```
|
30
|
+
|
31
|
+
Train a model
|
32
|
+
|
33
|
+
```ruby
|
34
|
+
model = OutlierTree.new
|
35
|
+
model.fit(data)
|
36
|
+
```
|
37
|
+
|
38
|
+
Get outliers
|
39
|
+
|
40
|
+
```ruby
|
41
|
+
model.outliers(data)
|
42
|
+
```
|
43
|
+
|
44
|
+
## Parameters
|
45
|
+
|
46
|
+
Pass parameters - default values below
|
47
|
+
|
48
|
+
```ruby
|
49
|
+
OutlierTree.new(
|
50
|
+
max_depth: 4,
|
51
|
+
min_gain: 0.01,
|
52
|
+
z_norm: 2.67,
|
53
|
+
z_outlier: 8.0,
|
54
|
+
pct_outliers: 0.01,
|
55
|
+
min_size_numeric: 25,
|
56
|
+
min_size_categ: 50,
|
57
|
+
categ_split: "binarize",
|
58
|
+
categ_outliers: "tail",
|
59
|
+
numeric_split: "raw",
|
60
|
+
follow_all: false,
|
61
|
+
gain_as_pct: true,
|
62
|
+
nthreads: -1
|
63
|
+
)
|
64
|
+
```
|
65
|
+
|
66
|
+
See a [detailed explanation](https://outliertree.readthedocs.io/en/latest/#outliertree.OutlierTree)
|
67
|
+
|
68
|
+
## Data
|
69
|
+
|
70
|
+
Data can be an array of hashes
|
71
|
+
|
72
|
+
```ruby
|
73
|
+
[
|
74
|
+
{department: "Books", sale: false, price: 2.50},
|
75
|
+
{department: "Books", sale: true, price: 3.00},
|
76
|
+
{department: "Movies", sale: false, price: 5.00}
|
77
|
+
]
|
78
|
+
```
|
79
|
+
|
80
|
+
Or a Rover data frame
|
81
|
+
|
82
|
+
```ruby
|
83
|
+
Rover.read_csv("data.csv")
|
84
|
+
```
|
85
|
+
|
86
|
+
## History
|
87
|
+
|
88
|
+
View the [changelog](https://github.com/ankane/outliertree/blob/master/CHANGELOG.md)
|
89
|
+
|
90
|
+
## Contributing
|
91
|
+
|
92
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
93
|
+
|
94
|
+
- [Report bugs](https://github.com/ankane/outliertree/issues)
|
95
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/outliertree/pulls)
|
96
|
+
- Write, clarify, or fix documentation
|
97
|
+
- Suggest or add new features
|
98
|
+
|
99
|
+
To get started with development:
|
100
|
+
|
101
|
+
```sh
|
102
|
+
git clone --recursive https://github.com/ankane/outliertree.git
|
103
|
+
cd outliertree
|
104
|
+
bundle install
|
105
|
+
bundle exec rake compile
|
106
|
+
bundle exec rake test
|
107
|
+
```
|
data/ext/outliertree/ext.cpp
ADDED
@@ -0,0 +1,260 @@
|
|
1
|
+
// outliertree
|
2
|
+
#include <outlier_tree.hpp>
|
3
|
+
|
4
|
+
// rice
|
5
|
+
#include <rice/Array.hpp>
|
6
|
+
#include <rice/Hash.hpp>
|
7
|
+
#include <rice/Module.hpp>
|
8
|
+
#include <rice/Object.hpp>
|
9
|
+
#include <rice/String.hpp>
|
10
|
+
#include <rice/Symbol.hpp>
|
11
|
+
|
12
|
+
using Rice::Array;
|
13
|
+
using Rice::Hash;
|
14
|
+
using Rice::Module;
|
15
|
+
using Rice::Object;
|
16
|
+
using Rice::String;
|
17
|
+
using Rice::Symbol;
|
18
|
+
using Rice::define_class_under;
|
19
|
+
using Rice::define_module;
|
20
|
+
|
21
|
+
template<>
|
22
|
+
Object to_ruby<std::vector<char>>(std::vector<char> const & x)
|
23
|
+
{
|
24
|
+
Array a;
|
25
|
+
for (size_t i = 0; i < x.size(); i++) {
|
26
|
+
a.push(x[i]);
|
27
|
+
}
|
28
|
+
return a;
|
29
|
+
}
|
30
|
+
|
31
|
+
template<>
|
32
|
+
Object to_ruby<std::vector<int>>(std::vector<int> const & x)
|
33
|
+
{
|
34
|
+
Array a;
|
35
|
+
for (size_t i = 0; i < x.size(); i++) {
|
36
|
+
a.push(x[i]);
|
37
|
+
}
|
38
|
+
return a;
|
39
|
+
}
|
40
|
+
|
41
|
+
template<>
|
42
|
+
Object to_ruby<std::vector<unsigned long>>(std::vector<unsigned long> const & x)
|
43
|
+
{
|
44
|
+
Array a;
|
45
|
+
for (size_t i = 0; i < x.size(); i++) {
|
46
|
+
a.push(x[i]);
|
47
|
+
}
|
48
|
+
return a;
|
49
|
+
}
|
50
|
+
|
51
|
+
template<>
|
52
|
+
Object to_ruby<std::vector<double>>(std::vector<double> const & x)
|
53
|
+
{
|
54
|
+
Array a;
|
55
|
+
for (size_t i = 0; i < x.size(); i++) {
|
56
|
+
a.push(x[i]);
|
57
|
+
}
|
58
|
+
return a;
|
59
|
+
}
|
60
|
+
|
61
|
+
template<>
|
62
|
+
Object to_ruby<ColType>(ColType const & x)
|
63
|
+
{
|
64
|
+
switch (x) {
|
65
|
+
case Numeric: return Symbol("numeric");
|
66
|
+
case Categorical: return Symbol("categorical");
|
67
|
+
case Ordinal: return Symbol("ordinal");
|
68
|
+
case NoType: return Symbol("no_type");
|
69
|
+
}
|
70
|
+
throw std::runtime_error("Unknown column type");
|
71
|
+
}
|
72
|
+
|
73
|
+
template<>
|
74
|
+
Object to_ruby<SplitType>(SplitType const & x)
|
75
|
+
{
|
76
|
+
switch (x) {
|
77
|
+
case LessOrEqual: return Symbol("less_or_equal");
|
78
|
+
case Greater: return Symbol("greater");
|
79
|
+
case Equal: return Symbol("equal");
|
80
|
+
case NotEqual: return Symbol("not_equal");
|
81
|
+
case InSubset: return Symbol("in_subset");
|
82
|
+
case NotInSubset: return Symbol("not_in_subset");
|
83
|
+
case SingleCateg: return Symbol("single_categ");
|
84
|
+
case SubTrees: return Symbol("sub_trees");
|
85
|
+
case IsNa: return Symbol("is_na");
|
86
|
+
case Root: return Symbol("root");
|
87
|
+
}
|
88
|
+
throw std::runtime_error("Unknown split type");
|
89
|
+
}
|
90
|
+
|
91
|
+
extern "C"
|
92
|
+
void Init_ext()
|
93
|
+
{
|
94
|
+
Module rb_mOutlierTree = define_module("OutlierTree");
|
95
|
+
Module rb_mExt = define_module_under(rb_mOutlierTree, "Ext");
|
96
|
+
|
97
|
+
define_class_under<Cluster>(rb_mExt, "Cluster")
|
98
|
+
.define_method("upper_lim", *[](Cluster& self) { return self.upper_lim; })
|
99
|
+
.define_method("display_lim_high", *[](Cluster& self) { return self.display_lim_high; })
|
100
|
+
.define_method("perc_below", *[](Cluster& self) { return self.perc_below; })
|
101
|
+
.define_method("display_lim_low", *[](Cluster& self) { return self.display_lim_low; })
|
102
|
+
.define_method("perc_above", *[](Cluster& self) { return self.perc_above; })
|
103
|
+
.define_method("display_mean", *[](Cluster& self) { return self.display_mean; })
|
104
|
+
.define_method("display_sd", *[](Cluster& self) { return self.display_sd; })
|
105
|
+
.define_method("cluster_size", *[](Cluster& self) { return self.cluster_size; })
|
106
|
+
.define_method("split_point", *[](Cluster& self) { return self.split_point; })
|
107
|
+
.define_method("split_subset", *[](Cluster& self) { return self.split_subset; })
|
108
|
+
.define_method("split_lev", *[](Cluster& self) { return self.split_lev; })
|
109
|
+
.define_method("split_type", *[](Cluster& self) { return self.split_type; })
|
110
|
+
.define_method("column_type", *[](Cluster& self) { return self.column_type; })
|
111
|
+
.define_method("has_na_branch", *[](Cluster& self) { return self.has_NA_branch; })
|
112
|
+
.define_method("col_num", *[](Cluster& self) { return self.col_num; });
|
113
|
+
|
114
|
+
define_class_under<ClusterTree>(rb_mExt, "ClusterTree")
|
115
|
+
.define_method("parent_branch", *[](ClusterTree& self) { return self.parent_branch; })
|
116
|
+
.define_method("parent", *[](ClusterTree& self) { return self.parent; })
|
117
|
+
.define_method("all_branches", *[](ClusterTree& self) { return self.all_branches; })
|
118
|
+
.define_method("column_type", *[](ClusterTree& self) { return self.column_type; })
|
119
|
+
.define_method("col_num", *[](ClusterTree& self) { return self.col_num; })
|
120
|
+
.define_method("split_point", *[](ClusterTree& self) { return self.split_point; })
|
121
|
+
.define_method("split_subset", *[](ClusterTree& self) { return self.split_subset; })
|
122
|
+
.define_method("split_lev", *[](ClusterTree& self) { return self.split_lev; });
|
123
|
+
|
124
|
+
define_class_under<ModelOutputs>(rb_mExt, "ModelOutputs")
|
125
|
+
.define_method("outlier_scores_final", *[](ModelOutputs& self) { return self.outlier_scores_final; })
|
126
|
+
.define_method("outlier_columns_final", *[](ModelOutputs& self) { return self.outlier_columns_final; })
|
127
|
+
.define_method("outlier_clusters_final", *[](ModelOutputs& self) { return self.outlier_clusters_final; })
|
128
|
+
.define_method("outlier_trees_final", *[](ModelOutputs& self) { return self.outlier_trees_final; })
|
129
|
+
.define_method("outlier_depth_final", *[](ModelOutputs& self) { return self.outlier_depth_final; })
|
130
|
+
.define_method("outlier_decimals_distr", *[](ModelOutputs& self) { return self.outlier_decimals_distr; })
|
131
|
+
.define_method("min_decimals_col", *[](ModelOutputs& self) { return self.min_decimals_col; })
|
132
|
+
.define_method(
|
133
|
+
"all_clusters",
|
134
|
+
*[](ModelOutputs& self, size_t i, size_t j) {
|
135
|
+
return self.all_clusters[i][j];
|
136
|
+
})
|
137
|
+
.define_method(
|
138
|
+
"all_trees",
|
139
|
+
*[](ModelOutputs& self, size_t i, size_t j) {
|
140
|
+
return self.all_trees[i][j];
|
141
|
+
});
|
142
|
+
|
143
|
+
rb_mExt
|
144
|
+
.define_singleton_method(
|
145
|
+
"fit_outliers_models",
|
146
|
+
*[](Hash options) {
|
147
|
+
ModelOutputs model_outputs;
|
148
|
+
|
149
|
+
// data
|
150
|
+
size_t nrows = options.get<size_t, Symbol>("nrows");
|
151
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
152
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
153
|
+
size_t ncols_ord = options.get<size_t, Symbol>("ncols_ord");
|
154
|
+
|
155
|
+
double *restrict numeric_data = NULL;
|
156
|
+
if (ncols_numeric > 0) {
|
157
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
158
|
+
}
|
159
|
+
|
160
|
+
int *restrict categorical_data = NULL;
|
161
|
+
int *restrict ncat = NULL;
|
162
|
+
if (ncols_categ > 0) {
|
163
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
164
|
+
ncat = (int*) options.get<String, Symbol>("ncat").c_str();
|
165
|
+
}
|
166
|
+
|
167
|
+
int *restrict ordinal_data = NULL;
|
168
|
+
int *restrict ncat_ord = NULL;
|
169
|
+
if (ncols_ord > 0) {
|
170
|
+
ordinal_data = (int*) options.get<String, Symbol>("ordinal_data").c_str();
|
171
|
+
ncat_ord = (int*) options.get<String, Symbol>("ncat_ord").c_str();
|
172
|
+
}
|
173
|
+
|
174
|
+
// options
|
175
|
+
char *restrict cols_ignore = NULL;
|
176
|
+
int nthreads = options.get<int, Symbol>("nthreads");
|
177
|
+
bool categ_as_bin = options.get<bool, Symbol>("categ_as_bin");
|
178
|
+
bool ord_as_bin = options.get<bool, Symbol>("ord_as_bin");
|
179
|
+
bool cat_bruteforce_subset = options.get<bool, Symbol>("cat_bruteforce_subset");
|
180
|
+
bool categ_from_maj = options.get<bool, Symbol>("categ_from_maj");
|
181
|
+
bool take_mid = options.get<bool, Symbol>("take_mid");
|
182
|
+
size_t max_depth = options.get<size_t, Symbol>("max_depth");
|
183
|
+
double max_perc_outliers = options.get<double, Symbol>("pct_outliers");
|
184
|
+
size_t min_size_numeric = options.get<size_t, Symbol>("min_size_numeric");
|
185
|
+
size_t min_size_categ = options.get<size_t, Symbol>("min_size_categ");
|
186
|
+
double min_gain = options.get<double, Symbol>("min_gain");
|
187
|
+
bool gain_as_pct = options.get<bool, Symbol>("gain_as_pct");
|
188
|
+
bool follow_all = options.get<bool, Symbol>("follow_all");
|
189
|
+
double z_norm = options.get<double, Symbol>("z_norm");
|
190
|
+
double z_outlier = options.get<double, Symbol>("z_outlier");
|
191
|
+
|
192
|
+
fit_outliers_models(
|
193
|
+
model_outputs,
|
194
|
+
numeric_data,
|
195
|
+
ncols_numeric,
|
196
|
+
categorical_data,
|
197
|
+
ncols_categ,
|
198
|
+
ncat,
|
199
|
+
ordinal_data,
|
200
|
+
ncols_ord,
|
201
|
+
ncat_ord,
|
202
|
+
nrows,
|
203
|
+
cols_ignore,
|
204
|
+
nthreads,
|
205
|
+
categ_as_bin,
|
206
|
+
ord_as_bin,
|
207
|
+
cat_bruteforce_subset,
|
208
|
+
categ_from_maj,
|
209
|
+
take_mid,
|
210
|
+
max_depth,
|
211
|
+
max_perc_outliers,
|
212
|
+
min_size_numeric,
|
213
|
+
min_size_categ,
|
214
|
+
min_gain,
|
215
|
+
gain_as_pct,
|
216
|
+
follow_all,
|
217
|
+
z_norm,
|
218
|
+
z_outlier
|
219
|
+
);
|
220
|
+
return model_outputs;
|
221
|
+
})
|
222
|
+
.define_singleton_method(
|
223
|
+
"find_new_outliers",
|
224
|
+
*[](ModelOutputs& model_outputs, Hash options) {
|
225
|
+
// data
|
226
|
+
size_t nrows = options.get<size_t, Symbol>("nrows");
|
227
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
228
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
229
|
+
size_t ncols_ord = options.get<size_t, Symbol>("ncols_ord");
|
230
|
+
|
231
|
+
double *restrict numeric_data = NULL;
|
232
|
+
if (ncols_numeric > 0) {
|
233
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
234
|
+
}
|
235
|
+
|
236
|
+
int *restrict categorical_data = NULL;
|
237
|
+
if (ncols_categ > 0) {
|
238
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
239
|
+
}
|
240
|
+
|
241
|
+
int *restrict ordinal_data = NULL;
|
242
|
+
if (ncols_ord > 0) {
|
243
|
+
ordinal_data = (int*) options.get<String, Symbol>("ordinal_data").c_str();
|
244
|
+
}
|
245
|
+
|
246
|
+
// options
|
247
|
+
int nthreads = options.get<int, Symbol>("nthreads");
|
248
|
+
|
249
|
+
find_new_outliers(
|
250
|
+
numeric_data,
|
251
|
+
categorical_data,
|
252
|
+
ordinal_data,
|
253
|
+
nrows,
|
254
|
+
nthreads,
|
255
|
+
model_outputs
|
256
|
+
);
|
257
|
+
|
258
|
+
return model_outputs;
|
259
|
+
});
|
260
|
+
}
|
data/ext/outliertree/extconf.rb
ADDED
@@ -0,0 +1,21 @@
|
|
1
|
+
require "mkmf-rice"
|
2
|
+
|
3
|
+
# Compile the extension as C++11.
$CXXFLAGS += " -std=c++11"

# Non-nil when the compiler version banner mentions Apple clang.
using_apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i

# check omp first
# -fopenmp is added only when an OpenMP runtime library is found; for Apple
# clang it is preceded by -Xclang.
if have_library("omp") || have_library("gomp")
  $CXXFLAGS += using_apple_clang ? " -Xclang -fopenmp" : " -fopenmp"
end

ext_dir = File.expand_path(".", __dir__)
vendor_src = File.expand_path("../../vendor/outliertree/src", __dir__)

# Sources skipped when building the gem (presumably the R bindings of the
# vendored library -- confirm against upstream).
skipped_sources = %w(Rwrapper.cpp RcppExports.cpp)
candidate_sources = Dir["{#{ext_dir},#{vendor_src}}/*.{cc,cpp}"]
$srcs = candidate_sources.reject do |path|
  skipped_sources.include?(File.basename(path))
end

# Make the vendored headers and sources visible to the compiler and to make.
$INCFLAGS += " -I#{vendor_src}"
$VPATH << vendor_src

create_makefile("outliertree/ext")
|
data/lib/outliertree.rb
ADDED
@@ -0,0 +1,17 @@
|
|
1
|
+
# ext
|
2
|
+
require "outliertree/ext"
|
3
|
+
|
4
|
+
# stdlib
|
5
|
+
require "etc"
|
6
|
+
|
7
|
+
# modules
|
8
|
+
require "outliertree/dataset"
|
9
|
+
require "outliertree/model"
|
10
|
+
require "outliertree/result"
|
11
|
+
require "outliertree/version"
|
12
|
+
|
13
|
+
module OutlierTree
  class << self
    # Convenience constructor: OutlierTree.new(**options) builds a Model,
    # forwarding every keyword option unchanged.
    def new(**options)
      Model.new(**options)
    end
  end
end
|
data/lib/outliertree/dataset.rb
ADDED
@@ -0,0 +1,35 @@
|
|
1
|
+
module OutlierTree
  # Column-oriented view over the input data.
  #
  # Accepts either a Rover::DataFrame (when Rover is loaded) or a collection
  # of hashes, and exposes each column as a vector via #[]. Columns are split
  # into numeric and categorical ones.
  class Dataset
    attr_reader :numeric_columns, :categorical_columns

    # @param data [Rover::DataFrame, Array<Hash>] rows to wrap
    # @raise [ArgumentError] if data is neither a data frame nor a collection
    #   of hashes
    def initialize(data)
      @data = data

      if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
        @vectors = data.vectors
        # :object and :bool typed vectors are categorical; every other type is numeric
        @numeric_columns, @categorical_columns = data.keys.partition { |k| ![:object, :bool].include?(data[k].type) }
      else
        # Reject nil and other non-collections with a clear error instead of
        # letting them fail with a NoMethodError below
        unless [:all?, :flat_map, :each, :map].all? { |m| data.respond_to?(m) }
          raise ArgumentError, "Data must be an array of hashes or a Rover data frame"
        end
        raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }

        # Union of the keys of all rows, in first-seen order; a row lacking a
        # key contributes whatever Hash#[] returns for it (nil by default)
        keys = data.flat_map(&:keys).uniq
        @vectors = keys.each_with_object({}) do |k, vectors|
          vectors[k] = data.map { |d| d[k] }
        end

        # A column is numeric when every value is nil or a Numeric
        @numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
      end
    end

    # @param k [Object] column key
    # @return the vector stored for column k, or nil if there is none
    def [](k)
      @vectors[k]
    end

    # @return [Integer] size of the first column, or 0 when there are no columns
    def size
      @vectors.empty? ? 0 : @vectors.values.first.size
    end
  end
end
|