isotree 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +25 -0
- data/README.md +111 -0
- data/ext/isotree/ext.cpp +178 -0
- data/ext/isotree/extconf.rb +21 -0
- data/lib/isotree.rb +9 -0
- data/lib/isotree/isolation_forest.rb +94 -0
- data/lib/isotree/version.rb +3 -0
- data/vendor/isotree/LICENSE +25 -0
- data/vendor/isotree/README.md +167 -0
- data/vendor/isotree/src/Makevars +4 -0
- data/vendor/isotree/src/RcppExports.cpp +267 -0
- data/vendor/isotree/src/Rwrapper.cpp +762 -0
- data/vendor/isotree/src/crit.cpp +912 -0
- data/vendor/isotree/src/dealloc.cpp +66 -0
- data/vendor/isotree/src/dist.cpp +749 -0
- data/vendor/isotree/src/extended.cpp +790 -0
- data/vendor/isotree/src/fit_model.cpp +1068 -0
- data/vendor/isotree/src/helpers_iforest.cpp +309 -0
- data/vendor/isotree/src/impute.cpp +1205 -0
- data/vendor/isotree/src/isoforest.cpp +771 -0
- data/vendor/isotree/src/isotree.hpp +929 -0
- data/vendor/isotree/src/merge_models.cpp +116 -0
- data/vendor/isotree/src/mult.cpp +607 -0
- data/vendor/isotree/src/predict.cpp +849 -0
- data/vendor/isotree/src/serialize.cpp +262 -0
- data/vendor/isotree/src/utils.cpp +1574 -0
- metadata +154 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: fa5516ad971c1fc1def4766fdf39ec74121a20f6c95d9b7bda705707324a5571
|
4
|
+
data.tar.gz: c7aed404ad88d2e0365f7698cbad0a2d198a85860b9863d6322b10e53852f8a3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 92a74815ea52c38c0a4d0f27cb78524413cc32f966732c520590937f9523c253344e04673be280ff04659c7ed4a8db96560fba54fc425221b8542244e275ed33
|
7
|
+
data.tar.gz: a5824999e81a4732742646e66cdbae55812340667122b262c799bf3a5b243e9185445855ef64c31c877941d6427cd4fdfb7d77b4f87c2160b489ebe90aa18af6
|
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
BSD 2-Clause License
|
2
|
+
|
3
|
+
Copyright (c) 2020, Andrew Kane
|
4
|
+
All rights reserved.
|
5
|
+
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
8
|
+
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
10
|
+
list of conditions and the following disclaimer.
|
11
|
+
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
14
|
+
and/or other materials provided with the distribution.
|
15
|
+
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
17
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
18
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
20
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
21
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
22
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
23
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
24
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
# IsoTree
|
2
|
+
|
3
|
+
:evergreen_tree: [IsoTree](https://github.com/david-cortes/isotree) - outlier/anomaly detection using Isolation Forest - for Ruby
|
4
|
+
|
5
|
+
Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application’s Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'isotree'
|
13
|
+
```
|
14
|
+
|
15
|
+
## Getting Started
|
16
|
+
|
17
|
+
Prep your data
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
x = [[1, 2], [3, 4], [5, 6], [7, 8]]
|
21
|
+
```
|
22
|
+
|
23
|
+
Train a model
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
model = IsoTree::IsolationForest.new
|
27
|
+
model.fit(x)
|
28
|
+
```
|
29
|
+
|
30
|
+
Get outlier scores
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
model.predict(x)
|
34
|
+
```
|
35
|
+
|
36
|
+
Scores are between 0 and 1, with higher scores indicating outliers
|
37
|
+
|
38
|
+
## Parameters
|
39
|
+
|
40
|
+
Pass parameters - default values below
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
IsoTree::IsolationForest.new(
|
44
|
+
sample_size: nil,
|
45
|
+
ntrees: 500,
|
46
|
+
ndim: 3,
|
47
|
+
ntry: 3,
|
48
|
+
prob_pick_avg_gain: 0,
|
49
|
+
prob_pick_pooled_gain: 0,
|
50
|
+
prob_split_avg_gain: 0,
|
51
|
+
prob_split_pooled_gain: 0,
|
52
|
+
min_gain: 0,
|
53
|
+
all_perm: false,
|
54
|
+
coef_by_prop: false,
|
55
|
+
sample_with_replacement: false,
|
56
|
+
penalize_range: true,
|
57
|
+
weigh_by_kurtosis: false,
|
58
|
+
min_imp_obs: 3,
|
59
|
+
random_seed: 1,
|
60
|
+
nthreads: -1
|
61
|
+
)
|
62
|
+
```
|
63
|
+
|
64
|
+
See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.IsolationForest)
|
65
|
+
|
66
|
+
## Data
|
67
|
+
|
68
|
+
Data can be an array of arrays
|
69
|
+
|
70
|
+
```ruby
|
71
|
+
[[1, 2, 3], [4, 5, 6]]
|
72
|
+
```
|
73
|
+
|
74
|
+
Or a Numo array
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
Numo::NArray.cast([[1, 2, 3], [4, 5, 6]])
|
78
|
+
```
|
79
|
+
|
80
|
+
## Performance
|
81
|
+
|
82
|
+
IsoTree uses OpenMP when possible for best performance. To enable OpenMP on Mac, run:
|
83
|
+
|
84
|
+
```sh
|
85
|
+
brew install libomp
|
86
|
+
```
|
87
|
+
|
88
|
+
Then reinstall the gem.
|
89
|
+
|
90
|
+
## History
|
91
|
+
|
92
|
+
View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
|
93
|
+
|
94
|
+
## Contributing
|
95
|
+
|
96
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
97
|
+
|
98
|
+
- [Report bugs](https://github.com/ankane/isotree/issues)
|
99
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/isotree/pulls)
|
100
|
+
- Write, clarify, or fix documentation
|
101
|
+
- Suggest or add new features
|
102
|
+
|
103
|
+
To get started with development:
|
104
|
+
|
105
|
+
```sh
|
106
|
+
git clone --recursive https://github.com/ankane/isotree.git
|
107
|
+
cd isotree
|
108
|
+
bundle install
|
109
|
+
bundle exec rake compile
|
110
|
+
bundle exec rake test
|
111
|
+
```
|
data/ext/isotree/ext.cpp
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
// stdlib
#include <stdexcept>
#include <vector>

// isotree
#include <isotree.hpp>

// rice
#include <rice/Array.hpp>
#include <rice/Hash.hpp>
#include <rice/Module.hpp>
#include <rice/String.hpp>
#include <rice/Symbol.hpp>
|
10
|
+
|
11
|
+
using Rice::Array;
|
12
|
+
using Rice::Hash;
|
13
|
+
using Rice::Module;
|
14
|
+
using Rice::String;
|
15
|
+
using Rice::Symbol;
|
16
|
+
using Rice::define_class_under;
|
17
|
+
using Rice::define_module;
|
18
|
+
|
19
|
+
extern "C"
|
20
|
+
void Init_ext()
|
21
|
+
{
|
22
|
+
Module rb_mIsoTree = define_module("IsoTree");
|
23
|
+
|
24
|
+
Module rb_mExt = define_module_under(rb_mIsoTree, "Ext");
|
25
|
+
define_class_under<ExtIsoForest>(rb_mExt, "ExtIsoForest");
|
26
|
+
|
27
|
+
rb_mExt
|
28
|
+
.define_singleton_method(
|
29
|
+
"fit_iforest",
|
30
|
+
*[](Hash options) {
|
31
|
+
// model
|
32
|
+
ExtIsoForest iso;
|
33
|
+
|
34
|
+
// data
|
35
|
+
size_t nrows = options.get<size_t, Symbol>("nrows");
|
36
|
+
size_t ncols = options.get<size_t, Symbol>("ncols");
|
37
|
+
double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
38
|
+
size_t ncols_numeric = ncols;
|
39
|
+
int* categ_data = NULL;
|
40
|
+
size_t ncols_categ = 0;
|
41
|
+
int* ncat = NULL;
|
42
|
+
double* Xc = NULL;
|
43
|
+
sparse_ix* Xc_ind = NULL;
|
44
|
+
sparse_ix* Xc_indptr = NULL;
|
45
|
+
|
46
|
+
// options
|
47
|
+
CoefType coef_type = Normal;
|
48
|
+
double* sample_weights = NULL;
|
49
|
+
bool weight_as_sample = false;
|
50
|
+
size_t max_depth = 0;
|
51
|
+
bool limit_depth = true;
|
52
|
+
bool standardize_dist = false;
|
53
|
+
double* tmat = NULL;
|
54
|
+
double* output_depths = NULL;
|
55
|
+
bool standardize_depth = false;
|
56
|
+
double* col_weights = NULL;
|
57
|
+
MissingAction missing_action = Impute;
|
58
|
+
CategSplit cat_split_type = SubSet;
|
59
|
+
NewCategAction new_cat_action = Smallest;
|
60
|
+
Imputer *imputer = NULL;
|
61
|
+
UseDepthImp depth_imp = Higher;
|
62
|
+
WeighImpRows weigh_imp_rows = Inverse;
|
63
|
+
bool impute_at_fit = false;
|
64
|
+
|
65
|
+
// Rice has limit of 14 arguments, so use hash for options
|
66
|
+
size_t sample_size = options.get<size_t, Symbol>("sample_size");
|
67
|
+
size_t ndim = options.get<size_t, Symbol>("ndim");
|
68
|
+
size_t ntrees = options.get<size_t, Symbol>("ntrees");
|
69
|
+
size_t ntry = options.get<size_t, Symbol>("ntry");
|
70
|
+
double prob_pick_by_gain_avg = options.get<double, Symbol>("prob_pick_avg_gain");
|
71
|
+
double prob_split_by_gain_avg = options.get<double, Symbol>("prob_split_avg_gain");
|
72
|
+
double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
|
73
|
+
double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
|
74
|
+
double min_gain = options.get<double, Symbol>("min_gain");
|
75
|
+
bool all_perm = options.get<bool, Symbol>("all_perm");
|
76
|
+
bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
|
77
|
+
bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
|
78
|
+
bool penalize_range = options.get<bool, Symbol>("penalize_range");
|
79
|
+
bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
|
80
|
+
size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
|
81
|
+
uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
|
82
|
+
int nthreads = options.get<int, Symbol>("nthreads");
|
83
|
+
|
84
|
+
fit_iforest(
|
85
|
+
NULL,
|
86
|
+
&iso,
|
87
|
+
numeric_data,
|
88
|
+
ncols_numeric,
|
89
|
+
categ_data,
|
90
|
+
ncols_categ,
|
91
|
+
ncat,
|
92
|
+
Xc,
|
93
|
+
Xc_ind,
|
94
|
+
Xc_indptr,
|
95
|
+
ndim,
|
96
|
+
ntry,
|
97
|
+
coef_type,
|
98
|
+
coef_by_prop,
|
99
|
+
sample_weights,
|
100
|
+
with_replacement,
|
101
|
+
weight_as_sample,
|
102
|
+
nrows,
|
103
|
+
sample_size,
|
104
|
+
ntrees,
|
105
|
+
max_depth,
|
106
|
+
limit_depth,
|
107
|
+
penalize_range,
|
108
|
+
standardize_dist,
|
109
|
+
tmat,
|
110
|
+
output_depths,
|
111
|
+
standardize_depth,
|
112
|
+
col_weights,
|
113
|
+
weigh_by_kurt,
|
114
|
+
prob_pick_by_gain_avg,
|
115
|
+
prob_split_by_gain_avg,
|
116
|
+
prob_pick_by_gain_pl,
|
117
|
+
prob_split_by_gain_pl,
|
118
|
+
min_gain,
|
119
|
+
missing_action,
|
120
|
+
cat_split_type,
|
121
|
+
new_cat_action,
|
122
|
+
all_perm,
|
123
|
+
imputer,
|
124
|
+
min_imp_obs,
|
125
|
+
depth_imp,
|
126
|
+
weigh_imp_rows,
|
127
|
+
impute_at_fit,
|
128
|
+
random_seed,
|
129
|
+
nthreads
|
130
|
+
);
|
131
|
+
|
132
|
+
return iso;
|
133
|
+
})
|
134
|
+
.define_singleton_method(
|
135
|
+
"predict_iforest",
|
136
|
+
*[](ExtIsoForest& iso, Hash options) {
|
137
|
+
// data
|
138
|
+
size_t nrows = options.get<size_t, Symbol>("nrows");
|
139
|
+
double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
140
|
+
int* categ_data = NULL;
|
141
|
+
double* Xc = NULL;
|
142
|
+
sparse_ix* Xc_ind = NULL;
|
143
|
+
sparse_ix* Xc_indptr = NULL;
|
144
|
+
double* Xr = NULL;
|
145
|
+
sparse_ix* Xr_ind = NULL;
|
146
|
+
sparse_ix* Xr_indptr = NULL;
|
147
|
+
|
148
|
+
// options
|
149
|
+
int nthreads = options.get<int, Symbol>("nthreads");
|
150
|
+
bool standardize = true;
|
151
|
+
std::vector<double> outlier_scores(nrows);
|
152
|
+
sparse_ix* tree_num = NULL;
|
153
|
+
|
154
|
+
predict_iforest(
|
155
|
+
numeric_data,
|
156
|
+
categ_data,
|
157
|
+
Xc,
|
158
|
+
Xc_ind,
|
159
|
+
Xc_indptr,
|
160
|
+
Xr,
|
161
|
+
Xr_ind,
|
162
|
+
Xr_indptr,
|
163
|
+
nrows,
|
164
|
+
nthreads,
|
165
|
+
standardize,
|
166
|
+
NULL,
|
167
|
+
&iso,
|
168
|
+
outlier_scores.data(),
|
169
|
+
tree_num
|
170
|
+
);
|
171
|
+
|
172
|
+
Array ret;
|
173
|
+
for (size_t i = 0; i < outlier_scores.size(); i++) {
|
174
|
+
ret.push(outlier_scores[i]);
|
175
|
+
}
|
176
|
+
return ret;
|
177
|
+
});
|
178
|
+
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "mkmf-rice"

# C++11 plus the preprocessor defines passed to the vendored isotree sources
# (presumably selecting its RNG and enabling cereal serialization - confirm
# against vendor/isotree/src).
$CXXFLAGS += " -std=c++11 -D_USE_MERSENNE_TWISTER -D_ENABLE_CEREAL"

# Apple's clang needs -Xclang in front of -fopenmp
using_apple_clang = /apple clang/i.match?(RbConfig::CONFIG["CC_VERSION_MESSAGE"])

# check omp first
if have_library("omp") || have_library("gomp")
  $CXXFLAGS += using_apple_clang ? " -Xclang -fopenmp" : " -fopenmp"
end

ext_dir = File.expand_path(".", __dir__)
vendor_src = File.expand_path("../../vendor/isotree/src", __dir__)

# the R bindings shipped with the vendored sources are not compiled
r_only_sources = %w(Rwrapper.cpp RcppExports.cpp)
candidates = Dir["{#{ext_dir},#{vendor_src}}/*.{cc,cpp}"]
$srcs = candidates.reject { |path| r_only_sources.include?(File.basename(path)) }

# let the compiler and make find the vendored headers and sources
$INCFLAGS << " -I#{vendor_src}"
$VPATH << vendor_src

create_makefile("isotree/ext")
|
data/lib/isotree.rb
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
module IsoTree
  # Outlier detection model that delegates training and scoring to the
  # native +Ext+ module.
  class IsolationForest
    # Stores the hyperparameters; no training happens until #fit is called.
    # A negative +nthreads+ is replaced with the value of Etc.nprocessors.
    def initialize(
      sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
      prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
      prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
      min_gain: 0, all_perm: false, coef_by_prop: false,
      sample_with_replacement: false, penalize_range: true,
      weigh_by_kurtosis: false, min_imp_obs: 3, random_seed: 1, nthreads: -1
    )

      @sample_size = sample_size
      @ntrees = ntrees
      @ndim = ndim
      @ntry = ntry
      @prob_pick_avg_gain = prob_pick_avg_gain
      @prob_pick_pooled_gain = prob_pick_pooled_gain
      @prob_split_avg_gain = prob_split_avg_gain
      @prob_split_pooled_gain = prob_split_pooled_gain
      @min_gain = min_gain
      @all_perm = all_perm
      @coef_by_prop = coef_by_prop
      @sample_with_replacement = sample_with_replacement
      @penalize_range = penalize_range
      @weigh_by_kurtosis = weigh_by_kurtosis
      @min_imp_obs = min_imp_obs
      @random_seed = random_seed

      # etc module returns virtual cores
      nthreads = Etc.nprocessors if nthreads < 0
      @nthreads = nthreads
    end

    # Trains the model on +x+ (array of arrays or 2-D Numo array).
    # When no sample size was configured, every row is used.
    # Remembers the column count so #predict can reject mismatched input.
    # Raises ArgumentError for empty, ragged or non-tabular input.
    def fit(x)
      options = data_options(x).merge(fit_options)
      options[:sample_size] ||= options[:nrows]
      @ncols = options[:ncols]
      @ext_iso_forest = Ext.fit_iforest(options)
    end

    # Returns what Ext.predict_iforest produces for +x+ (one score per row).
    # Raises RuntimeError ("Not fit") before #fit has been called and
    # ArgumentError when +x+ has a different number of columns than the
    # training data.
    def predict(x)
      raise "Not fit" unless @ext_iso_forest
      options = data_options(x).merge(nthreads: @nthreads)
      if options[:ncols] != @ncols
        raise ArgumentError, "Input must have #{@ncols} columns for this model"
      end
      Ext.predict_iforest(@ext_iso_forest, options)
    end

    private

    # Converts +x+ into the hash handed to the extension:
    # +nrows+, +ncols+ and +numeric_data+ (row-major native doubles packed
    # into a binary String).
    # TODO support categorical data
    def data_options(x)
      if defined?(Numo::NArray) && x.is_a?(Numo::NArray)
        raise ArgumentError, "Input must have 2 dimensions" if x.ndim != 2
        x = x.cast_to(Numo::DFloat)
        numeric_data = x.to_binary
        nrows, ncols = x.shape
      else
        # Every row must be array-like. Measuring rows with #size alone is not
        # enough: Integer#size and String#size also exist, so a flat input such
        # as [1, 2, 3] would report 8 columns while packing only 3 values.
        rows = x.to_a.map do |row|
          Array.try_convert(row) || raise(ArgumentError, "Input must be an array of arrays")
        end
        nrows = rows.size
        ncols = rows.first ? rows.first.size : 0
        if rows.any? { |r| r.size != ncols }
          raise ArgumentError, "All rows must have the same number of columns"
        end
        numeric_data = rows.flatten(1).pack("d*")
      end
      # rows without columns carry no data either
      raise ArgumentError, "No data" if nrows == 0 || ncols == 0

      {
        nrows: nrows,
        ncols: ncols,
        numeric_data: numeric_data
      }
    end

    # Collects the stored hyperparameters into a hash keyed by option name.
    def fit_options
      keys = %i(
        sample_size ntrees ndim ntry
        prob_pick_avg_gain prob_pick_pooled_gain
        prob_split_avg_gain prob_split_pooled_gain
        min_gain all_perm coef_by_prop
        sample_with_replacement penalize_range
        weigh_by_kurtosis min_imp_obs
        random_seed nthreads
      )
      keys.each_with_object({}) do |k, options|
        options[k] = instance_variable_get("@#{k}")
      end
    end
  end
end
|