isotree 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +3 -0
- data/LICENSE.txt +25 -0
- data/README.md +111 -0
- data/ext/isotree/ext.cpp +178 -0
- data/ext/isotree/extconf.rb +21 -0
- data/lib/isotree.rb +9 -0
- data/lib/isotree/isolation_forest.rb +94 -0
- data/lib/isotree/version.rb +3 -0
- data/vendor/isotree/LICENSE +25 -0
- data/vendor/isotree/README.md +167 -0
- data/vendor/isotree/src/Makevars +4 -0
- data/vendor/isotree/src/RcppExports.cpp +267 -0
- data/vendor/isotree/src/Rwrapper.cpp +762 -0
- data/vendor/isotree/src/crit.cpp +912 -0
- data/vendor/isotree/src/dealloc.cpp +66 -0
- data/vendor/isotree/src/dist.cpp +749 -0
- data/vendor/isotree/src/extended.cpp +790 -0
- data/vendor/isotree/src/fit_model.cpp +1068 -0
- data/vendor/isotree/src/helpers_iforest.cpp +309 -0
- data/vendor/isotree/src/impute.cpp +1205 -0
- data/vendor/isotree/src/isoforest.cpp +771 -0
- data/vendor/isotree/src/isotree.hpp +929 -0
- data/vendor/isotree/src/merge_models.cpp +116 -0
- data/vendor/isotree/src/mult.cpp +607 -0
- data/vendor/isotree/src/predict.cpp +849 -0
- data/vendor/isotree/src/serialize.cpp +262 -0
- data/vendor/isotree/src/utils.cpp +1574 -0
- metadata +154 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA256:
|
3
|
+
metadata.gz: fa5516ad971c1fc1def4766fdf39ec74121a20f6c95d9b7bda705707324a5571
|
4
|
+
data.tar.gz: c7aed404ad88d2e0365f7698cbad0a2d198a85860b9863d6322b10e53852f8a3
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: 92a74815ea52c38c0a4d0f27cb78524413cc32f966732c520590937f9523c253344e04673be280ff04659c7ed4a8db96560fba54fc425221b8542244e275ed33
|
7
|
+
data.tar.gz: a5824999e81a4732742646e66cdbae55812340667122b262c799bf3a5b243e9185445855ef64c31c877941d6427cd4fdfb7d77b4f87c2160b489ebe90aa18af6
|
data/CHANGELOG.md
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,25 @@
|
|
1
|
+
BSD 2-Clause License
|
2
|
+
|
3
|
+
Copyright (c) 2020, Andrew Kane
|
4
|
+
All rights reserved.
|
5
|
+
|
6
|
+
Redistribution and use in source and binary forms, with or without
|
7
|
+
modification, are permitted provided that the following conditions are met:
|
8
|
+
|
9
|
+
1. Redistributions of source code must retain the above copyright notice, this
|
10
|
+
list of conditions and the following disclaimer.
|
11
|
+
|
12
|
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
13
|
+
this list of conditions and the following disclaimer in the documentation
|
14
|
+
and/or other materials provided with the distribution.
|
15
|
+
|
16
|
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
17
|
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
18
|
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
19
|
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
20
|
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
21
|
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
22
|
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
23
|
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
24
|
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
25
|
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
data/README.md
ADDED
@@ -0,0 +1,111 @@
|
|
1
|
+
# IsoTree
|
2
|
+
|
3
|
+
:evergreen_tree: [IsoTree](https://github.com/david-cortes/isotree) - outlier/anomaly detection using Isolation Forest - for Ruby
|
4
|
+
|
5
|
+
Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
|
6
|
+
|
7
|
+
## Installation
|
8
|
+
|
9
|
+
Add this line to your application’s Gemfile:
|
10
|
+
|
11
|
+
```ruby
|
12
|
+
gem 'isotree'
|
13
|
+
```
|
14
|
+
|
15
|
+
## Getting Started
|
16
|
+
|
17
|
+
Prep your data
|
18
|
+
|
19
|
+
```ruby
|
20
|
+
x = [[1, 2], [3, 4], [5, 6], [7, 8]]
|
21
|
+
```
|
22
|
+
|
23
|
+
Train a model
|
24
|
+
|
25
|
+
```ruby
|
26
|
+
model = IsoTree::IsolationForest.new
|
27
|
+
model.fit(x)
|
28
|
+
```
|
29
|
+
|
30
|
+
Get outlier scores
|
31
|
+
|
32
|
+
```ruby
|
33
|
+
model.predict(x)
|
34
|
+
```
|
35
|
+
|
36
|
+
Scores are between 0 and 1, with higher scores indicating outliers
|
37
|
+
|
38
|
+
## Parameters
|
39
|
+
|
40
|
+
Pass parameters - default values below
|
41
|
+
|
42
|
+
```ruby
|
43
|
+
IsoTree::IsolationForest.new(
|
44
|
+
sample_size: nil,
|
45
|
+
ntrees: 500,
|
46
|
+
ndim: 3,
|
47
|
+
ntry: 3,
|
48
|
+
prob_pick_avg_gain: 0,
|
49
|
+
prob_pick_pooled_gain: 0,
|
50
|
+
prob_split_avg_gain: 0,
|
51
|
+
prob_split_pooled_gain: 0,
|
52
|
+
min_gain: 0,
|
53
|
+
all_perm: false,
|
54
|
+
coef_by_prop: false,
|
55
|
+
sample_with_replacement: false,
|
56
|
+
penalize_range: true,
|
57
|
+
weigh_by_kurtosis: false,
|
58
|
+
min_imp_obs: 3,
|
59
|
+
random_seed: 1,
|
60
|
+
nthreads: -1
|
61
|
+
)
|
62
|
+
```
|
63
|
+
|
64
|
+
See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.IsolationForest)
|
65
|
+
|
66
|
+
## Data
|
67
|
+
|
68
|
+
Data can be an array of arrays
|
69
|
+
|
70
|
+
```ruby
|
71
|
+
[[1, 2, 3], [4, 5, 6]]
|
72
|
+
```
|
73
|
+
|
74
|
+
Or a Numo array
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
Numo::NArray.cast([[1, 2, 3], [4, 5, 6]])
|
78
|
+
```
|
79
|
+
|
80
|
+
## Performance
|
81
|
+
|
82
|
+
IsoTree uses OpenMP when possible for best performance. To enable OpenMP on Mac, run:
|
83
|
+
|
84
|
+
```sh
|
85
|
+
brew install libomp
|
86
|
+
```
|
87
|
+
|
88
|
+
Then reinstall the gem.
|
89
|
+
|
90
|
+
## History
|
91
|
+
|
92
|
+
View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
|
93
|
+
|
94
|
+
## Contributing
|
95
|
+
|
96
|
+
Everyone is encouraged to help improve this project. Here are a few ways you can help:
|
97
|
+
|
98
|
+
- [Report bugs](https://github.com/ankane/isotree/issues)
|
99
|
+
- Fix bugs and [submit pull requests](https://github.com/ankane/isotree/pulls)
|
100
|
+
- Write, clarify, or fix documentation
|
101
|
+
- Suggest or add new features
|
102
|
+
|
103
|
+
To get started with development:
|
104
|
+
|
105
|
+
```sh
|
106
|
+
git clone --recursive https://github.com/ankane/isotree.git
|
107
|
+
cd isotree
|
108
|
+
bundle install
|
109
|
+
bundle exec rake compile
|
110
|
+
bundle exec rake test
|
111
|
+
```
|
data/ext/isotree/ext.cpp
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
// isotree
|
2
|
+
#include <isotree.hpp>
|
3
|
+
|
4
|
+
// rice
|
5
|
+
#include <rice/Array.hpp>
|
6
|
+
#include <rice/Hash.hpp>
|
7
|
+
#include <rice/Module.hpp>
|
8
|
+
#include <rice/String.hpp>
|
9
|
+
#include <rice/Symbol.hpp>
|
10
|
+
|
11
|
+
using Rice::Array;
|
12
|
+
using Rice::Hash;
|
13
|
+
using Rice::Module;
|
14
|
+
using Rice::String;
|
15
|
+
using Rice::Symbol;
|
16
|
+
using Rice::define_class_under;
|
17
|
+
using Rice::define_module;
|
18
|
+
|
19
|
+
extern "C"
|
20
|
+
void Init_ext()
|
21
|
+
{
|
22
|
+
Module rb_mIsoTree = define_module("IsoTree");
|
23
|
+
|
24
|
+
Module rb_mExt = define_module_under(rb_mIsoTree, "Ext");
|
25
|
+
define_class_under<ExtIsoForest>(rb_mExt, "ExtIsoForest");
|
26
|
+
|
27
|
+
rb_mExt
|
28
|
+
.define_singleton_method(
|
29
|
+
"fit_iforest",
|
30
|
+
*[](Hash options) {
|
31
|
+
// model
|
32
|
+
ExtIsoForest iso;
|
33
|
+
|
34
|
+
// data
|
35
|
+
size_t nrows = options.get<size_t, Symbol>("nrows");
|
36
|
+
size_t ncols = options.get<size_t, Symbol>("ncols");
|
37
|
+
double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
38
|
+
size_t ncols_numeric = ncols;
|
39
|
+
int* categ_data = NULL;
|
40
|
+
size_t ncols_categ = 0;
|
41
|
+
int* ncat = NULL;
|
42
|
+
double* Xc = NULL;
|
43
|
+
sparse_ix* Xc_ind = NULL;
|
44
|
+
sparse_ix* Xc_indptr = NULL;
|
45
|
+
|
46
|
+
// options
|
47
|
+
CoefType coef_type = Normal;
|
48
|
+
double* sample_weights = NULL;
|
49
|
+
bool weight_as_sample = false;
|
50
|
+
size_t max_depth = 0;
|
51
|
+
bool limit_depth = true;
|
52
|
+
bool standardize_dist = false;
|
53
|
+
double* tmat = NULL;
|
54
|
+
double* output_depths = NULL;
|
55
|
+
bool standardize_depth = false;
|
56
|
+
double* col_weights = NULL;
|
57
|
+
MissingAction missing_action = Impute;
|
58
|
+
CategSplit cat_split_type = SubSet;
|
59
|
+
NewCategAction new_cat_action = Smallest;
|
60
|
+
Imputer *imputer = NULL;
|
61
|
+
UseDepthImp depth_imp = Higher;
|
62
|
+
WeighImpRows weigh_imp_rows = Inverse;
|
63
|
+
bool impute_at_fit = false;
|
64
|
+
|
65
|
+
// Rice has limit of 14 arguments, so use hash for options
|
66
|
+
size_t sample_size = options.get<size_t, Symbol>("sample_size");
|
67
|
+
size_t ndim = options.get<size_t, Symbol>("ndim");
|
68
|
+
size_t ntrees = options.get<size_t, Symbol>("ntrees");
|
69
|
+
size_t ntry = options.get<size_t, Symbol>("ntry");
|
70
|
+
double prob_pick_by_gain_avg = options.get<double, Symbol>("prob_pick_avg_gain");
|
71
|
+
double prob_split_by_gain_avg = options.get<double, Symbol>("prob_split_avg_gain");
|
72
|
+
double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
|
73
|
+
double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
|
74
|
+
double min_gain = options.get<double, Symbol>("min_gain");
|
75
|
+
bool all_perm = options.get<bool, Symbol>("all_perm");
|
76
|
+
bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
|
77
|
+
bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
|
78
|
+
bool penalize_range = options.get<bool, Symbol>("penalize_range");
|
79
|
+
bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
|
80
|
+
size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
|
81
|
+
uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
|
82
|
+
int nthreads = options.get<int, Symbol>("nthreads");
|
83
|
+
|
84
|
+
fit_iforest(
|
85
|
+
NULL,
|
86
|
+
&iso,
|
87
|
+
numeric_data,
|
88
|
+
ncols_numeric,
|
89
|
+
categ_data,
|
90
|
+
ncols_categ,
|
91
|
+
ncat,
|
92
|
+
Xc,
|
93
|
+
Xc_ind,
|
94
|
+
Xc_indptr,
|
95
|
+
ndim,
|
96
|
+
ntry,
|
97
|
+
coef_type,
|
98
|
+
coef_by_prop,
|
99
|
+
sample_weights,
|
100
|
+
with_replacement,
|
101
|
+
weight_as_sample,
|
102
|
+
nrows,
|
103
|
+
sample_size,
|
104
|
+
ntrees,
|
105
|
+
max_depth,
|
106
|
+
limit_depth,
|
107
|
+
penalize_range,
|
108
|
+
standardize_dist,
|
109
|
+
tmat,
|
110
|
+
output_depths,
|
111
|
+
standardize_depth,
|
112
|
+
col_weights,
|
113
|
+
weigh_by_kurt,
|
114
|
+
prob_pick_by_gain_avg,
|
115
|
+
prob_split_by_gain_avg,
|
116
|
+
prob_pick_by_gain_pl,
|
117
|
+
prob_split_by_gain_pl,
|
118
|
+
min_gain,
|
119
|
+
missing_action,
|
120
|
+
cat_split_type,
|
121
|
+
new_cat_action,
|
122
|
+
all_perm,
|
123
|
+
imputer,
|
124
|
+
min_imp_obs,
|
125
|
+
depth_imp,
|
126
|
+
weigh_imp_rows,
|
127
|
+
impute_at_fit,
|
128
|
+
random_seed,
|
129
|
+
nthreads
|
130
|
+
);
|
131
|
+
|
132
|
+
return iso;
|
133
|
+
})
|
134
|
+
.define_singleton_method(
|
135
|
+
"predict_iforest",
|
136
|
+
*[](ExtIsoForest& iso, Hash options) {
|
137
|
+
// data
|
138
|
+
size_t nrows = options.get<size_t, Symbol>("nrows");
|
139
|
+
double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
140
|
+
int* categ_data = NULL;
|
141
|
+
double* Xc = NULL;
|
142
|
+
sparse_ix* Xc_ind = NULL;
|
143
|
+
sparse_ix* Xc_indptr = NULL;
|
144
|
+
double* Xr = NULL;
|
145
|
+
sparse_ix* Xr_ind = NULL;
|
146
|
+
sparse_ix* Xr_indptr = NULL;
|
147
|
+
|
148
|
+
// options
|
149
|
+
int nthreads = options.get<int, Symbol>("nthreads");
|
150
|
+
bool standardize = true;
|
151
|
+
std::vector<double> outlier_scores(nrows);
|
152
|
+
sparse_ix* tree_num = NULL;
|
153
|
+
|
154
|
+
predict_iforest(
|
155
|
+
numeric_data,
|
156
|
+
categ_data,
|
157
|
+
Xc,
|
158
|
+
Xc_ind,
|
159
|
+
Xc_indptr,
|
160
|
+
Xr,
|
161
|
+
Xr_ind,
|
162
|
+
Xr_indptr,
|
163
|
+
nrows,
|
164
|
+
nthreads,
|
165
|
+
standardize,
|
166
|
+
NULL,
|
167
|
+
&iso,
|
168
|
+
outlier_scores.data(),
|
169
|
+
tree_num
|
170
|
+
);
|
171
|
+
|
172
|
+
Array ret;
|
173
|
+
for (size_t i = 0; i < outlier_scores.size(); i++) {
|
174
|
+
ret.push(outlier_scores[i]);
|
175
|
+
}
|
176
|
+
return ret;
|
177
|
+
});
|
178
|
+
}
|
@@ -0,0 +1,21 @@
|
|
1
|
+
require "mkmf-rice"
|
2
|
+
|
3
|
+
# Compiler flags required by the vendored isotree sources.
$CXXFLAGS += " -std=c++11 -D_USE_MERSENNE_TWISTER -D_ENABLE_CEREAL"

# Truthy when the Ruby build reports an Apple clang toolchain.
apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i

# check omp first, then fall back to gomp
openmp_found = have_library("omp") || have_library("gomp")
if openmp_found
  # Apple clang needs -Xclang in front of -fopenmp
  $CXXFLAGS += apple_clang ? " -Xclang -fopenmp" : " -fopenmp"
end

ext_dir = File.expand_path(".", __dir__)
vendor_src = File.expand_path("../../vendor/isotree/src", __dir__)

# R-specific wrappers shipped with the vendored sources are not compiled.
skipped = %w(Rwrapper.cpp RcppExports.cpp)
candidates = Dir["{#{ext_dir},#{vendor_src}}/*.{cc,cpp}"]
$srcs = candidates.reject { |path| skipped.include?(File.basename(path)) }
$INCFLAGS << " -I#{vendor_src}"
$VPATH << vendor_src

create_makefile("isotree/ext")
|
data/lib/isotree.rb
ADDED
@@ -0,0 +1,94 @@
|
|
1
|
+
module IsoTree
  # Ruby front end for the native isolation forest bindings exposed by +Ext+.
  #
  # Typical use: build an instance with the desired parameters, call #fit with
  # a 2-dimensional data set, then call #predict to obtain one score per row.
  class IsolationForest
    # Stores the training parameters; they are forwarded to the native
    # extension on #fit. When +sample_size+ is nil, #fit substitutes the
    # number of rows. A negative +nthreads+ is replaced by +Etc.nprocessors+.
    def initialize(
      sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
      prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
      prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
      min_gain: 0, all_perm: false, coef_by_prop: false,
      sample_with_replacement: false, penalize_range: true,
      weigh_by_kurtosis: false, min_imp_obs: 3, random_seed: 1, nthreads: -1
    )

      @sample_size = sample_size
      @ntrees = ntrees
      @ndim = ndim
      @ntry = ntry
      @prob_pick_avg_gain = prob_pick_avg_gain
      @prob_pick_pooled_gain = prob_pick_pooled_gain
      @prob_split_avg_gain = prob_split_avg_gain
      @prob_split_pooled_gain = prob_split_pooled_gain
      @min_gain = min_gain
      @all_perm = all_perm
      @coef_by_prop = coef_by_prop
      @sample_with_replacement = sample_with_replacement
      @penalize_range = penalize_range
      @weigh_by_kurtosis = weigh_by_kurtosis
      @min_imp_obs = min_imp_obs
      @random_seed = random_seed

      # etc module returns virtual cores
      nthreads = Etc.nprocessors if nthreads < 0
      @nthreads = nthreads
    end

    # Trains the model on +x+ (an array of rows or a 2-D Numo array).
    #
    # Remembers the column count so #predict can reject mismatched input.
    # Returns whatever +Ext.fit_iforest+ returns.
    #
    # Raises ArgumentError when +x+ is empty, ragged or not 2-dimensional.
    def fit(x)
      options = data_options(x).merge(fit_options)
      options[:sample_size] ||= options[:nrows]
      @ncols = options[:ncols]
      @ext_iso_forest = Ext.fit_iforest(options)
    end

    # Scores +x+ with the fitted model and returns the result of
    # +Ext.predict_iforest+.
    #
    # Raises RuntimeError ("Not fit") when called before #fit, and
    # ArgumentError when +x+ is malformed or its column count differs from
    # the training data.
    def predict(x)
      raise "Not fit" unless @ext_iso_forest
      options = data_options(x).merge(nthreads: @nthreads)
      if options[:ncols] != @ncols
        raise ArgumentError, "Input must have #{@ncols} columns for this model"
      end
      Ext.predict_iforest(@ext_iso_forest, options)
    end

    private

    # TODO support categorical data
    #
    # Converts +x+ into the hash handed to the native extension:
    # +nrows+, +ncols+ and +numeric_data+ (the values packed as native doubles,
    # row after row).
    def data_options(x)
      if defined?(Numo::NArray) && x.is_a?(Numo::NArray)
        raise ArgumentError, "Input must have 2 dimensions" if x.ndim != 2
        x = x.cast_to(Numo::DFloat)
        numeric_data = x.to_binary
        nrows, ncols = x.shape
      else
        x = x.to_a
        # Every row must itself be array-like. Without this check a flat
        # array of Integers slips through: Integer#size is the byte width of
        # the number, so all "rows" agree on a bogus column count and the
        # native code is told to read more doubles than the buffer holds.
        unless x.all? { |r| r.respond_to?(:to_ary) }
          raise ArgumentError, "Input must have 2 dimensions"
        end
        nrows = x.size
        ncols = x.first ? x.first.size : 0
        if x.any? { |r| r.size != ncols }
          raise ArgumentError, "All rows must have the same number of columns"
        end
        numeric_data = x.flatten(1).pack("d*")
      end
      raise ArgumentError, "No data" if nrows == 0

      {
        nrows: nrows,
        ncols: ncols,
        numeric_data: numeric_data
      }
    end

    # Collects the training parameters stored by #initialize into a hash
    # keyed by parameter name.
    def fit_options
      keys = %i[
        sample_size ntrees ndim ntry
        prob_pick_avg_gain prob_pick_pooled_gain
        prob_split_avg_gain prob_split_pooled_gain
        min_gain all_perm coef_by_prop
        sample_with_replacement penalize_range
        weigh_by_kurtosis min_imp_obs
        random_seed nthreads
      ]
      keys.each_with_object({}) do |k, options|
        options[k] = instance_variable_get("@#{k}")
      end
    end
  end
end
|