isotree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: fa5516ad971c1fc1def4766fdf39ec74121a20f6c95d9b7bda705707324a5571
4
+ data.tar.gz: c7aed404ad88d2e0365f7698cbad0a2d198a85860b9863d6322b10e53852f8a3
5
+ SHA512:
6
+ metadata.gz: 92a74815ea52c38c0a4d0f27cb78524413cc32f966732c520590937f9523c253344e04673be280ff04659c7ed4a8db96560fba54fc425221b8542244e275ed33
7
+ data.tar.gz: a5824999e81a4732742646e66cdbae55812340667122b262c799bf3a5b243e9185445855ef64c31c877941d6427cd4fdfb7d77b4f87c2160b489ebe90aa18af6
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (unreleased)
2
+
3
+ - First release
@@ -0,0 +1,25 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2020, Andrew Kane
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,111 @@
1
+ # IsoTree
2
+
3
+ :evergreen_tree: [IsoTree](https://github.com/david-cortes/isotree) - outlier/anomaly detection using Isolation Forest - for Ruby
4
+
5
+ Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application’s Gemfile:
10
+
11
+ ```ruby
12
+ gem 'isotree'
13
+ ```
14
+
15
+ ## Getting Started
16
+
17
+ Prep your data
18
+
19
+ ```ruby
20
+ x = [[1, 2], [3, 4], [5, 6], [7, 8]]
21
+ ```
22
+
23
+ Train a model
24
+
25
+ ```ruby
26
+ model = IsoTree::IsolationForest.new
27
+ model.fit(x)
28
+ ```
29
+
30
+ Get outlier scores
31
+
32
+ ```ruby
33
+ model.predict(x)
34
+ ```
35
+
36
+ Scores are between 0 and 1, with higher scores indicating outliers
37
+
38
+ ## Parameters
39
+
40
+ Pass parameters - default values below
41
+
42
+ ```ruby
43
+ IsoTree::IsolationForest.new(
44
+ sample_size: nil,
45
+ ntrees: 500,
46
+ ndim: 3,
47
+ ntry: 3,
48
+ prob_pick_avg_gain: 0,
49
+ prob_pick_pooled_gain: 0,
50
+ prob_split_avg_gain: 0,
51
+ prob_split_pooled_gain: 0,
52
+ min_gain: 0,
53
+ all_perm: false,
54
+ coef_by_prop: false,
55
+ sample_with_replacement: false,
56
+ penalize_range: true,
57
+ weigh_by_kurtosis: false,
58
+ min_imp_obs: 3,
59
+ random_seed: 1,
60
+ nthreads: -1
61
+ )
62
+ ```
63
+
64
+ See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.IsolationForest)
65
+
66
+ ## Data
67
+
68
+ Data can be an array of arrays
69
+
70
+ ```ruby
71
+ [[1, 2, 3], [4, 5, 6]]
72
+ ```
73
+
74
+ Or a Numo array
75
+
76
+ ```ruby
77
+ Numo::NArray.cast([[1, 2, 3], [4, 5, 6]])
78
+ ```
79
+
80
+ ## Performance
81
+
82
+ IsoTree uses OpenMP when possible for best performance. To enable OpenMP on Mac, run:
83
+
84
+ ```sh
85
+ brew install libomp
86
+ ```
87
+
88
+ Then reinstall the gem.
89
+
90
+ ## History
91
+
92
+ View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
93
+
94
+ ## Contributing
95
+
96
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
97
+
98
+ - [Report bugs](https://github.com/ankane/isotree/issues)
99
+ - Fix bugs and [submit pull requests](https://github.com/ankane/isotree/pulls)
100
+ - Write, clarify, or fix documentation
101
+ - Suggest or add new features
102
+
103
+ To get started with development:
104
+
105
+ ```sh
106
+ git clone --recursive https://github.com/ankane/isotree.git
107
+ cd isotree
108
+ bundle install
109
+ bundle exec rake compile
110
+ bundle exec rake test
111
+ ```
@@ -0,0 +1,178 @@
1
+ // isotree
2
+ #include <isotree.hpp>
3
+
4
+ // rice
5
+ #include <rice/Array.hpp>
6
+ #include <rice/Hash.hpp>
7
+ #include <rice/Module.hpp>
8
+ #include <rice/String.hpp>
9
+ #include <rice/Symbol.hpp>
10
+
11
+ using Rice::Array;
12
+ using Rice::Hash;
13
+ using Rice::Module;
14
+ using Rice::String;
15
+ using Rice::Symbol;
16
+ using Rice::define_class_under;
17
+ using Rice::define_module;
18
+
19
+ extern "C"
20
+ void Init_ext()
21
+ {
22
+ Module rb_mIsoTree = define_module("IsoTree");
23
+
24
+ Module rb_mExt = define_module_under(rb_mIsoTree, "Ext");
25
+ define_class_under<ExtIsoForest>(rb_mExt, "ExtIsoForest");
26
+
27
+ rb_mExt
28
+ .define_singleton_method(
29
+ "fit_iforest",
30
+ *[](Hash options) {
31
+ // model
32
+ ExtIsoForest iso;
33
+
34
+ // data
35
+ size_t nrows = options.get<size_t, Symbol>("nrows");
36
+ size_t ncols = options.get<size_t, Symbol>("ncols");
37
+ double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
38
+ size_t ncols_numeric = ncols;
39
+ int* categ_data = NULL;
40
+ size_t ncols_categ = 0;
41
+ int* ncat = NULL;
42
+ double* Xc = NULL;
43
+ sparse_ix* Xc_ind = NULL;
44
+ sparse_ix* Xc_indptr = NULL;
45
+
46
+ // options
47
+ CoefType coef_type = Normal;
48
+ double* sample_weights = NULL;
49
+ bool weight_as_sample = false;
50
+ size_t max_depth = 0;
51
+ bool limit_depth = true;
52
+ bool standardize_dist = false;
53
+ double* tmat = NULL;
54
+ double* output_depths = NULL;
55
+ bool standardize_depth = false;
56
+ double* col_weights = NULL;
57
+ MissingAction missing_action = Impute;
58
+ CategSplit cat_split_type = SubSet;
59
+ NewCategAction new_cat_action = Smallest;
60
+ Imputer *imputer = NULL;
61
+ UseDepthImp depth_imp = Higher;
62
+ WeighImpRows weigh_imp_rows = Inverse;
63
+ bool impute_at_fit = false;
64
+
65
+ // Rice has limit of 14 arguments, so use hash for options
66
+ size_t sample_size = options.get<size_t, Symbol>("sample_size");
67
+ size_t ndim = options.get<size_t, Symbol>("ndim");
68
+ size_t ntrees = options.get<size_t, Symbol>("ntrees");
69
+ size_t ntry = options.get<size_t, Symbol>("ntry");
70
+ double prob_pick_by_gain_avg = options.get<double, Symbol>("prob_pick_avg_gain");
71
+ double prob_split_by_gain_avg = options.get<double, Symbol>("prob_split_avg_gain");
72
+ double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
73
+ double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
74
+ double min_gain = options.get<double, Symbol>("min_gain");
75
+ bool all_perm = options.get<bool, Symbol>("all_perm");
76
+ bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
77
+ bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
78
+ bool penalize_range = options.get<bool, Symbol>("penalize_range");
79
+ bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
80
+ size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
81
+ uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
82
+ int nthreads = options.get<int, Symbol>("nthreads");
83
+
84
+ fit_iforest(
85
+ NULL,
86
+ &iso,
87
+ numeric_data,
88
+ ncols_numeric,
89
+ categ_data,
90
+ ncols_categ,
91
+ ncat,
92
+ Xc,
93
+ Xc_ind,
94
+ Xc_indptr,
95
+ ndim,
96
+ ntry,
97
+ coef_type,
98
+ coef_by_prop,
99
+ sample_weights,
100
+ with_replacement,
101
+ weight_as_sample,
102
+ nrows,
103
+ sample_size,
104
+ ntrees,
105
+ max_depth,
106
+ limit_depth,
107
+ penalize_range,
108
+ standardize_dist,
109
+ tmat,
110
+ output_depths,
111
+ standardize_depth,
112
+ col_weights,
113
+ weigh_by_kurt,
114
+ prob_pick_by_gain_avg,
115
+ prob_split_by_gain_avg,
116
+ prob_pick_by_gain_pl,
117
+ prob_split_by_gain_pl,
118
+ min_gain,
119
+ missing_action,
120
+ cat_split_type,
121
+ new_cat_action,
122
+ all_perm,
123
+ imputer,
124
+ min_imp_obs,
125
+ depth_imp,
126
+ weigh_imp_rows,
127
+ impute_at_fit,
128
+ random_seed,
129
+ nthreads
130
+ );
131
+
132
+ return iso;
133
+ })
134
+ .define_singleton_method(
135
+ "predict_iforest",
136
+ *[](ExtIsoForest& iso, Hash options) {
137
+ // data
138
+ size_t nrows = options.get<size_t, Symbol>("nrows");
139
+ double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
140
+ int* categ_data = NULL;
141
+ double* Xc = NULL;
142
+ sparse_ix* Xc_ind = NULL;
143
+ sparse_ix* Xc_indptr = NULL;
144
+ double* Xr = NULL;
145
+ sparse_ix* Xr_ind = NULL;
146
+ sparse_ix* Xr_indptr = NULL;
147
+
148
+ // options
149
+ int nthreads = options.get<int, Symbol>("nthreads");
150
+ bool standardize = true;
151
+ std::vector<double> outlier_scores(nrows);
152
+ sparse_ix* tree_num = NULL;
153
+
154
+ predict_iforest(
155
+ numeric_data,
156
+ categ_data,
157
+ Xc,
158
+ Xc_ind,
159
+ Xc_indptr,
160
+ Xr,
161
+ Xr_ind,
162
+ Xr_indptr,
163
+ nrows,
164
+ nthreads,
165
+ standardize,
166
+ NULL,
167
+ &iso,
168
+ outlier_scores.data(),
169
+ tree_num
170
+ );
171
+
172
+ Array ret;
173
+ for (size_t i = 0; i < outlier_scores.size(); i++) {
174
+ ret.push(outlier_scores[i]);
175
+ }
176
+ return ret;
177
+ });
178
+ }
@@ -0,0 +1,21 @@
1
require "mkmf-rice"

# Compile as C++11 with the preprocessor defines used for the isotree sources.
$CXXFLAGS += " -std=c++11 -D_USE_MERSENNE_TWISTER -D_ENABLE_CEREAL"

# Truthy when the configured compiler identifies itself as Apple clang.
apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i

# check omp first
# When an OpenMP runtime is found, enable -fopenmp; on Apple clang the flag
# is prefixed with -Xclang.
if have_library("omp") || have_library("gomp")
  $CXXFLAGS += (apple_clang ? " -Xclang -fopenmp" : " -fopenmp")
end

ext_dir = File.expand_path(".", __dir__)
isotree_src = File.expand_path("../../vendor/isotree/src", __dir__)

# Build every .cc/.cpp file from this directory and the vendored isotree
# sources, except the two files named below.
skipped = %w(Rwrapper.cpp RcppExports.cpp)
candidates = Dir["{#{ext_dir},#{isotree_src}}/*.{cc,cpp}"]
$srcs = candidates.reject { |path| skipped.include?(File.basename(path)) }

# Make the vendored headers and sources visible to the compiler and to make.
$INCFLAGS << " -I#{isotree_src}"
$VPATH << isotree_src

create_makefile("isotree/ext")
@@ -0,0 +1,9 @@
1
+ # ext
2
+ require "isotree/ext"
3
+
4
+ # stdlib
5
+ require "etc"
6
+
7
+ # modules
8
+ require "isotree/isolation_forest"
9
+ require "isotree/version"
@@ -0,0 +1,94 @@
1
module IsoTree
  # Isolation Forest model for outlier/anomaly detection.
  #
  # This class validates and packs the input data, then delegates training
  # and scoring to the native extension (IsoTree::Ext).
  class IsolationForest
    # Stores the model parameters; nothing is trained until #fit is called.
    #
    # A negative +nthreads+ is replaced by the processor count reported by
    # Etc. A nil +sample_size+ is replaced by the number of rows at fit time.
    # All other values are handed to the native fit routine unchanged.
    def initialize(
      sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
      prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
      prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
      min_gain: 0, all_perm: false, coef_by_prop: false,
      sample_with_replacement: false, penalize_range: true,
      weigh_by_kurtosis: false, min_imp_obs: 3, random_seed: 1, nthreads: -1
    )

      @sample_size = sample_size
      @ntrees = ntrees
      @ndim = ndim
      @ntry = ntry
      @prob_pick_avg_gain = prob_pick_avg_gain
      @prob_pick_pooled_gain = prob_pick_pooled_gain
      @prob_split_avg_gain = prob_split_avg_gain
      @prob_split_pooled_gain = prob_split_pooled_gain
      @min_gain = min_gain
      @all_perm = all_perm
      @coef_by_prop = coef_by_prop
      @sample_with_replacement = sample_with_replacement
      @penalize_range = penalize_range
      @weigh_by_kurtosis = weigh_by_kurtosis
      @min_imp_obs = min_imp_obs
      @random_seed = random_seed

      # etc module returns virtual cores
      nthreads = Etc.nprocessors if nthreads < 0
      @nthreads = nthreads
    end

    # Trains the model on +x+ (an array of equally sized rows, or a
    # 2-dimensional Numo::NArray).
    #
    # Raises ArgumentError when the data is empty or malformed.
    # Returns the native model object produced by Ext.fit_iforest.
    def fit(x)
      options = data_options(x).merge(fit_options)
      options[:sample_size] ||= options[:nrows]

      # Only replace the remembered column count once training succeeded, so
      # a failed refit leaves the previous model and its width consistent.
      forest = Ext.fit_iforest(options)
      @ncols = options[:ncols]
      @ext_iso_forest = forest
    end

    # Scores every row of +x+ with the fitted model and returns the scores as
    # an Array with one entry per row.
    #
    # Raises RuntimeError ("Not fit") when called before #fit, and
    # ArgumentError when +x+ does not have the column count used for fitting.
    def predict(x)
      raise "Not fit" unless @ext_iso_forest
      options = data_options(x).merge(nthreads: @nthreads)
      if options[:ncols] != @ncols
        raise ArgumentError, "Input must have #{@ncols} columns for this model"
      end
      Ext.predict_iforest(@ext_iso_forest, options)
    end

    private

    # TODO support categorical data
    #
    # Validates +x+ and converts it into the hash the native extension reads:
    # :nrows, :ncols and :numeric_data (a binary string of native doubles).
    #
    # The values are packed column by column because the isotree C++ API
    # documents its dense numeric input as column-major (Fortran order).
    #
    # Raises ArgumentError for input that is not 2-dimensional, has rows of
    # differing length, has rows that are not arrays, or has no rows/columns.
    def data_options(x)
      if defined?(Numo::NArray) && x.is_a?(Numo::NArray)
        raise ArgumentError, "Input must have 2 dimensions" if x.ndim != 2
        x = x.cast_to(Numo::DFloat)
        nrows, ncols = x.shape
        pack_column = ->(j) { x[true, j].to_binary }
      else
        # Rows must be real arrays: calling #size on e.g. an Integer returns
        # its byte width, which would silently produce a bogus column count.
        rows = x.to_a.map do |r|
          Array.try_convert(r) || raise(ArgumentError, "Input must be an array of arrays")
        end
        nrows = rows.size
        ncols = rows.first ? rows.first.size : 0
        if rows.any? { |r| r.size != ncols }
          raise ArgumentError, "All rows must have the same number of columns"
        end
        pack_column = ->(j) { rows.map { |r| r[j] }.pack("d*") }
      end
      raise ArgumentError, "No data" if nrows == 0 || ncols == 0

      numeric_data = String.new(capacity: nrows * ncols * 8)
      ncols.times { |j| numeric_data << pack_column.call(j) }

      {
        nrows: nrows,
        ncols: ncols,
        numeric_data: numeric_data
      }
    end

    # Collects the training parameters stored by #initialize into a hash
    # keyed by parameter name.
    def fit_options
      keys = %i(
        sample_size ntrees ndim ntry
        prob_pick_avg_gain prob_pick_pooled_gain
        prob_split_avg_gain prob_split_pooled_gain
        min_gain all_perm coef_by_prop
        sample_with_replacement penalize_range
        weigh_by_kurtosis min_imp_obs
        random_seed nthreads
      )
      keys.each_with_object({}) do |k, options|
        options[k] = instance_variable_get("@#{k}")
      end
    end
  end
end