isotree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: fa5516ad971c1fc1def4766fdf39ec74121a20f6c95d9b7bda705707324a5571
4
+ data.tar.gz: c7aed404ad88d2e0365f7698cbad0a2d198a85860b9863d6322b10e53852f8a3
5
+ SHA512:
6
+ metadata.gz: 92a74815ea52c38c0a4d0f27cb78524413cc32f966732c520590937f9523c253344e04673be280ff04659c7ed4a8db96560fba54fc425221b8542244e275ed33
7
+ data.tar.gz: a5824999e81a4732742646e66cdbae55812340667122b262c799bf3a5b243e9185445855ef64c31c877941d6427cd4fdfb7d77b4f87c2160b489ebe90aa18af6
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (unreleased)
2
+
3
+ - First release
@@ -0,0 +1,25 @@
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2020, Andrew Kane
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
@@ -0,0 +1,111 @@
1
+ # IsoTree
2
+
3
+ :evergreen_tree: [IsoTree](https://github.com/david-cortes/isotree) - outlier/anomaly detection for using Isolation Forest - for Ruby
4
+
5
+ Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
6
+
7
+ ## Installation
8
+
9
+ Add this line to your application’s Gemfile:
10
+
11
+ ```ruby
12
+ gem 'isotree'
13
+ ```
14
+
15
+ ## Getting Started
16
+
17
+ Prep your data
18
+
19
+ ```ruby
20
+ x = [[1, 2], [3, 4], [5, 6], [7, 8]]
21
+ ```
22
+
23
+ Train a model
24
+
25
+ ```ruby
26
+ model = IsoTree::IsolationForest.new
27
+ model.fit(x)
28
+ ```
29
+
30
+ Get outlier scores
31
+
32
+ ```ruby
33
+ model.predict(x)
34
+ ```
35
+
36
+ Scores are between 0 and 1, with higher scores indicating outliers
37
+
38
+ ## Parameters
39
+
40
+ Pass parameters - default values below
41
+
42
+ ```ruby
43
+ IsoTree::IsolationForest.new(
44
+ sample_size: nil,
45
+ ntrees: 500,
46
+ ndim: 3,
47
+ ntry: 3,
48
+ prob_pick_avg_gain: 0,
49
+ prob_pick_pooled_gain: 0,
50
+ prob_split_avg_gain: 0,
51
+ prob_split_pooled_gain: 0,
52
+ min_gain: 0,
53
+ all_perm: false,
54
+ coef_by_prop: false,
55
+ sample_with_replacement: false,
56
+ penalize_range: true,
57
+ weigh_by_kurtosis: false,
58
+ min_imp_obs: 3,
59
+ random_seed: 1,
60
+ nthreads: -1
61
+ )
62
+ ```
63
+
64
+ See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.IsolationForest)
65
+
66
+ ## Data
67
+
68
+ Data can be an array of arrays
69
+
70
+ ```ruby
71
+ [[1, 2, 3], [4, 5, 6]]
72
+ ```
73
+
74
+ Or a Numo array
75
+
76
+ ```ruby
77
+ Numo::NArray.cast([[1, 2, 3], [4, 5, 6]])
78
+ ```
79
+
80
+ ## Performance
81
+
82
+ IsoTree uses OpenMP when possible for best performance. To enable OpenMP on Mac, run:
83
+
84
+ ```sh
85
+ brew install libomp
86
+ ```
87
+
88
+ Then reinstall the gem.
89
+
90
+ ## History
91
+
92
+ View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
93
+
94
+ ## Contributing
95
+
96
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
97
+
98
+ - [Report bugs](https://github.com/ankane/isotree/issues)
99
+ - Fix bugs and [submit pull requests](https://github.com/ankane/isotree/pulls)
100
+ - Write, clarify, or fix documentation
101
+ - Suggest or add new features
102
+
103
+ To get started with development:
104
+
105
+ ```sh
106
+ git clone --recursive https://github.com/ankane/isotree.git
107
+ cd isotree
108
+ bundle install
109
+ bundle exec rake compile
110
+ bundle exec rake test
111
+ ```
@@ -0,0 +1,178 @@
1
+ // isotree
2
+ #include <isotree.hpp>
3
+
4
+ // rice
5
+ #include <rice/Array.hpp>
6
+ #include <rice/Hash.hpp>
7
+ #include <rice/Module.hpp>
8
+ #include <rice/String.hpp>
9
+ #include <rice/Symbol.hpp>
10
+
11
+ using Rice::Array;
12
+ using Rice::Hash;
13
+ using Rice::Module;
14
+ using Rice::String;
15
+ using Rice::Symbol;
16
+ using Rice::define_class_under;
17
+ using Rice::define_module;
18
+
19
+ extern "C"
20
+ void Init_ext()
21
+ {
22
+ Module rb_mIsoTree = define_module("IsoTree");
23
+
24
+ Module rb_mExt = define_module_under(rb_mIsoTree, "Ext");
25
+ define_class_under<ExtIsoForest>(rb_mExt, "ExtIsoForest");
26
+
27
+ rb_mExt
28
+ .define_singleton_method(
29
+ "fit_iforest",
30
+ *[](Hash options) {
31
+ // model
32
+ ExtIsoForest iso;
33
+
34
+ // data
35
+ size_t nrows = options.get<size_t, Symbol>("nrows");
36
+ size_t ncols = options.get<size_t, Symbol>("ncols");
37
+ double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
38
+ size_t ncols_numeric = ncols;
39
+ int* categ_data = NULL;
40
+ size_t ncols_categ = 0;
41
+ int* ncat = NULL;
42
+ double* Xc = NULL;
43
+ sparse_ix* Xc_ind = NULL;
44
+ sparse_ix* Xc_indptr = NULL;
45
+
46
+ // options
47
+ CoefType coef_type = Normal;
48
+ double* sample_weights = NULL;
49
+ bool weight_as_sample = false;
50
+ size_t max_depth = 0;
51
+ bool limit_depth = true;
52
+ bool standardize_dist = false;
53
+ double* tmat = NULL;
54
+ double* output_depths = NULL;
55
+ bool standardize_depth = false;
56
+ double* col_weights = NULL;
57
+ MissingAction missing_action = Impute;
58
+ CategSplit cat_split_type = SubSet;
59
+ NewCategAction new_cat_action = Smallest;
60
+ Imputer *imputer = NULL;
61
+ UseDepthImp depth_imp = Higher;
62
+ WeighImpRows weigh_imp_rows = Inverse;
63
+ bool impute_at_fit = false;
64
+
65
+ // Rice has limit of 14 arguments, so use hash for options
66
+ size_t sample_size = options.get<size_t, Symbol>("sample_size");
67
+ size_t ndim = options.get<size_t, Symbol>("ndim");
68
+ size_t ntrees = options.get<size_t, Symbol>("ntrees");
69
+ size_t ntry = options.get<size_t, Symbol>("ntry");
70
+ double prob_pick_by_gain_avg = options.get<double, Symbol>("prob_pick_avg_gain");
71
+ double prob_split_by_gain_avg = options.get<double, Symbol>("prob_split_avg_gain");
72
+ double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
73
+ double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
74
+ double min_gain = options.get<double, Symbol>("min_gain");
75
+ bool all_perm = options.get<bool, Symbol>("all_perm");
76
+ bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
77
+ bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
78
+ bool penalize_range = options.get<bool, Symbol>("penalize_range");
79
+ bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
80
+ size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
81
+ uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
82
+ int nthreads = options.get<int, Symbol>("nthreads");
83
+
84
+ fit_iforest(
85
+ NULL,
86
+ &iso,
87
+ numeric_data,
88
+ ncols_numeric,
89
+ categ_data,
90
+ ncols_categ,
91
+ ncat,
92
+ Xc,
93
+ Xc_ind,
94
+ Xc_indptr,
95
+ ndim,
96
+ ntry,
97
+ coef_type,
98
+ coef_by_prop,
99
+ sample_weights,
100
+ with_replacement,
101
+ weight_as_sample,
102
+ nrows,
103
+ sample_size,
104
+ ntrees,
105
+ max_depth,
106
+ limit_depth,
107
+ penalize_range,
108
+ standardize_dist,
109
+ tmat,
110
+ output_depths,
111
+ standardize_depth,
112
+ col_weights,
113
+ weigh_by_kurt,
114
+ prob_pick_by_gain_avg,
115
+ prob_split_by_gain_avg,
116
+ prob_pick_by_gain_pl,
117
+ prob_split_by_gain_pl,
118
+ min_gain,
119
+ missing_action,
120
+ cat_split_type,
121
+ new_cat_action,
122
+ all_perm,
123
+ imputer,
124
+ min_imp_obs,
125
+ depth_imp,
126
+ weigh_imp_rows,
127
+ impute_at_fit,
128
+ random_seed,
129
+ nthreads
130
+ );
131
+
132
+ return iso;
133
+ })
134
+ .define_singleton_method(
135
+ "predict_iforest",
136
+ *[](ExtIsoForest& iso, Hash options) {
137
+ // data
138
+ size_t nrows = options.get<size_t, Symbol>("nrows");
139
+ double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
140
+ int* categ_data = NULL;
141
+ double* Xc = NULL;
142
+ sparse_ix* Xc_ind = NULL;
143
+ sparse_ix* Xc_indptr = NULL;
144
+ double* Xr = NULL;
145
+ sparse_ix* Xr_ind = NULL;
146
+ sparse_ix* Xr_indptr = NULL;
147
+
148
+ // options
149
+ int nthreads = options.get<int, Symbol>("nthreads");
150
+ bool standardize = true;
151
+ std::vector<double> outlier_scores(nrows);
152
+ sparse_ix* tree_num = NULL;
153
+
154
+ predict_iforest(
155
+ numeric_data,
156
+ categ_data,
157
+ Xc,
158
+ Xc_ind,
159
+ Xc_indptr,
160
+ Xr,
161
+ Xr_ind,
162
+ Xr_indptr,
163
+ nrows,
164
+ nthreads,
165
+ standardize,
166
+ NULL,
167
+ &iso,
168
+ outlier_scores.data(),
169
+ tree_num
170
+ );
171
+
172
+ Array ret;
173
+ for (size_t i = 0; i < outlier_scores.size(); i++) {
174
+ ret.push(outlier_scores[i]);
175
+ }
176
+ return ret;
177
+ });
178
+ }
@@ -0,0 +1,21 @@
1
require "mkmf-rice"

# Compile as C++11 with the two preprocessor defines the build passes to isotree.
$CXXFLAGS += " -std=c++11 -D_USE_MERSENNE_TWISTER -D_ENABLE_CEREAL"

# Apple's clang needs -Xclang in front of -fopenmp (see below).
using_apple_clang = /apple clang/i.match?(RbConfig::CONFIG["CC_VERSION_MESSAGE"])

# check omp first, then gomp; only add the OpenMP flag when one of them links
if have_library("omp") || have_library("gomp")
  $CXXFLAGS += using_apple_clang ? " -Xclang -fopenmp" : " -fopenmp"
end

ext_dir = File.expand_path(".", __dir__)
isotree_dir = File.expand_path("../../vendor/isotree/src", __dir__)

# Build every .cc/.cpp file from both directories except the two named here
# (presumably the R bindings shipped with the vendored sources).
skipped = %w(Rwrapper.cpp RcppExports.cpp)
candidates = Dir["{#{ext_dir},#{isotree_dir}}/*.{cc,cpp}"]
$srcs = candidates.reject { |path| skipped.include?(File.basename(path)) }

# Make the vendored headers and sources visible to the generated Makefile.
$INCFLAGS << " -I#{isotree_dir}"
$VPATH << isotree_dir

create_makefile("isotree/ext")
@@ -0,0 +1,9 @@
1
+ # ext
2
+ require "isotree/ext"
3
+
4
+ # stdlib
5
+ require "etc"
6
+
7
+ # modules
8
+ require "isotree/isolation_forest"
9
+ require "isotree/version"
@@ -0,0 +1,94 @@
1
module IsoTree
  # Isolation Forest model backed by the native +Ext+ bindings.
  #
  # Build it with keyword options, call #fit with 2-D numeric data, then call
  # #predict to get one outlier score per row.
  class IsolationForest
    # Options forwarded to +Ext.fit_iforest+. Each key is also the name of the
    # instance variable that #initialize stores the value in.
    FIT_OPTION_KEYS = %i(
      sample_size ntrees ndim ntry
      prob_pick_avg_gain prob_pick_pooled_gain
      prob_split_avg_gain prob_split_pooled_gain
      min_gain all_perm coef_by_prop
      sample_with_replacement penalize_range
      weigh_by_kurtosis min_imp_obs
      random_seed nthreads
    ).freeze
    private_constant :FIT_OPTION_KEYS

    # Stores the model options; nothing is trained until #fit is called.
    #
    # sample_size:: +nil+ is replaced by the number of rows given to #fit.
    # nthreads::    a negative value is replaced by +Etc.nprocessors+.
    #
    # All other options are passed to the native fit call unchanged; see the
    # upstream IsoTree documentation for their meaning.
    def initialize(
      sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
      prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
      prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
      min_gain: 0, all_perm: false, coef_by_prop: false,
      sample_with_replacement: false, penalize_range: true,
      weigh_by_kurtosis: false, min_imp_obs: 3, random_seed: 1, nthreads: -1
    )

      @sample_size = sample_size
      @ntrees = ntrees
      @ndim = ndim
      @ntry = ntry
      @prob_pick_avg_gain = prob_pick_avg_gain
      @prob_pick_pooled_gain = prob_pick_pooled_gain
      @prob_split_avg_gain = prob_split_avg_gain
      @prob_split_pooled_gain = prob_split_pooled_gain
      @min_gain = min_gain
      @all_perm = all_perm
      @coef_by_prop = coef_by_prop
      @sample_with_replacement = sample_with_replacement
      @penalize_range = penalize_range
      @weigh_by_kurtosis = weigh_by_kurtosis
      @min_imp_obs = min_imp_obs
      @random_seed = random_seed

      # etc module returns virtual cores
      @nthreads = nthreads < 0 ? Etc.nprocessors : nthreads
    end

    # Trains the model on +x+ (an array of equally sized rows, or a 2-D Numo
    # array) and remembers the column count for later #predict calls.
    #
    # Returns the native model object produced by +Ext.fit_iforest+.
    # Raises ArgumentError when +x+ is empty, ragged, or not 2-dimensional.
    def fit(x)
      options = data_options(x).merge(fit_options)
      options[:sample_size] ||= options[:nrows]
      @ncols = options[:ncols]
      @ext_iso_forest = Ext.fit_iforest(options)
    end

    # Scores the rows of +x+ with the fitted model.
    #
    # Returns the array of scores produced by +Ext.predict_iforest+, one per row.
    # Raises RuntimeError ("Not fit") when #fit has not been called, and
    # ArgumentError when +x+ is malformed or its column count differs from the
    # training data.
    def predict(x)
      raise "Not fit" unless @ext_iso_forest
      options = data_options(x).merge(nthreads: @nthreads)
      if options[:ncols] != @ncols
        raise ArgumentError, "Input must have #{@ncols} columns for this model"
      end
      Ext.predict_iforest(@ext_iso_forest, options)
    end

    private

    # Validates +x+ and converts it into the hash the native extension reads:
    # +:nrows+, +:ncols+ and +:numeric_data+ (a binary string of native doubles).
    #
    # The doubles are laid out column by column: isotree's fit/predict
    # routines document their dense numeric input as column-major (Fortran
    # order), so every value of column 0 comes first, then column 1, and so on.
    #
    # TODO support categorical data
    def data_options(x)
      if defined?(Numo::NArray) && x.is_a?(Numo::NArray)
        raise ArgumentError, "Input must have 2 dimensions" if x.ndim != 2
        x = x.cast_to(Numo::DFloat)
        nrows, ncols = x.shape
        # NOTE(review): relies on to_binary serializing the transposed view in
        # its logical (transposed) order - confirm against the Numo docs
        numeric_data = x.transpose.to_binary
      else
        rows = x.to_a
        # Integer#size is a byte width, so a flat array of numbers must be
        # rejected here rather than mistaken for rows of equal length
        unless rows.all? { |row| row.respond_to?(:to_ary) }
          raise ArgumentError, "Input must have 2 dimensions"
        end
        rows = rows.map(&:to_ary)
        nrows = rows.size
        ncols = rows.first ? rows.first.size : 0
        if rows.any? { |row| row.size != ncols }
          raise ArgumentError, "All rows must have the same number of columns"
        end
        numeric_data = rows.transpose.flatten(1).pack("d*")
      end
      raise ArgumentError, "No data" if nrows == 0 || ncols == 0

      {
        nrows: nrows,
        ncols: ncols,
        numeric_data: numeric_data
      }
    end

    # Collects the stored training options into a hash keyed by option name.
    def fit_options
      FIT_OPTION_KEYS.each_with_object({}) do |key, options|
        options[key] = instance_variable_get("@#{key}")
      end
    end
  end
end