outliertree 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,14 @@
1
+ Copyright (C) 2020 Andrew Kane
2
+
3
+ This program is free software: you can redistribute it and/or modify
4
+ it under the terms of the GNU General Public License as published by
5
+ the Free Software Foundation, either version 3 of the License, or
6
+ (at your option) any later version.
7
+
8
+ This program is distributed in the hope that it will be useful,
9
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ GNU General Public License for more details.
12
+
13
+ You should have received a copy of the GNU General Public License
14
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
@@ -0,0 +1,107 @@
1
+ # OutlierTree
2
+
3
+ :evergreen_tree: [OutlierTree](https://github.com/david-cortes/outliertree) - explainable outlier/anomaly detection - for Ruby
4
+
5
+ Produces human-readable explanations for why values are detected as outliers
6
+
7
+ ```txt
8
+ Price (2.50) looks low given Department is Books and Sale is false
9
+ ```
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application’s Gemfile:
14
+
15
+ ```ruby
16
+ gem 'outliertree'
17
+ ```
18
+
19
+ ## Getting Started
20
+
21
+ Prep your data
22
+
23
+ ```ruby
24
+ data = [
25
+ {department: "Books", sale: false, price: 2.50},
26
+ {department: "Books", sale: true, price: 3.00},
27
+ {department: "Movies", sale: false, price: 5.00}
28
+ ]
29
+ ```
30
+
31
+ Train a model
32
+
33
+ ```ruby
34
+ model = OutlierTree.new
35
+ model.fit(data)
36
+ ```
37
+
38
+ Get outliers
39
+
40
+ ```ruby
41
+ model.outliers(data)
42
+ ```
43
+
44
+ ## Parameters
45
+
46
+ Pass parameters to `OutlierTree.new` (default values shown below)
47
+
48
+ ```ruby
49
+ OutlierTree.new(
50
+ max_depth: 4,
51
+ min_gain: 0.01,
52
+ z_norm: 2.67,
53
+ z_outlier: 8.0,
54
+ pct_outliers: 0.01,
55
+ min_size_numeric: 25,
56
+ min_size_categ: 50,
57
+ categ_split: "binarize",
58
+ categ_outliers: "tail",
59
+ numeric_split: "raw",
60
+ follow_all: false,
61
+ gain_as_pct: true,
62
+ nthreads: -1
63
+ )
64
+ ```
65
+
66
+ See a [detailed explanation](https://outliertree.readthedocs.io/en/latest/#outliertree.OutlierTree) of each parameter
67
+
68
+ ## Data
69
+
70
+ Data can be an array of hashes
71
+
72
+ ```ruby
73
+ [
74
+ {department: "Books", sale: false, price: 2.50},
75
+ {department: "Books", sale: true, price: 3.00},
76
+ {department: "Movies", sale: false, price: 5.00}
77
+ ]
78
+ ```
79
+
80
+ Or a Rover data frame
81
+
82
+ ```ruby
83
+ Rover.read_csv("data.csv")
84
+ ```
85
+
86
+ ## History
87
+
88
+ View the [changelog](https://github.com/ankane/outliertree/blob/master/CHANGELOG.md)
89
+
90
+ ## Contributing
91
+
92
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
93
+
94
+ - [Report bugs](https://github.com/ankane/outliertree/issues)
95
+ - Fix bugs and [submit pull requests](https://github.com/ankane/outliertree/pulls)
96
+ - Write, clarify, or fix documentation
97
+ - Suggest or add new features
98
+
99
+ To get started with development:
100
+
101
+ ```sh
102
+ git clone --recursive https://github.com/ankane/outliertree.git
103
+ cd outliertree
104
+ bundle install
105
+ bundle exec rake compile
106
+ bundle exec rake test
107
+ ```
@@ -0,0 +1,260 @@
1
+ // outliertree
2
+ #include <outlier_tree.hpp>
3
+
4
+ // rice
5
+ #include <rice/Array.hpp>
6
+ #include <rice/Hash.hpp>
7
+ #include <rice/Module.hpp>
8
+ #include <rice/Object.hpp>
9
+ #include <rice/String.hpp>
10
+ #include <rice/Symbol.hpp>
11
+
12
+ using Rice::Array;
13
+ using Rice::Hash;
14
+ using Rice::Module;
15
+ using Rice::Object;
16
+ using Rice::String;
17
+ using Rice::Symbol;
18
+ using Rice::define_class_under;
19
+ using Rice::define_module;
20
+
21
+ template<>
22
+ Object to_ruby<std::vector<char>>(std::vector<char> const & x)
23
+ {
24
+ Array a;
25
+ for (size_t i = 0; i < x.size(); i++) {
26
+ a.push(x[i]);
27
+ }
28
+ return a;
29
+ }
30
+
31
+ template<>
32
+ Object to_ruby<std::vector<int>>(std::vector<int> const & x)
33
+ {
34
+ Array a;
35
+ for (size_t i = 0; i < x.size(); i++) {
36
+ a.push(x[i]);
37
+ }
38
+ return a;
39
+ }
40
+
41
+ template<>
42
+ Object to_ruby<std::vector<unsigned long>>(std::vector<unsigned long> const & x)
43
+ {
44
+ Array a;
45
+ for (size_t i = 0; i < x.size(); i++) {
46
+ a.push(x[i]);
47
+ }
48
+ return a;
49
+ }
50
+
51
+ template<>
52
+ Object to_ruby<std::vector<double>>(std::vector<double> const & x)
53
+ {
54
+ Array a;
55
+ for (size_t i = 0; i < x.size(); i++) {
56
+ a.push(x[i]);
57
+ }
58
+ return a;
59
+ }
60
+
61
+ template<>
62
+ Object to_ruby<ColType>(ColType const & x)
63
+ {
64
+ switch (x) {
65
+ case Numeric: return Symbol("numeric");
66
+ case Categorical: return Symbol("categorical");
67
+ case Ordinal: return Symbol("ordinal");
68
+ case NoType: return Symbol("no_type");
69
+ }
70
+ throw std::runtime_error("Unknown column type");
71
+ }
72
+
73
+ template<>
74
+ Object to_ruby<SplitType>(SplitType const & x)
75
+ {
76
+ switch (x) {
77
+ case LessOrEqual: return Symbol("less_or_equal");
78
+ case Greater: return Symbol("greater");
79
+ case Equal: return Symbol("equal");
80
+ case NotEqual: return Symbol("not_equal");
81
+ case InSubset: return Symbol("in_subset");
82
+ case NotInSubset: return Symbol("not_in_subset");
83
+ case SingleCateg: return Symbol("single_categ");
84
+ case SubTrees: return Symbol("sub_trees");
85
+ case IsNa: return Symbol("is_na");
86
+ case Root: return Symbol("root");
87
+ }
88
+ throw std::runtime_error("Unknown split type");
89
+ }
90
+
91
+ extern "C"
92
+ void Init_ext()
93
+ {
94
+ Module rb_mOutlierTree = define_module("OutlierTree");
95
+ Module rb_mExt = define_module_under(rb_mOutlierTree, "Ext");
96
+
97
+ define_class_under<Cluster>(rb_mExt, "Cluster")
98
+ .define_method("upper_lim", *[](Cluster& self) { return self.upper_lim; })
99
+ .define_method("display_lim_high", *[](Cluster& self) { return self.display_lim_high; })
100
+ .define_method("perc_below", *[](Cluster& self) { return self.perc_below; })
101
+ .define_method("display_lim_low", *[](Cluster& self) { return self.display_lim_low; })
102
+ .define_method("perc_above", *[](Cluster& self) { return self.perc_above; })
103
+ .define_method("display_mean", *[](Cluster& self) { return self.display_mean; })
104
+ .define_method("display_sd", *[](Cluster& self) { return self.display_sd; })
105
+ .define_method("cluster_size", *[](Cluster& self) { return self.cluster_size; })
106
+ .define_method("split_point", *[](Cluster& self) { return self.split_point; })
107
+ .define_method("split_subset", *[](Cluster& self) { return self.split_subset; })
108
+ .define_method("split_lev", *[](Cluster& self) { return self.split_lev; })
109
+ .define_method("split_type", *[](Cluster& self) { return self.split_type; })
110
+ .define_method("column_type", *[](Cluster& self) { return self.column_type; })
111
+ .define_method("has_na_branch", *[](Cluster& self) { return self.has_NA_branch; })
112
+ .define_method("col_num", *[](Cluster& self) { return self.col_num; });
113
+
114
+ define_class_under<ClusterTree>(rb_mExt, "ClusterTree")
115
+ .define_method("parent_branch", *[](ClusterTree& self) { return self.parent_branch; })
116
+ .define_method("parent", *[](ClusterTree& self) { return self.parent; })
117
+ .define_method("all_branches", *[](ClusterTree& self) { return self.all_branches; })
118
+ .define_method("column_type", *[](ClusterTree& self) { return self.column_type; })
119
+ .define_method("col_num", *[](ClusterTree& self) { return self.col_num; })
120
+ .define_method("split_point", *[](ClusterTree& self) { return self.split_point; })
121
+ .define_method("split_subset", *[](ClusterTree& self) { return self.split_subset; })
122
+ .define_method("split_lev", *[](ClusterTree& self) { return self.split_lev; });
123
+
124
+ define_class_under<ModelOutputs>(rb_mExt, "ModelOutputs")
125
+ .define_method("outlier_scores_final", *[](ModelOutputs& self) { return self.outlier_scores_final; })
126
+ .define_method("outlier_columns_final", *[](ModelOutputs& self) { return self.outlier_columns_final; })
127
+ .define_method("outlier_clusters_final", *[](ModelOutputs& self) { return self.outlier_clusters_final; })
128
+ .define_method("outlier_trees_final", *[](ModelOutputs& self) { return self.outlier_trees_final; })
129
+ .define_method("outlier_depth_final", *[](ModelOutputs& self) { return self.outlier_depth_final; })
130
+ .define_method("outlier_decimals_distr", *[](ModelOutputs& self) { return self.outlier_decimals_distr; })
131
+ .define_method("min_decimals_col", *[](ModelOutputs& self) { return self.min_decimals_col; })
132
+ .define_method(
133
+ "all_clusters",
134
+ *[](ModelOutputs& self, size_t i, size_t j) {
135
+ return self.all_clusters[i][j];
136
+ })
137
+ .define_method(
138
+ "all_trees",
139
+ *[](ModelOutputs& self, size_t i, size_t j) {
140
+ return self.all_trees[i][j];
141
+ });
142
+
143
+ rb_mExt
144
+ .define_singleton_method(
145
+ "fit_outliers_models",
146
+ *[](Hash options) {
147
+ ModelOutputs model_outputs;
148
+
149
+ // data
150
+ size_t nrows = options.get<size_t, Symbol>("nrows");
151
+ size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
152
+ size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
153
+ size_t ncols_ord = options.get<size_t, Symbol>("ncols_ord");
154
+
155
+ double *restrict numeric_data = NULL;
156
+ if (ncols_numeric > 0) {
157
+ numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
158
+ }
159
+
160
+ int *restrict categorical_data = NULL;
161
+ int *restrict ncat = NULL;
162
+ if (ncols_categ > 0) {
163
+ categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
164
+ ncat = (int*) options.get<String, Symbol>("ncat").c_str();
165
+ }
166
+
167
+ int *restrict ordinal_data = NULL;
168
+ int *restrict ncat_ord = NULL;
169
+ if (ncols_ord > 0) {
170
+ ordinal_data = (int*) options.get<String, Symbol>("ordinal_data").c_str();
171
+ ncat_ord = (int*) options.get<String, Symbol>("ncat_ord").c_str();
172
+ }
173
+
174
+ // options
175
+ char *restrict cols_ignore = NULL;
176
+ int nthreads = options.get<int, Symbol>("nthreads");
177
+ bool categ_as_bin = options.get<bool, Symbol>("categ_as_bin");
178
+ bool ord_as_bin = options.get<bool, Symbol>("ord_as_bin");
179
+ bool cat_bruteforce_subset = options.get<bool, Symbol>("cat_bruteforce_subset");
180
+ bool categ_from_maj = options.get<bool, Symbol>("categ_from_maj");
181
+ bool take_mid = options.get<bool, Symbol>("take_mid");
182
+ size_t max_depth = options.get<size_t, Symbol>("max_depth");
183
+ double max_perc_outliers = options.get<double, Symbol>("pct_outliers");
184
+ size_t min_size_numeric = options.get<size_t, Symbol>("min_size_numeric");
185
+ size_t min_size_categ = options.get<size_t, Symbol>("min_size_categ");
186
+ double min_gain = options.get<double, Symbol>("min_gain");
187
+ bool gain_as_pct = options.get<bool, Symbol>("gain_as_pct");
188
+ bool follow_all = options.get<bool, Symbol>("follow_all");
189
+ double z_norm = options.get<double, Symbol>("z_norm");
190
+ double z_outlier = options.get<double, Symbol>("z_outlier");
191
+
192
+ fit_outliers_models(
193
+ model_outputs,
194
+ numeric_data,
195
+ ncols_numeric,
196
+ categorical_data,
197
+ ncols_categ,
198
+ ncat,
199
+ ordinal_data,
200
+ ncols_ord,
201
+ ncat_ord,
202
+ nrows,
203
+ cols_ignore,
204
+ nthreads,
205
+ categ_as_bin,
206
+ ord_as_bin,
207
+ cat_bruteforce_subset,
208
+ categ_from_maj,
209
+ take_mid,
210
+ max_depth,
211
+ max_perc_outliers,
212
+ min_size_numeric,
213
+ min_size_categ,
214
+ min_gain,
215
+ gain_as_pct,
216
+ follow_all,
217
+ z_norm,
218
+ z_outlier
219
+ );
220
+ return model_outputs;
221
+ })
222
+ .define_singleton_method(
223
+ "find_new_outliers",
224
+ *[](ModelOutputs& model_outputs, Hash options) {
225
+ // data
226
+ size_t nrows = options.get<size_t, Symbol>("nrows");
227
+ size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
228
+ size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
229
+ size_t ncols_ord = options.get<size_t, Symbol>("ncols_ord");
230
+
231
+ double *restrict numeric_data = NULL;
232
+ if (ncols_numeric > 0) {
233
+ numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
234
+ }
235
+
236
+ int *restrict categorical_data = NULL;
237
+ if (ncols_categ > 0) {
238
+ categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
239
+ }
240
+
241
+ int *restrict ordinal_data = NULL;
242
+ if (ncols_ord > 0) {
243
+ ordinal_data = (int*) options.get<String, Symbol>("ordinal_data").c_str();
244
+ }
245
+
246
+ // options
247
+ int nthreads = options.get<int, Symbol>("nthreads");
248
+
249
+ find_new_outliers(
250
+ numeric_data,
251
+ categorical_data,
252
+ ordinal_data,
253
+ nrows,
254
+ nthreads,
255
+ model_outputs
256
+ );
257
+
258
+ return model_outputs;
259
+ });
260
+ }
@@ -0,0 +1,21 @@
1
# Build configuration for the native extension: sets compiler flags, enables
# OpenMP when a runtime library is found, and compiles the vendored
# outliertree sources together with the extension's own sources.
require "mkmf-rice"

$CXXFLAGS += " -std=c++11"

# Truthy when the configured compiler identifies itself as Apple clang.
apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i

# check omp first
if have_library("omp") || have_library("gomp")
  # On Apple clang, -Xclang is placed immediately before -fopenmp.
  $CXXFLAGS += " -Xclang" if apple_clang
  $CXXFLAGS += " -fopenmp"
end

ext = File.expand_path(".", __dir__)
outliertree = File.expand_path("../../vendor/outliertree/src", __dir__)

# Vendored files left out of the build (presumably the R bindings - confirm).
exclude = %w(Rwrapper.cpp RcppExports.cpp)
$srcs = Dir["{#{ext},#{outliertree}}/*.{cc,cpp}"].reject { |f| exclude.include?(File.basename(f)) }
$INCFLAGS += " -I#{outliertree}"
$VPATH << outliertree

create_makefile("outliertree/ext")
@@ -0,0 +1,17 @@
1
+ # ext
2
+ require "outliertree/ext"
3
+
4
+ # stdlib
5
+ require "etc"
6
+
7
+ # modules
8
+ require "outliertree/dataset"
9
+ require "outliertree/model"
10
+ require "outliertree/result"
11
+ require "outliertree/version"
12
+
13
module OutlierTree
  # Convenience constructor: builds a Model, forwarding all keyword options.
  #
  # @param options [Hash] keyword options passed through unchanged to Model.new
  # @return [OutlierTree::Model]
  def self.new(**options)
    Model.new(**options)
  end
end
@@ -0,0 +1,35 @@
1
module OutlierTree
  # Column-oriented view of the input data. Splits columns into numeric and
  # categorical groups and exposes each column's values by key.
  class Dataset
    attr_reader :numeric_columns, :categorical_columns

    # @param data [Enumerable<Hash>, Rover::DataFrame] rows as hashes, or a
    #   Rover data frame (only recognized when Rover is loaded)
    # @raise [ArgumentError] if data is not enumerable, or any row is not a Hash
    def initialize(data)
      @data = data

      if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
        @vectors = data.vectors
        # Columns typed :object or :bool count as categorical; all others as numeric.
        @numeric_columns, @categorical_columns =
          data.keys.partition { |k| ![:object, :bool].include?(data[k].type) }
      else
        # Fail with a clear message instead of a NoMethodError on e.g. nil.
        unless data.is_a?(Enumerable)
          raise ArgumentError, "Data must be an array of hashes or a Rover data frame"
        end
        raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }

        # Union of keys over all rows, in first-seen order.
        keys = data.flat_map(&:keys).uniq
        @vectors = keys.each_with_object({}) { |k, vectors| vectors[k] = [] }

        # Rows missing a key contribute whatever Hash#[] returns (nil by default).
        data.each do |row|
          keys.each { |k| @vectors[k] << row[k] }
        end

        # A column is numeric only when every value is nil or a Numeric.
        @numeric_columns, @categorical_columns =
          keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
      end
    end

    # @param k [Object] column key
    # @return [Object, nil] the column's values, or nil for an unknown key
    def [](k)
      @vectors[k]
    end

    # @return [Integer] length of the first column, or 0 when there are no columns
    def size
      @vectors.any? ? @vectors.values.first.size : 0
    end
  end
end