outliertree 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,14 @@
1
+ Copyright (C) 2020 Andrew Kane
2
+
3
+ This program is free software: you can redistribute it and/or modify
4
+ it under the terms of the GNU General Public License as published by
5
+ the Free Software Foundation, either version 3 of the License, or
6
+ (at your option) any later version.
7
+
8
+ This program is distributed in the hope that it will be useful,
9
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
10
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11
+ GNU General Public License for more details.
12
+
13
+ You should have received a copy of the GNU General Public License
14
+ along with this program. If not, see <https://www.gnu.org/licenses/>.
@@ -0,0 +1,107 @@
1
+ # OutlierTree
2
+
3
+ :evergreen_tree: [OutlierTree](https://github.com/david-cortes/outliertree) - explainable outlier/anomaly detection - for Ruby
4
+
5
+ Produces human-readable explanations for why values are detected as outliers
6
+
7
+ ```txt
8
+ Price (2.50) looks low given Department is Books and Sale is false
9
+ ```
10
+
11
+ ## Installation
12
+
13
+ Add this line to your application’s Gemfile:
14
+
15
+ ```ruby
16
+ gem 'outliertree'
17
+ ```
18
+
19
+ ## Getting Started
20
+
21
+ Prep your data
22
+
23
+ ```ruby
24
+ data = [
25
+ {department: "Books", sale: false, price: 2.50},
26
+ {department: "Books", sale: true, price: 3.00},
27
+ {department: "Movies", sale: false, price: 5.00}
28
+ ]
29
+ ```
30
+
31
+ Train a model
32
+
33
+ ```ruby
34
+ model = OutlierTree.new
35
+ model.fit(data)
36
+ ```
37
+
38
+ Get outliers
39
+
40
+ ```ruby
41
+ model.outliers(data)
42
+ ```
43
+
44
+ ## Parameters
45
+
46
+ Pass parameters - default values below
47
+
48
+ ```ruby
49
+ OutlierTree.new(
50
+ max_depth: 4,
51
+ min_gain: 0.01,
52
+ z_norm: 2.67,
53
+ z_outlier: 8.0,
54
+ pct_outliers: 0.01,
55
+ min_size_numeric: 25,
56
+ min_size_categ: 50,
57
+ categ_split: "binarize",
58
+ categ_outliers: "tail",
59
+ numeric_split: "raw",
60
+ follow_all: false,
61
+ gain_as_pct: true,
62
+ nthreads: -1
63
+ )
64
+ ```
65
+
66
+ See a [detailed explanation](https://outliertree.readthedocs.io/en/latest/#outliertree.OutlierTree)
67
+
68
+ ## Data
69
+
70
+ Data can be an array of hashes
71
+
72
+ ```ruby
73
+ [
74
+ {department: "Books", sale: false, price: 2.50},
75
+ {department: "Books", sale: true, price: 3.00},
76
+ {department: "Movies", sale: false, price: 5.00}
77
+ ]
78
+ ```
79
+
80
+ Or a Rover data frame
81
+
82
+ ```ruby
83
+ Rover.read_csv("data.csv")
84
+ ```
85
+
86
+ ## History
87
+
88
+ View the [changelog](https://github.com/ankane/outliertree/blob/master/CHANGELOG.md)
89
+
90
+ ## Contributing
91
+
92
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
93
+
94
+ - [Report bugs](https://github.com/ankane/outliertree/issues)
95
+ - Fix bugs and [submit pull requests](https://github.com/ankane/outliertree/pulls)
96
+ - Write, clarify, or fix documentation
97
+ - Suggest or add new features
98
+
99
+ To get started with development:
100
+
101
+ ```sh
102
+ git clone --recursive https://github.com/ankane/outliertree.git
103
+ cd outliertree
104
+ bundle install
105
+ bundle exec rake compile
106
+ bundle exec rake test
107
+ ```
@@ -0,0 +1,260 @@
1
+ // outliertree
2
+ #include <outlier_tree.hpp>
3
+
4
+ // rice
5
+ #include <rice/Array.hpp>
6
+ #include <rice/Hash.hpp>
7
+ #include <rice/Module.hpp>
8
+ #include <rice/Object.hpp>
9
+ #include <rice/String.hpp>
10
+ #include <rice/Symbol.hpp>
11
+
12
+ using Rice::Array;
13
+ using Rice::Hash;
14
+ using Rice::Module;
15
+ using Rice::Object;
16
+ using Rice::String;
17
+ using Rice::Symbol;
18
+ using Rice::define_class_under;
19
+ using Rice::define_module;
20
+
21
+ template<>
22
+ Object to_ruby<std::vector<char>>(std::vector<char> const & x)
23
+ {
24
+ Array a;
25
+ for (size_t i = 0; i < x.size(); i++) {
26
+ a.push(x[i]);
27
+ }
28
+ return a;
29
+ }
30
+
31
+ template<>
32
+ Object to_ruby<std::vector<int>>(std::vector<int> const & x)
33
+ {
34
+ Array a;
35
+ for (size_t i = 0; i < x.size(); i++) {
36
+ a.push(x[i]);
37
+ }
38
+ return a;
39
+ }
40
+
41
+ template<>
42
+ Object to_ruby<std::vector<unsigned long>>(std::vector<unsigned long> const & x)
43
+ {
44
+ Array a;
45
+ for (size_t i = 0; i < x.size(); i++) {
46
+ a.push(x[i]);
47
+ }
48
+ return a;
49
+ }
50
+
51
+ template<>
52
+ Object to_ruby<std::vector<double>>(std::vector<double> const & x)
53
+ {
54
+ Array a;
55
+ for (size_t i = 0; i < x.size(); i++) {
56
+ a.push(x[i]);
57
+ }
58
+ return a;
59
+ }
60
+
61
+ template<>
62
+ Object to_ruby<ColType>(ColType const & x)
63
+ {
64
+ switch (x) {
65
+ case Numeric: return Symbol("numeric");
66
+ case Categorical: return Symbol("categorical");
67
+ case Ordinal: return Symbol("ordinal");
68
+ case NoType: return Symbol("no_type");
69
+ }
70
+ throw std::runtime_error("Unknown column type");
71
+ }
72
+
73
+ template<>
74
+ Object to_ruby<SplitType>(SplitType const & x)
75
+ {
76
+ switch (x) {
77
+ case LessOrEqual: return Symbol("less_or_equal");
78
+ case Greater: return Symbol("greater");
79
+ case Equal: return Symbol("equal");
80
+ case NotEqual: return Symbol("not_equal");
81
+ case InSubset: return Symbol("in_subset");
82
+ case NotInSubset: return Symbol("not_in_subset");
83
+ case SingleCateg: return Symbol("single_categ");
84
+ case SubTrees: return Symbol("sub_trees");
85
+ case IsNa: return Symbol("is_na");
86
+ case Root: return Symbol("root");
87
+ }
88
+ throw std::runtime_error("Unknown split type");
89
+ }
90
+
91
+ extern "C"
92
+ void Init_ext()
93
+ {
94
+ Module rb_mOutlierTree = define_module("OutlierTree");
95
+ Module rb_mExt = define_module_under(rb_mOutlierTree, "Ext");
96
+
97
+ define_class_under<Cluster>(rb_mExt, "Cluster")
98
+ .define_method("upper_lim", *[](Cluster& self) { return self.upper_lim; })
99
+ .define_method("display_lim_high", *[](Cluster& self) { return self.display_lim_high; })
100
+ .define_method("perc_below", *[](Cluster& self) { return self.perc_below; })
101
+ .define_method("display_lim_low", *[](Cluster& self) { return self.display_lim_low; })
102
+ .define_method("perc_above", *[](Cluster& self) { return self.perc_above; })
103
+ .define_method("display_mean", *[](Cluster& self) { return self.display_mean; })
104
+ .define_method("display_sd", *[](Cluster& self) { return self.display_sd; })
105
+ .define_method("cluster_size", *[](Cluster& self) { return self.cluster_size; })
106
+ .define_method("split_point", *[](Cluster& self) { return self.split_point; })
107
+ .define_method("split_subset", *[](Cluster& self) { return self.split_subset; })
108
+ .define_method("split_lev", *[](Cluster& self) { return self.split_lev; })
109
+ .define_method("split_type", *[](Cluster& self) { return self.split_type; })
110
+ .define_method("column_type", *[](Cluster& self) { return self.column_type; })
111
+ .define_method("has_na_branch", *[](Cluster& self) { return self.has_NA_branch; })
112
+ .define_method("col_num", *[](Cluster& self) { return self.col_num; });
113
+
114
+ define_class_under<ClusterTree>(rb_mExt, "ClusterTree")
115
+ .define_method("parent_branch", *[](ClusterTree& self) { return self.parent_branch; })
116
+ .define_method("parent", *[](ClusterTree& self) { return self.parent; })
117
+ .define_method("all_branches", *[](ClusterTree& self) { return self.all_branches; })
118
+ .define_method("column_type", *[](ClusterTree& self) { return self.column_type; })
119
+ .define_method("col_num", *[](ClusterTree& self) { return self.col_num; })
120
+ .define_method("split_point", *[](ClusterTree& self) { return self.split_point; })
121
+ .define_method("split_subset", *[](ClusterTree& self) { return self.split_subset; })
122
+ .define_method("split_lev", *[](ClusterTree& self) { return self.split_lev; });
123
+
124
+ define_class_under<ModelOutputs>(rb_mExt, "ModelOutputs")
125
+ .define_method("outlier_scores_final", *[](ModelOutputs& self) { return self.outlier_scores_final; })
126
+ .define_method("outlier_columns_final", *[](ModelOutputs& self) { return self.outlier_columns_final; })
127
+ .define_method("outlier_clusters_final", *[](ModelOutputs& self) { return self.outlier_clusters_final; })
128
+ .define_method("outlier_trees_final", *[](ModelOutputs& self) { return self.outlier_trees_final; })
129
+ .define_method("outlier_depth_final", *[](ModelOutputs& self) { return self.outlier_depth_final; })
130
+ .define_method("outlier_decimals_distr", *[](ModelOutputs& self) { return self.outlier_decimals_distr; })
131
+ .define_method("min_decimals_col", *[](ModelOutputs& self) { return self.min_decimals_col; })
132
+ .define_method(
133
+ "all_clusters",
134
+ *[](ModelOutputs& self, size_t i, size_t j) {
135
+ return self.all_clusters[i][j];
136
+ })
137
+ .define_method(
138
+ "all_trees",
139
+ *[](ModelOutputs& self, size_t i, size_t j) {
140
+ return self.all_trees[i][j];
141
+ });
142
+
143
+ rb_mExt
144
+ .define_singleton_method(
145
+ "fit_outliers_models",
146
+ *[](Hash options) {
147
+ ModelOutputs model_outputs;
148
+
149
+ // data
150
+ size_t nrows = options.get<size_t, Symbol>("nrows");
151
+ size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
152
+ size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
153
+ size_t ncols_ord = options.get<size_t, Symbol>("ncols_ord");
154
+
155
+ double *restrict numeric_data = NULL;
156
+ if (ncols_numeric > 0) {
157
+ numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
158
+ }
159
+
160
+ int *restrict categorical_data = NULL;
161
+ int *restrict ncat = NULL;
162
+ if (ncols_categ > 0) {
163
+ categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
164
+ ncat = (int*) options.get<String, Symbol>("ncat").c_str();
165
+ }
166
+
167
+ int *restrict ordinal_data = NULL;
168
+ int *restrict ncat_ord = NULL;
169
+ if (ncols_ord > 0) {
170
+ ordinal_data = (int*) options.get<String, Symbol>("ordinal_data").c_str();
171
+ ncat_ord = (int*) options.get<String, Symbol>("ncat_ord").c_str();
172
+ }
173
+
174
+ // options
175
+ char *restrict cols_ignore = NULL;
176
+ int nthreads = options.get<int, Symbol>("nthreads");
177
+ bool categ_as_bin = options.get<bool, Symbol>("categ_as_bin");
178
+ bool ord_as_bin = options.get<bool, Symbol>("ord_as_bin");
179
+ bool cat_bruteforce_subset = options.get<bool, Symbol>("cat_bruteforce_subset");
180
+ bool categ_from_maj = options.get<bool, Symbol>("categ_from_maj");
181
+ bool take_mid = options.get<bool, Symbol>("take_mid");
182
+ size_t max_depth = options.get<size_t, Symbol>("max_depth");
183
+ double max_perc_outliers = options.get<double, Symbol>("pct_outliers");
184
+ size_t min_size_numeric = options.get<size_t, Symbol>("min_size_numeric");
185
+ size_t min_size_categ = options.get<size_t, Symbol>("min_size_categ");
186
+ double min_gain = options.get<double, Symbol>("min_gain");
187
+ bool gain_as_pct = options.get<bool, Symbol>("gain_as_pct");
188
+ bool follow_all = options.get<bool, Symbol>("follow_all");
189
+ double z_norm = options.get<double, Symbol>("z_norm");
190
+ double z_outlier = options.get<double, Symbol>("z_outlier");
191
+
192
+ fit_outliers_models(
193
+ model_outputs,
194
+ numeric_data,
195
+ ncols_numeric,
196
+ categorical_data,
197
+ ncols_categ,
198
+ ncat,
199
+ ordinal_data,
200
+ ncols_ord,
201
+ ncat_ord,
202
+ nrows,
203
+ cols_ignore,
204
+ nthreads,
205
+ categ_as_bin,
206
+ ord_as_bin,
207
+ cat_bruteforce_subset,
208
+ categ_from_maj,
209
+ take_mid,
210
+ max_depth,
211
+ max_perc_outliers,
212
+ min_size_numeric,
213
+ min_size_categ,
214
+ min_gain,
215
+ gain_as_pct,
216
+ follow_all,
217
+ z_norm,
218
+ z_outlier
219
+ );
220
+ return model_outputs;
221
+ })
222
+ .define_singleton_method(
223
+ "find_new_outliers",
224
+ *[](ModelOutputs& model_outputs, Hash options) {
225
+ // data
226
+ size_t nrows = options.get<size_t, Symbol>("nrows");
227
+ size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
228
+ size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
229
+ size_t ncols_ord = options.get<size_t, Symbol>("ncols_ord");
230
+
231
+ double *restrict numeric_data = NULL;
232
+ if (ncols_numeric > 0) {
233
+ numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
234
+ }
235
+
236
+ int *restrict categorical_data = NULL;
237
+ if (ncols_categ > 0) {
238
+ categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
239
+ }
240
+
241
+ int *restrict ordinal_data = NULL;
242
+ if (ncols_ord > 0) {
243
+ ordinal_data = (int*) options.get<String, Symbol>("ordinal_data").c_str();
244
+ }
245
+
246
+ // options
247
+ int nthreads = options.get<int, Symbol>("nthreads");
248
+
249
+ find_new_outliers(
250
+ numeric_data,
251
+ categorical_data,
252
+ ordinal_data,
253
+ nrows,
254
+ nthreads,
255
+ model_outputs
256
+ );
257
+
258
+ return model_outputs;
259
+ });
260
+ }
@@ -0,0 +1,21 @@
1
+ require "mkmf-rice"
2
+
3
# Compile the extension as C++11.
$CXXFLAGS += " -std=c++11"

# Truthy when the configured compiler reports itself as Apple clang.
uses_apple_clang = RbConfig::CONFIG["CC_VERSION_MESSAGE"] =~ /apple clang/i

# Probe for an OpenMP runtime (omp first, then gomp) and only then enable
# OpenMP; Apple clang gets the flag passed through -Xclang.
openmp_found = have_library("omp") || have_library("gomp")
if openmp_found
  $CXXFLAGS += uses_apple_clang ? " -Xclang -fopenmp" : " -fopenmp"
end

# Directory of this extension and of the vendored OutlierTree sources.
binding_dir = File.expand_path(".", __dir__)
vendor_src_dir = File.expand_path("../../vendor/outliertree/src", __dir__)

# Build every .cc/.cpp file from both directories except the files listed
# here (presumably the R-specific wrappers of the vendored library).
skipped_sources = ["Rwrapper.cpp", "RcppExports.cpp"]
candidate_sources = Dir.glob("{#{binding_dir},#{vendor_src_dir}}/*.{cc,cpp}")
$srcs = candidate_sources.reject do |path|
  skipped_sources.include?(File.basename(path))
end
$INCFLAGS += " -I#{vendor_src_dir}"
$VPATH << vendor_src_dir

create_makefile("outliertree/ext")
@@ -0,0 +1,17 @@
1
+ # ext
2
+ require "outliertree/ext"
3
+
4
+ # stdlib
5
+ require "etc"
6
+
7
+ # modules
8
+ require "outliertree/dataset"
9
+ require "outliertree/model"
10
+ require "outliertree/result"
11
+ require "outliertree/version"
12
+
13
module OutlierTree
  class << self
    # Convenience constructor: forwards every keyword option to Model.new
    # and returns the resulting model.
    def new(**model_options)
      Model.new(**model_options)
    end
  end
end
@@ -0,0 +1,35 @@
1
+ module OutlierTree
2
+ class Dataset
3
+ attr_reader :numeric_columns, :categorical_columns
4
+
5
+ def initialize(data)
6
+ @data = data
7
+
8
+ if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
9
+ @vectors = data.vectors
10
+ @numeric_columns, @categorical_columns = data.keys.partition { |k, v| ![:object, :bool].include?(data[k].type) }
11
+ else
12
+ @vectors = {}
13
+ raise ArgumentError, "Array elements must be hashes" unless data.all? { |d| d.is_a?(Hash) }
14
+ keys = data.flat_map(&:keys).uniq
15
+ keys.each do |k|
16
+ @vectors[k] = []
17
+ end
18
+ data.each do |d|
19
+ keys.each do |k|
20
+ @vectors[k] << d[k]
21
+ end
22
+ end
23
+ @numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
24
+ end
25
+ end
26
+
27
+ def [](k)
28
+ @vectors[k]
29
+ end
30
+
31
+ def size
32
+ @vectors.any? ? @vectors.values.first.size : 0
33
+ end
34
+ end
35
+ end