isotree 0.1.2 → 0.1.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/LICENSE.txt +1 -0
- data/README.md +29 -5
- data/ext/isotree/ext.cpp +33 -11
- data/lib/isotree.rb +1 -0
- data/lib/isotree/dataset.rb +73 -0
- data/lib/isotree/isolation_forest.rb +82 -29
- data/lib/isotree/version.rb +1 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
|
4
|
+
data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
|
7
|
+
data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -4,6 +4,8 @@
|
|
4
4
|
|
5
5
|
Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
|
6
6
|
|
7
|
+
:deciduous_tree: Check out [OutlierTree](https://github.com/ankane/outliertree) for human-readable explanations of outliers
|
8
|
+
|
7
9
|
[![Build Status](https://travis-ci.org/ankane/isotree.svg?branch=master)](https://travis-ci.org/ankane/isotree)
|
8
10
|
|
9
11
|
## Installation
|
@@ -19,20 +21,24 @@ gem 'isotree'
|
|
19
21
|
Prep your data
|
20
22
|
|
21
23
|
```ruby
|
22
|
-
|
24
|
+
data = [
|
25
|
+
{department: "Books", sale: false, price: 2.50},
|
26
|
+
{department: "Books", sale: true, price: 3.00},
|
27
|
+
{department: "Movies", sale: false, price: 5.00}
|
28
|
+
]
|
23
29
|
```
|
24
30
|
|
25
31
|
Train a model
|
26
32
|
|
27
33
|
```ruby
|
28
34
|
model = IsoTree::IsolationForest.new
|
29
|
-
model.fit(
|
35
|
+
model.fit(data)
|
30
36
|
```
|
31
37
|
|
32
38
|
Get outlier scores
|
33
39
|
|
34
40
|
```ruby
|
35
|
-
model.predict(
|
41
|
+
model.predict(data)
|
36
42
|
```
|
37
43
|
|
38
44
|
Scores are between 0 and 1, with higher scores indicating outliers
|
@@ -67,10 +73,20 @@ See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.I
|
|
67
73
|
|
68
74
|
## Data
|
69
75
|
|
70
|
-
Data can be an array of
|
76
|
+
Data can be an array of hashes
|
77
|
+
|
78
|
+
```ruby
|
79
|
+
[
|
80
|
+
{department: "Books", sale: false, price: 2.50},
|
81
|
+
{department: "Books", sale: true, price: 3.00},
|
82
|
+
{department: "Movies", sale: false, price: 5.00}
|
83
|
+
]
|
84
|
+
```
|
85
|
+
|
86
|
+
Or a Rover data frame
|
71
87
|
|
72
88
|
```ruby
|
73
|
-
|
89
|
+
Rover.read_csv("data.csv")
|
74
90
|
```
|
75
91
|
|
76
92
|
Or a Numo array
|
@@ -94,6 +110,14 @@ gem uninstall isotree --force
|
|
94
110
|
bundle install
|
95
111
|
```
|
96
112
|
|
113
|
+
## Reference
|
114
|
+
|
115
|
+
Get the average isolation depth
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
model.predict(data, output: "avg_depth")
|
119
|
+
```
|
120
|
+
|
97
121
|
## History
|
98
122
|
|
99
123
|
View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
|
data/ext/isotree/ext.cpp
CHANGED
@@ -33,12 +33,22 @@ void Init_ext()
|
|
33
33
|
|
34
34
|
// data
|
35
35
|
size_t nrows = options.get<size_t, Symbol>("nrows");
|
36
|
-
size_t
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
36
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
37
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
38
|
+
|
39
|
+
double *restrict numeric_data = NULL;
|
40
|
+
if (ncols_numeric > 0) {
|
41
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
42
|
+
}
|
43
|
+
|
44
|
+
int *restrict categorical_data = NULL;
|
45
|
+
int *restrict ncat = NULL;
|
46
|
+
if (ncols_categ > 0) {
|
47
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
48
|
+
ncat = (int*) options.get<String, Symbol>("ncat").c_str();
|
49
|
+
}
|
50
|
+
|
51
|
+
// not used (sparse matrices)
|
42
52
|
double* Xc = NULL;
|
43
53
|
sparse_ix* Xc_ind = NULL;
|
44
54
|
sparse_ix* Xc_indptr = NULL;
|
@@ -86,7 +96,7 @@ void Init_ext()
|
|
86
96
|
&iso,
|
87
97
|
numeric_data,
|
88
98
|
ncols_numeric,
|
89
|
-
|
99
|
+
categorical_data,
|
90
100
|
ncols_categ,
|
91
101
|
ncat,
|
92
102
|
Xc,
|
@@ -136,8 +146,20 @@ void Init_ext()
|
|
136
146
|
*[](ExtIsoForest& iso, Hash options) {
|
137
147
|
// data
|
138
148
|
size_t nrows = options.get<size_t, Symbol>("nrows");
|
139
|
-
|
140
|
-
|
149
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
150
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
151
|
+
|
152
|
+
double *restrict numeric_data = NULL;
|
153
|
+
if (ncols_numeric > 0) {
|
154
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
155
|
+
}
|
156
|
+
|
157
|
+
int *restrict categorical_data = NULL;
|
158
|
+
if (ncols_categ > 0) {
|
159
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
160
|
+
}
|
161
|
+
|
162
|
+
// not used (sparse matrices)
|
141
163
|
double* Xc = NULL;
|
142
164
|
sparse_ix* Xc_ind = NULL;
|
143
165
|
sparse_ix* Xc_indptr = NULL;
|
@@ -147,13 +169,13 @@ void Init_ext()
|
|
147
169
|
|
148
170
|
// options
|
149
171
|
int nthreads = options.get<int, Symbol>("nthreads");
|
150
|
-
bool standardize =
|
172
|
+
bool standardize = options.get<bool, Symbol>("standardize");
|
151
173
|
std::vector<double> outlier_scores(nrows);
|
152
174
|
sparse_ix* tree_num = NULL;
|
153
175
|
|
154
176
|
predict_iforest(
|
155
177
|
numeric_data,
|
156
|
-
|
178
|
+
categorical_data,
|
157
179
|
Xc,
|
158
180
|
Xc_ind,
|
159
181
|
Xc_indptr,
|
data/lib/isotree.rb
CHANGED
data/lib/isotree/dataset.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
module IsoTree
|
2
|
+
class Dataset
|
3
|
+
attr_reader :numeric_columns, :categorical_columns, :array_type
|
4
|
+
|
5
|
+
def initialize(data)
|
6
|
+
@data = data
|
7
|
+
|
8
|
+
if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
|
9
|
+
@vectors = data.vectors
|
10
|
+
@numeric_columns, @categorical_columns = data.keys.partition { |k, v| ![:object, :bool].include?(data[k].type) }
|
11
|
+
@array_type = false
|
12
|
+
elsif defined?(Numo::NArray) && data.is_a?(Numo::NArray)
|
13
|
+
raise ArgumentError, "Input must have 2 dimensions" if data.ndim != 2
|
14
|
+
|
15
|
+
data = data.cast_to(Numo::DFloat)
|
16
|
+
ncols = data.shape[1]
|
17
|
+
|
18
|
+
@numeric_columns = ncols.times.to_a
|
19
|
+
@categorical_columns = []
|
20
|
+
|
21
|
+
@vectors = {}
|
22
|
+
@numeric_columns.each do |k|
|
23
|
+
@vectors[k] = data[true, k]
|
24
|
+
end
|
25
|
+
@array_type = true
|
26
|
+
else
|
27
|
+
data = data.to_a
|
28
|
+
|
29
|
+
hashes = data.all? { |d| d.is_a?(Hash) }
|
30
|
+
arrays = !hashes && data.all? { |d| d.is_a?(Array) }
|
31
|
+
unless hashes || arrays
|
32
|
+
raise ArgumentError, "Array elements must be all hashes or arrays"
|
33
|
+
end
|
34
|
+
|
35
|
+
nrows = data.size
|
36
|
+
ncols = data.first ? data.first.size : 0
|
37
|
+
if data.any? { |r| r.size != ncols }
|
38
|
+
raise ArgumentError, "All rows must have the same number of columns"
|
39
|
+
end
|
40
|
+
|
41
|
+
keys =
|
42
|
+
if hashes
|
43
|
+
data.flat_map(&:keys).uniq
|
44
|
+
else
|
45
|
+
ncols.times.to_a
|
46
|
+
end
|
47
|
+
|
48
|
+
@vectors = {}
|
49
|
+
keys.each do |k|
|
50
|
+
@vectors[k] = []
|
51
|
+
end
|
52
|
+
data.each do |d|
|
53
|
+
keys.each do |k|
|
54
|
+
@vectors[k] << d[k]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
@numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
|
59
|
+
@array_type = arrays
|
60
|
+
end
|
61
|
+
|
62
|
+
raise ArgumentError, "No data" if size == 0
|
63
|
+
end
|
64
|
+
|
65
|
+
def [](k)
|
66
|
+
@vectors[k]
|
67
|
+
end
|
68
|
+
|
69
|
+
def size
|
70
|
+
@vectors.any? ? @vectors.values.first.size : 0
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
data/lib/isotree/isolation_forest.rb
CHANGED
@@ -32,52 +32,105 @@ module IsoTree
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def fit(x)
|
35
|
+
x = Dataset.new(x)
|
36
|
+
prep_fit(x)
|
35
37
|
options = data_options(x).merge(fit_options)
|
36
38
|
options[:sample_size] ||= options[:nrows]
|
37
|
-
@ncols = options[:ncols]
|
38
39
|
@ext_iso_forest = Ext.fit_iforest(options)
|
39
40
|
end
|
40
41
|
|
41
|
-
def predict(x)
|
42
|
+
def predict(x, output: "score")
|
42
43
|
raise "Not fit" unless @ext_iso_forest
|
44
|
+
|
45
|
+
x = Dataset.new(x)
|
46
|
+
prep_predict(x)
|
47
|
+
|
43
48
|
options = data_options(x).merge(nthreads: @nthreads)
|
44
|
-
|
45
|
-
|
49
|
+
case output
|
50
|
+
when "score"
|
51
|
+
options[:standardize] = true
|
52
|
+
when "avg_depth"
|
53
|
+
options[:standardize] = false
|
54
|
+
else
|
55
|
+
raise ArgumentError, "Unknown output"
|
46
56
|
end
|
57
|
+
|
47
58
|
Ext.predict_iforest(@ext_iso_forest, options)
|
48
59
|
end
|
49
60
|
|
50
61
|
private
|
51
62
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
raise ArgumentError, "All rows must have the same number of columns"
|
63
|
+
def prep_fit(df)
|
64
|
+
@numeric_columns = df.numeric_columns
|
65
|
+
@categorical_columns = df.categorical_columns
|
66
|
+
@categories = {}
|
67
|
+
@categorical_columns.each do |k|
|
68
|
+
@categories[k] = df[k].uniq.to_a.compact.map.with_index.to_h
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# TODO handle column type mismatches
|
73
|
+
def prep_predict(df)
|
74
|
+
expected_columns = @numeric_columns + @categorical_columns
|
75
|
+
if df.array_type
|
76
|
+
if df.numeric_columns.size + df.categorical_columns.size != expected_columns.size
|
77
|
+
raise ArgumentError, "Input must have #{expected_columns.size} columns for this model"
|
68
78
|
end
|
69
|
-
|
70
|
-
|
71
|
-
|
79
|
+
end
|
80
|
+
expected_columns.each do |k|
|
81
|
+
raise ArgumentError, "Missing column: #{k}" unless df[k]
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def data_options(df)
|
86
|
+
options = {}
|
87
|
+
|
88
|
+
# numeric
|
89
|
+
numeric_data = String.new
|
90
|
+
@numeric_columns.each do |k|
|
91
|
+
v = df[k]
|
92
|
+
v = v.to_numo if v.respond_to?(:to_numo) # Rover
|
93
|
+
binary_str =
|
94
|
+
if v.respond_to?(:to_binary) # Rover and Numo
|
95
|
+
v.cast_to(Numo::DFloat).to_binary
|
96
|
+
else
|
97
|
+
v.pack("d*")
|
98
|
+
end
|
99
|
+
numeric_data << binary_str
|
100
|
+
end
|
101
|
+
options[:numeric_data] = numeric_data
|
102
|
+
options[:ncols_numeric] = @numeric_columns.size
|
103
|
+
|
104
|
+
# categorical
|
105
|
+
categorical_data = String.new
|
106
|
+
ncat = String.new
|
107
|
+
@categorical_columns.each do |k|
|
108
|
+
categories = @categories[k]
|
109
|
+
# for unseen values, set to categories.size
|
110
|
+
categories_size = categories.size
|
111
|
+
values = df[k].map { |v| v.nil? ? -1 : (categories[v] || categories_size) }
|
112
|
+
# TODO make more efficient
|
113
|
+
if values.any? { |v| v == categories_size }
|
114
|
+
warn "[isotree] Unseen values in column: #{k}"
|
72
115
|
end
|
116
|
+
|
117
|
+
v = values
|
118
|
+
v = v.to_numo if v.respond_to?(:to_numo) # Rover
|
119
|
+
binary_str =
|
120
|
+
if v.respond_to?(:to_binary) # Rover and Numo
|
121
|
+
v.cast_to(Numo::Int32).to_binary
|
122
|
+
else
|
123
|
+
v.pack("i*")
|
124
|
+
end
|
125
|
+
categorical_data << binary_str
|
126
|
+
ncat << [categories.size].pack("i")
|
73
127
|
end
|
74
|
-
|
128
|
+
options[:categorical_data] = categorical_data
|
129
|
+
options[:ncols_categ] = @categorical_columns.size
|
130
|
+
options[:ncat] = ncat
|
75
131
|
|
76
|
-
|
77
|
-
|
78
|
-
ncols: ncols,
|
79
|
-
numeric_data: numeric_data
|
80
|
-
}
|
132
|
+
options[:nrows] = df.size
|
133
|
+
options
|
81
134
|
end
|
82
135
|
|
83
136
|
def fit_options
|
data/lib/isotree/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isotree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rover-df
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
description:
|
98
112
|
email: andrew@chartkick.com
|
99
113
|
executables: []
|
@@ -107,6 +121,7 @@ files:
|
|
107
121
|
- ext/isotree/ext.cpp
|
108
122
|
- ext/isotree/extconf.rb
|
109
123
|
- lib/isotree.rb
|
124
|
+
- lib/isotree/dataset.rb
|
110
125
|
- lib/isotree/isolation_forest.rb
|
111
126
|
- lib/isotree/version.rb
|
112
127
|
- vendor/isotree/LICENSE
|