isotree 0.1.2 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/LICENSE.txt +1 -0
- data/README.md +29 -5
- data/ext/isotree/ext.cpp +33 -11
- data/lib/isotree.rb +1 -0
- data/lib/isotree/dataset.rb +73 -0
- data/lib/isotree/isolation_forest.rb +82 -29
- data/lib/isotree/version.rb +1 -1
- metadata +17 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
|
4
|
+
data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
|
7
|
+
data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
data/README.md
CHANGED
@@ -4,6 +4,8 @@
|
|
4
4
|
|
5
5
|
Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
|
6
6
|
|
7
|
+
:deciduous_tree: Check out [OutlierTree](https://github.com/ankane/outliertree) for human-readable explanations of outliers
|
8
|
+
|
7
9
|
[Build Status](https://travis-ci.org/ankane/isotree)
|
8
10
|
|
9
11
|
## Installation
|
@@ -19,20 +21,24 @@ gem 'isotree'
|
|
19
21
|
Prep your data
|
20
22
|
|
21
23
|
```ruby
|
22
|
-
|
24
|
+
data = [
|
25
|
+
{department: "Books", sale: false, price: 2.50},
|
26
|
+
{department: "Books", sale: true, price: 3.00},
|
27
|
+
{department: "Movies", sale: false, price: 5.00}
|
28
|
+
]
|
23
29
|
```
|
24
30
|
|
25
31
|
Train a model
|
26
32
|
|
27
33
|
```ruby
|
28
34
|
model = IsoTree::IsolationForest.new
|
29
|
-
model.fit(
|
35
|
+
model.fit(data)
|
30
36
|
```
|
31
37
|
|
32
38
|
Get outlier scores
|
33
39
|
|
34
40
|
```ruby
|
35
|
-
model.predict(
|
41
|
+
model.predict(data)
|
36
42
|
```
|
37
43
|
|
38
44
|
Scores are between 0 and 1, with higher scores indicating outliers
|
@@ -67,10 +73,20 @@ See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.I
|
|
67
73
|
|
68
74
|
## Data
|
69
75
|
|
70
|
-
Data can be an array of
|
76
|
+
Data can be an array of hashes
|
77
|
+
|
78
|
+
```ruby
|
79
|
+
[
|
80
|
+
{department: "Books", sale: false, price: 2.50},
|
81
|
+
{department: "Books", sale: true, price: 3.00},
|
82
|
+
{department: "Movies", sale: false, price: 5.00}
|
83
|
+
]
|
84
|
+
```
|
85
|
+
|
86
|
+
Or a Rover data frame
|
71
87
|
|
72
88
|
```ruby
|
73
|
-
|
89
|
+
Rover.read_csv("data.csv")
|
74
90
|
```
|
75
91
|
|
76
92
|
Or a Numo array
|
@@ -94,6 +110,14 @@ gem uninstall isotree --force
|
|
94
110
|
bundle install
|
95
111
|
```
|
96
112
|
|
113
|
+
## Reference
|
114
|
+
|
115
|
+
Get the average isolation depth
|
116
|
+
|
117
|
+
```ruby
|
118
|
+
model.predict(data, output: "avg_depth")
|
119
|
+
```
|
120
|
+
|
97
121
|
## History
|
98
122
|
|
99
123
|
View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
|
data/ext/isotree/ext.cpp
CHANGED
@@ -33,12 +33,22 @@ void Init_ext()
|
|
33
33
|
|
34
34
|
// data
|
35
35
|
size_t nrows = options.get<size_t, Symbol>("nrows");
|
36
|
-
size_t
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
36
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
37
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
38
|
+
|
39
|
+
double *restrict numeric_data = NULL;
|
40
|
+
if (ncols_numeric > 0) {
|
41
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
42
|
+
}
|
43
|
+
|
44
|
+
int *restrict categorical_data = NULL;
|
45
|
+
int *restrict ncat = NULL;
|
46
|
+
if (ncols_categ > 0) {
|
47
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
48
|
+
ncat = (int*) options.get<String, Symbol>("ncat").c_str();
|
49
|
+
}
|
50
|
+
|
51
|
+
// not used (sparse matrices)
|
42
52
|
double* Xc = NULL;
|
43
53
|
sparse_ix* Xc_ind = NULL;
|
44
54
|
sparse_ix* Xc_indptr = NULL;
|
@@ -86,7 +96,7 @@ void Init_ext()
|
|
86
96
|
&iso,
|
87
97
|
numeric_data,
|
88
98
|
ncols_numeric,
|
89
|
-
|
99
|
+
categorical_data,
|
90
100
|
ncols_categ,
|
91
101
|
ncat,
|
92
102
|
Xc,
|
@@ -136,8 +146,20 @@ void Init_ext()
|
|
136
146
|
*[](ExtIsoForest& iso, Hash options) {
|
137
147
|
// data
|
138
148
|
size_t nrows = options.get<size_t, Symbol>("nrows");
|
139
|
-
|
140
|
-
|
149
|
+
size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
150
|
+
size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
151
|
+
|
152
|
+
double *restrict numeric_data = NULL;
|
153
|
+
if (ncols_numeric > 0) {
|
154
|
+
numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
|
155
|
+
}
|
156
|
+
|
157
|
+
int *restrict categorical_data = NULL;
|
158
|
+
if (ncols_categ > 0) {
|
159
|
+
categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
|
160
|
+
}
|
161
|
+
|
162
|
+
// not used (sparse matrices)
|
141
163
|
double* Xc = NULL;
|
142
164
|
sparse_ix* Xc_ind = NULL;
|
143
165
|
sparse_ix* Xc_indptr = NULL;
|
@@ -147,13 +169,13 @@ void Init_ext()
|
|
147
169
|
|
148
170
|
// options
|
149
171
|
int nthreads = options.get<int, Symbol>("nthreads");
|
150
|
-
bool standardize =
|
172
|
+
bool standardize = options.get<bool, Symbol>("standardize");
|
151
173
|
std::vector<double> outlier_scores(nrows);
|
152
174
|
sparse_ix* tree_num = NULL;
|
153
175
|
|
154
176
|
predict_iforest(
|
155
177
|
numeric_data,
|
156
|
-
|
178
|
+
categorical_data,
|
157
179
|
Xc,
|
158
180
|
Xc_ind,
|
159
181
|
Xc_indptr,
|
data/lib/isotree.rb
CHANGED
data/lib/isotree/dataset.rb
ADDED
@@ -0,0 +1,73 @@
|
|
1
|
+
module IsoTree
|
2
|
+
class Dataset
|
3
|
+
attr_reader :numeric_columns, :categorical_columns, :array_type
|
4
|
+
|
5
|
+
def initialize(data)
|
6
|
+
@data = data
|
7
|
+
|
8
|
+
if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
|
9
|
+
@vectors = data.vectors
|
10
|
+
@numeric_columns, @categorical_columns = data.keys.partition { |k, v| ![:object, :bool].include?(data[k].type) }
|
11
|
+
@array_type = false
|
12
|
+
elsif defined?(Numo::NArray) && data.is_a?(Numo::NArray)
|
13
|
+
raise ArgumentError, "Input must have 2 dimensions" if data.ndim != 2
|
14
|
+
|
15
|
+
data = data.cast_to(Numo::DFloat)
|
16
|
+
ncols = data.shape[1]
|
17
|
+
|
18
|
+
@numeric_columns = ncols.times.to_a
|
19
|
+
@categorical_columns = []
|
20
|
+
|
21
|
+
@vectors = {}
|
22
|
+
@numeric_columns.each do |k|
|
23
|
+
@vectors[k] = data[true, k]
|
24
|
+
end
|
25
|
+
@array_type = true
|
26
|
+
else
|
27
|
+
data = data.to_a
|
28
|
+
|
29
|
+
hashes = data.all? { |d| d.is_a?(Hash) }
|
30
|
+
arrays = !hashes && data.all? { |d| d.is_a?(Array) }
|
31
|
+
unless hashes || arrays
|
32
|
+
raise ArgumentError, "Array elements must be all hashes or arrays"
|
33
|
+
end
|
34
|
+
|
35
|
+
nrows = data.size
|
36
|
+
ncols = data.first ? data.first.size : 0
|
37
|
+
if data.any? { |r| r.size != ncols }
|
38
|
+
raise ArgumentError, "All rows must have the same number of columns"
|
39
|
+
end
|
40
|
+
|
41
|
+
keys =
|
42
|
+
if hashes
|
43
|
+
data.flat_map(&:keys).uniq
|
44
|
+
else
|
45
|
+
ncols.times.to_a
|
46
|
+
end
|
47
|
+
|
48
|
+
@vectors = {}
|
49
|
+
keys.each do |k|
|
50
|
+
@vectors[k] = []
|
51
|
+
end
|
52
|
+
data.each do |d|
|
53
|
+
keys.each do |k|
|
54
|
+
@vectors[k] << d[k]
|
55
|
+
end
|
56
|
+
end
|
57
|
+
|
58
|
+
@numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
|
59
|
+
@array_type = arrays
|
60
|
+
end
|
61
|
+
|
62
|
+
raise ArgumentError, "No data" if size == 0
|
63
|
+
end
|
64
|
+
|
65
|
+
def [](k)
|
66
|
+
@vectors[k]
|
67
|
+
end
|
68
|
+
|
69
|
+
def size
|
70
|
+
@vectors.any? ? @vectors.values.first.size : 0
|
71
|
+
end
|
72
|
+
end
|
73
|
+
end
|
data/lib/isotree/isolation_forest.rb
CHANGED
@@ -32,52 +32,105 @@ module IsoTree
|
|
32
32
|
end
|
33
33
|
|
34
34
|
def fit(x)
|
35
|
+
x = Dataset.new(x)
|
36
|
+
prep_fit(x)
|
35
37
|
options = data_options(x).merge(fit_options)
|
36
38
|
options[:sample_size] ||= options[:nrows]
|
37
|
-
@ncols = options[:ncols]
|
38
39
|
@ext_iso_forest = Ext.fit_iforest(options)
|
39
40
|
end
|
40
41
|
|
41
|
-
def predict(x)
|
42
|
+
def predict(x, output: "score")
|
42
43
|
raise "Not fit" unless @ext_iso_forest
|
44
|
+
|
45
|
+
x = Dataset.new(x)
|
46
|
+
prep_predict(x)
|
47
|
+
|
43
48
|
options = data_options(x).merge(nthreads: @nthreads)
|
44
|
-
|
45
|
-
|
49
|
+
case output
|
50
|
+
when "score"
|
51
|
+
options[:standardize] = true
|
52
|
+
when "avg_depth"
|
53
|
+
options[:standardize] = false
|
54
|
+
else
|
55
|
+
raise ArgumentError, "Unknown output"
|
46
56
|
end
|
57
|
+
|
47
58
|
Ext.predict_iforest(@ext_iso_forest, options)
|
48
59
|
end
|
49
60
|
|
50
61
|
private
|
51
62
|
|
52
|
-
|
53
|
-
|
54
|
-
|
55
|
-
|
56
|
-
|
57
|
-
|
58
|
-
|
59
|
-
|
60
|
-
|
61
|
-
|
62
|
-
|
63
|
-
|
64
|
-
|
65
|
-
|
66
|
-
|
67
|
-
raise ArgumentError, "All rows must have the same number of columns"
|
63
|
+
def prep_fit(df)
|
64
|
+
@numeric_columns = df.numeric_columns
|
65
|
+
@categorical_columns = df.categorical_columns
|
66
|
+
@categories = {}
|
67
|
+
@categorical_columns.each do |k|
|
68
|
+
@categories[k] = df[k].uniq.to_a.compact.map.with_index.to_h
|
69
|
+
end
|
70
|
+
end
|
71
|
+
|
72
|
+
# TODO handle column type mismatches
|
73
|
+
def prep_predict(df)
|
74
|
+
expected_columns = @numeric_columns + @categorical_columns
|
75
|
+
if df.array_type
|
76
|
+
if df.numeric_columns.size + df.categorical_columns.size != expected_columns.size
|
77
|
+
raise ArgumentError, "Input must have #{expected_columns.size} columns for this model"
|
68
78
|
end
|
69
|
-
|
70
|
-
|
71
|
-
|
79
|
+
end
|
80
|
+
expected_columns.each do |k|
|
81
|
+
raise ArgumentError, "Missing column: #{k}" unless df[k]
|
82
|
+
end
|
83
|
+
end
|
84
|
+
|
85
|
+
def data_options(df)
|
86
|
+
options = {}
|
87
|
+
|
88
|
+
# numeric
|
89
|
+
numeric_data = String.new
|
90
|
+
@numeric_columns.each do |k|
|
91
|
+
v = df[k]
|
92
|
+
v = v.to_numo if v.respond_to?(:to_numo) # Rover
|
93
|
+
binary_str =
|
94
|
+
if v.respond_to?(:to_binary) # Rover and Numo
|
95
|
+
v.cast_to(Numo::DFloat).to_binary
|
96
|
+
else
|
97
|
+
v.pack("d*")
|
98
|
+
end
|
99
|
+
numeric_data << binary_str
|
100
|
+
end
|
101
|
+
options[:numeric_data] = numeric_data
|
102
|
+
options[:ncols_numeric] = @numeric_columns.size
|
103
|
+
|
104
|
+
# categorical
|
105
|
+
categorical_data = String.new
|
106
|
+
ncat = String.new
|
107
|
+
@categorical_columns.each do |k|
|
108
|
+
categories = @categories[k]
|
109
|
+
# for unseen values, set to categories.size
|
110
|
+
categories_size = categories.size
|
111
|
+
values = df[k].map { |v| v.nil? ? -1 : (categories[v] || categories_size) }
|
112
|
+
# TODO make more efficient
|
113
|
+
if values.any? { |v| v == categories_size }
|
114
|
+
warn "[isotree] Unseen values in column: #{k}"
|
72
115
|
end
|
116
|
+
|
117
|
+
v = values
|
118
|
+
v = v.to_numo if v.respond_to?(:to_numo) # Rover
|
119
|
+
binary_str =
|
120
|
+
if v.respond_to?(:to_binary) # Rover and Numo
|
121
|
+
v.cast_to(Numo::Int32).to_binary
|
122
|
+
else
|
123
|
+
v.pack("i*")
|
124
|
+
end
|
125
|
+
categorical_data << binary_str
|
126
|
+
ncat << [categories.size].pack("i")
|
73
127
|
end
|
74
|
-
|
128
|
+
options[:categorical_data] = categorical_data
|
129
|
+
options[:ncols_categ] = @categorical_columns.size
|
130
|
+
options[:ncat] = ncat
|
75
131
|
|
76
|
-
|
77
|
-
|
78
|
-
ncols: ncols,
|
79
|
-
numeric_data: numeric_data
|
80
|
-
}
|
132
|
+
options[:nrows] = df.size
|
133
|
+
options
|
81
134
|
end
|
82
135
|
|
83
136
|
def fit_options
|
data/lib/isotree/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isotree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.2
|
4
|
+
version: 0.1.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -94,6 +94,20 @@ dependencies:
|
|
94
94
|
- - ">="
|
95
95
|
- !ruby/object:Gem::Version
|
96
96
|
version: '0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rover-df
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - ">="
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '0'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - ">="
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '0'
|
97
111
|
description:
|
98
112
|
email: andrew@chartkick.com
|
99
113
|
executables: []
|
@@ -107,6 +121,7 @@ files:
|
|
107
121
|
- ext/isotree/ext.cpp
|
108
122
|
- ext/isotree/extconf.rb
|
109
123
|
- lib/isotree.rb
|
124
|
+
- lib/isotree/dataset.rb
|
110
125
|
- lib/isotree/isolation_forest.rb
|
111
126
|
- lib/isotree/version.rb
|
112
127
|
- vendor/isotree/LICENSE
|