isotree 0.1.2 → 0.1.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b15de55d1a752d14cc97e2b5372308b2d4cb6a1e6fcfce0a05da6f769708b189
4
- data.tar.gz: af21414cea40a26b2e291230e5d48bf4f804e1c77837a3132921b896bc617961
3
+ metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
4
+ data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
5
5
  SHA512:
6
- metadata.gz: 8127b5402c9c9f03bd2bd475b01a5cc8fbd3900ac1517d401ff4647d634e1f1049c8de51086095b132f30217f3571f8aa9e84c5fd18a0d3ac420a84203da85b7
7
- data.tar.gz: 63b26ee19d8c49ce33d61891110db56597221a776830eb2aaad84c6d46038cb30822431a6f30b1051289f6becab0b652d968fbd4cf065c0925d50d5ef769c89a
6
+ metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
7
+ data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d
@@ -1,3 +1,9 @@
1
+ ## 0.1.3 (2020-08-13)
2
+
3
+ - Added support for categorical data
4
+ - Added support for Rover data frames
5
+ - Added `output` option to `predict` method
6
+
1
7
  ## 0.1.2 (2020-08-11)
2
8
 
3
9
  - Fixed outlier scores
@@ -1,5 +1,6 @@
1
1
  BSD 2-Clause License
2
2
 
3
+ Copyright (c) 2019, David Cortes
3
4
  Copyright (c) 2020, Andrew Kane
4
5
  All rights reserved.
5
6
 
data/README.md CHANGED
@@ -4,6 +4,8 @@
4
4
 
5
5
  Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
6
6
 
7
+ :deciduous_tree: Check out [OutlierTree](https://github.com/ankane/outliertree) for human-readable explanations of outliers
8
+
7
9
  [![Build Status](https://travis-ci.org/ankane/isotree.svg?branch=master)](https://travis-ci.org/ankane/isotree)
8
10
 
9
11
  ## Installation
@@ -19,20 +21,24 @@ gem 'isotree'
19
21
  Prep your data
20
22
 
21
23
  ```ruby
22
- x = [[1, 2], [3, 4], [5, 6], [7, 8]]
24
+ data = [
25
+ {department: "Books", sale: false, price: 2.50},
26
+ {department: "Books", sale: true, price: 3.00},
27
+ {department: "Movies", sale: false, price: 5.00}
28
+ ]
23
29
  ```
24
30
 
25
31
  Train a model
26
32
 
27
33
  ```ruby
28
34
  model = IsoTree::IsolationForest.new
29
- model.fit(x)
35
+ model.fit(data)
30
36
  ```
31
37
 
32
38
  Get outlier scores
33
39
 
34
40
  ```ruby
35
- model.predict(x)
41
+ model.predict(data)
36
42
  ```
37
43
 
38
44
  Scores are between 0 and 1, with higher scores indicating outliers
@@ -67,10 +73,20 @@ See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.I
67
73
 
68
74
  ## Data
69
75
 
70
- Data can be an array of arrays
76
+ Data can be an array of hashes
77
+
78
+ ```ruby
79
+ [
80
+ {department: "Books", sale: false, price: 2.50},
81
+ {department: "Books", sale: true, price: 3.00},
82
+ {department: "Movies", sale: false, price: 5.00}
83
+ ]
84
+ ```
85
+
86
+ Or a Rover data frame
71
87
 
72
88
  ```ruby
73
- [[1, 2, 3], [4, 5, 6]]
89
+ Rover.read_csv("data.csv")
74
90
  ```
75
91
 
76
92
  Or a Numo array
@@ -94,6 +110,14 @@ gem uninstall isotree --force
94
110
  bundle install
95
111
  ```
96
112
 
113
+ ## Reference
114
+
115
+ Get the average isolation depth
116
+
117
+ ```ruby
118
+ model.predict(data, output: "avg_depth")
119
+ ```
120
+
97
121
  ## History
98
122
 
99
123
  View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
@@ -33,12 +33,22 @@ void Init_ext()
33
33
 
34
34
  // data
35
35
  size_t nrows = options.get<size_t, Symbol>("nrows");
36
- size_t ncols = options.get<size_t, Symbol>("ncols");
37
- double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
38
- size_t ncols_numeric = ncols;
39
- int* categ_data = NULL;
40
- size_t ncols_categ = 0;
41
- int* ncat = NULL;
36
+ size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
37
+ size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
38
+
39
+ double *restrict numeric_data = NULL;
40
+ if (ncols_numeric > 0) {
41
+ numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
42
+ }
43
+
44
+ int *restrict categorical_data = NULL;
45
+ int *restrict ncat = NULL;
46
+ if (ncols_categ > 0) {
47
+ categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
48
+ ncat = (int*) options.get<String, Symbol>("ncat").c_str();
49
+ }
50
+
51
+ // not used (sparse matrices)
42
52
  double* Xc = NULL;
43
53
  sparse_ix* Xc_ind = NULL;
44
54
  sparse_ix* Xc_indptr = NULL;
@@ -86,7 +96,7 @@ void Init_ext()
86
96
  &iso,
87
97
  numeric_data,
88
98
  ncols_numeric,
89
- categ_data,
99
+ categorical_data,
90
100
  ncols_categ,
91
101
  ncat,
92
102
  Xc,
@@ -136,8 +146,20 @@ void Init_ext()
136
146
  *[](ExtIsoForest& iso, Hash options) {
137
147
  // data
138
148
  size_t nrows = options.get<size_t, Symbol>("nrows");
139
- double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
140
- int* categ_data = NULL;
149
+ size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
150
+ size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
151
+
152
+ double *restrict numeric_data = NULL;
153
+ if (ncols_numeric > 0) {
154
+ numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
155
+ }
156
+
157
+ int *restrict categorical_data = NULL;
158
+ if (ncols_categ > 0) {
159
+ categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
160
+ }
161
+
162
+ // not used (sparse matrices)
141
163
  double* Xc = NULL;
142
164
  sparse_ix* Xc_ind = NULL;
143
165
  sparse_ix* Xc_indptr = NULL;
@@ -147,13 +169,13 @@ void Init_ext()
147
169
 
148
170
  // options
149
171
  int nthreads = options.get<int, Symbol>("nthreads");
150
- bool standardize = true;
172
+ bool standardize = options.get<bool, Symbol>("standardize");
151
173
  std::vector<double> outlier_scores(nrows);
152
174
  sparse_ix* tree_num = NULL;
153
175
 
154
176
  predict_iforest(
155
177
  numeric_data,
156
- categ_data,
178
+ categorical_data,
157
179
  Xc,
158
180
  Xc_ind,
159
181
  Xc_indptr,
@@ -5,5 +5,6 @@ require "isotree/ext"
5
5
  require "etc"
6
6
 
7
7
  # modules
8
+ require "isotree/dataset"
8
9
  require "isotree/isolation_forest"
9
10
  require "isotree/version"
@@ -0,0 +1,73 @@
1
+ module IsoTree
2
+ class Dataset
3
+ attr_reader :numeric_columns, :categorical_columns, :array_type
4
+
5
+ def initialize(data)
6
+ @data = data
7
+
8
+ if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
9
+ @vectors = data.vectors
10
+ @numeric_columns, @categorical_columns = data.keys.partition { |k, v| ![:object, :bool].include?(data[k].type) }
11
+ @array_type = false
12
+ elsif defined?(Numo::NArray) && data.is_a?(Numo::NArray)
13
+ raise ArgumentError, "Input must have 2 dimensions" if data.ndim != 2
14
+
15
+ data = data.cast_to(Numo::DFloat)
16
+ ncols = data.shape[1]
17
+
18
+ @numeric_columns = ncols.times.to_a
19
+ @categorical_columns = []
20
+
21
+ @vectors = {}
22
+ @numeric_columns.each do |k|
23
+ @vectors[k] = data[true, k]
24
+ end
25
+ @array_type = true
26
+ else
27
+ data = data.to_a
28
+
29
+ hashes = data.all? { |d| d.is_a?(Hash) }
30
+ arrays = !hashes && data.all? { |d| d.is_a?(Array) }
31
+ unless hashes || arrays
32
+ raise ArgumentError, "Array elements must be all hashes or arrays"
33
+ end
34
+
35
+ nrows = data.size
36
+ ncols = data.first ? data.first.size : 0
37
+ if data.any? { |r| r.size != ncols }
38
+ raise ArgumentError, "All rows must have the same number of columns"
39
+ end
40
+
41
+ keys =
42
+ if hashes
43
+ data.flat_map(&:keys).uniq
44
+ else
45
+ ncols.times.to_a
46
+ end
47
+
48
+ @vectors = {}
49
+ keys.each do |k|
50
+ @vectors[k] = []
51
+ end
52
+ data.each do |d|
53
+ keys.each do |k|
54
+ @vectors[k] << d[k]
55
+ end
56
+ end
57
+
58
+ @numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
59
+ @array_type = arrays
60
+ end
61
+
62
+ raise ArgumentError, "No data" if size == 0
63
+ end
64
+
65
+ def [](k)
66
+ @vectors[k]
67
+ end
68
+
69
+ def size
70
+ @vectors.any? ? @vectors.values.first.size : 0
71
+ end
72
+ end
73
+ end
@@ -32,52 +32,105 @@ module IsoTree
32
32
  end
33
33
 
34
34
  def fit(x)
35
+ x = Dataset.new(x)
36
+ prep_fit(x)
35
37
  options = data_options(x).merge(fit_options)
36
38
  options[:sample_size] ||= options[:nrows]
37
- @ncols = options[:ncols]
38
39
  @ext_iso_forest = Ext.fit_iforest(options)
39
40
  end
40
41
 
41
- def predict(x)
42
+ def predict(x, output: "score")
42
43
  raise "Not fit" unless @ext_iso_forest
44
+
45
+ x = Dataset.new(x)
46
+ prep_predict(x)
47
+
43
48
  options = data_options(x).merge(nthreads: @nthreads)
44
- if options[:ncols] != @ncols
45
- raise ArgumentError, "Input must have #{@ncols} columns for this model"
49
+ case output
50
+ when "score"
51
+ options[:standardize] = true
52
+ when "avg_depth"
53
+ options[:standardize] = false
54
+ else
55
+ raise ArgumentError, "Unknown output"
46
56
  end
57
+
47
58
  Ext.predict_iforest(@ext_iso_forest, options)
48
59
  end
49
60
 
50
61
  private
51
62
 
52
- # TODO support categorical data
53
- def data_options(x)
54
- if defined?(Numo::NArray) && x.is_a?(Numo::NArray)
55
- raise ArgumentError, "Input must have 2 dimensions" if x.ndim != 2
56
- x = x.cast_to(Numo::DFloat)
57
- nrows, ncols = x.shape
58
- numeric_data = String.new
59
- ncols.times do |i|
60
- numeric_data << x[true, i].to_binary
61
- end
62
- else
63
- x = x.to_a
64
- nrows = x.size
65
- ncols = x.first ? x.first.size : 0
66
- if x.any? { |r| r.size != ncols }
67
- raise ArgumentError, "All rows must have the same number of columns"
63
+ def prep_fit(df)
64
+ @numeric_columns = df.numeric_columns
65
+ @categorical_columns = df.categorical_columns
66
+ @categories = {}
67
+ @categorical_columns.each do |k|
68
+ @categories[k] = df[k].uniq.to_a.compact.map.with_index.to_h
69
+ end
70
+ end
71
+
72
+ # TODO handle column type mismatches
73
+ def prep_predict(df)
74
+ expected_columns = @numeric_columns + @categorical_columns
75
+ if df.array_type
76
+ if df.numeric_columns.size + df.categorical_columns.size != expected_columns.size
77
+ raise ArgumentError, "Input must have #{expected_columns.size} columns for this model"
68
78
  end
69
- numeric_data = String.new
70
- ncols.times do |i|
71
- numeric_data << x.map { |v| v[i] }.pack("d*")
79
+ end
80
+ expected_columns.each do |k|
81
+ raise ArgumentError, "Missing column: #{k}" unless df[k]
82
+ end
83
+ end
84
+
85
+ def data_options(df)
86
+ options = {}
87
+
88
+ # numeric
89
+ numeric_data = String.new
90
+ @numeric_columns.each do |k|
91
+ v = df[k]
92
+ v = v.to_numo if v.respond_to?(:to_numo) # Rover
93
+ binary_str =
94
+ if v.respond_to?(:to_binary) # Rover and Numo
95
+ v.cast_to(Numo::DFloat).to_binary
96
+ else
97
+ v.pack("d*")
98
+ end
99
+ numeric_data << binary_str
100
+ end
101
+ options[:numeric_data] = numeric_data
102
+ options[:ncols_numeric] = @numeric_columns.size
103
+
104
+ # categorical
105
+ categorical_data = String.new
106
+ ncat = String.new
107
+ @categorical_columns.each do |k|
108
+ categories = @categories[k]
109
+ # for unseen values, set to categories.size
110
+ categories_size = categories.size
111
+ values = df[k].map { |v| v.nil? ? -1 : (categories[v] || categories_size) }
112
+ # TODO make more efficient
113
+ if values.any? { |v| v == categories_size }
114
+ warn "[isotree] Unseen values in column: #{k}"
72
115
  end
116
+
117
+ v = values
118
+ v = v.to_numo if v.respond_to?(:to_numo) # Rover
119
+ binary_str =
120
+ if v.respond_to?(:to_binary) # Rover and Numo
121
+ v.cast_to(Numo::Int32).to_binary
122
+ else
123
+ v.pack("i*")
124
+ end
125
+ categorical_data << binary_str
126
+ ncat << [categories.size].pack("i")
73
127
  end
74
- raise ArgumentError, "No data" if nrows == 0
128
+ options[:categorical_data] = categorical_data
129
+ options[:ncols_categ] = @categorical_columns.size
130
+ options[:ncat] = ncat
75
131
 
76
- {
77
- nrows: nrows,
78
- ncols: ncols,
79
- numeric_data: numeric_data
80
- }
132
+ options[:nrows] = df.size
133
+ options
81
134
  end
82
135
 
83
136
  def fit_options
@@ -1,3 +1,3 @@
1
1
  module IsoTree
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isotree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-11 00:00:00.000000000 Z
11
+ date: 2020-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rover-df
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  description:
98
112
  email: andrew@chartkick.com
99
113
  executables: []
@@ -107,6 +121,7 @@ files:
107
121
  - ext/isotree/ext.cpp
108
122
  - ext/isotree/extconf.rb
109
123
  - lib/isotree.rb
124
+ - lib/isotree/dataset.rb
110
125
  - lib/isotree/isolation_forest.rb
111
126
  - lib/isotree/version.rb
112
127
  - vendor/isotree/LICENSE