isotree 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b15de55d1a752d14cc97e2b5372308b2d4cb6a1e6fcfce0a05da6f769708b189
4
- data.tar.gz: af21414cea40a26b2e291230e5d48bf4f804e1c77837a3132921b896bc617961
3
+ metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
4
+ data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
5
5
  SHA512:
6
- metadata.gz: 8127b5402c9c9f03bd2bd475b01a5cc8fbd3900ac1517d401ff4647d634e1f1049c8de51086095b132f30217f3571f8aa9e84c5fd18a0d3ac420a84203da85b7
7
- data.tar.gz: 63b26ee19d8c49ce33d61891110db56597221a776830eb2aaad84c6d46038cb30822431a6f30b1051289f6becab0b652d968fbd4cf065c0925d50d5ef769c89a
6
+ metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
7
+ data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.1.3 (2020-08-13)
2
+
3
+ - Added support for categorical data
4
+ - Added support for Rover data frames
5
+ - Added `output` option to `predict` method
6
+
1
7
  ## 0.1.2 (2020-08-11)
2
8
 
3
9
  - Fixed outlier scores
@@ -1,5 +1,6 @@
1
1
  BSD 2-Clause License
2
2
 
3
+ Copyright (c) 2019, David Cortes
3
4
  Copyright (c) 2020, Andrew Kane
4
5
  All rights reserved.
5
6
 
data/README.md CHANGED
@@ -4,6 +4,8 @@
4
4
 
5
5
  Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
6
6
 
7
+ :deciduous_tree: Check out [OutlierTree](https://github.com/ankane/outliertree) for human-readable explanations of outliers
8
+
7
9
  [![Build Status](https://travis-ci.org/ankane/isotree.svg?branch=master)](https://travis-ci.org/ankane/isotree)
8
10
 
9
11
  ## Installation
@@ -19,20 +21,24 @@ gem 'isotree'
19
21
  Prep your data
20
22
 
21
23
  ```ruby
22
- x = [[1, 2], [3, 4], [5, 6], [7, 8]]
24
+ data = [
25
+ {department: "Books", sale: false, price: 2.50},
26
+ {department: "Books", sale: true, price: 3.00},
27
+ {department: "Movies", sale: false, price: 5.00}
28
+ ]
23
29
  ```
24
30
 
25
31
  Train a model
26
32
 
27
33
  ```ruby
28
34
  model = IsoTree::IsolationForest.new
29
- model.fit(x)
35
+ model.fit(data)
30
36
  ```
31
37
 
32
38
  Get outlier scores
33
39
 
34
40
  ```ruby
35
- model.predict(x)
41
+ model.predict(data)
36
42
  ```
37
43
 
38
44
  Scores are between 0 and 1, with higher scores indicating outliers
@@ -67,10 +73,20 @@ See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.I
67
73
 
68
74
  ## Data
69
75
 
70
- Data can be an array of arrays
76
+ Data can be an array of hashes
77
+
78
+ ```ruby
79
+ [
80
+ {department: "Books", sale: false, price: 2.50},
81
+ {department: "Books", sale: true, price: 3.00},
82
+ {department: "Movies", sale: false, price: 5.00}
83
+ ]
84
+ ```
85
+
86
+ Or a Rover data frame
71
87
 
72
88
  ```ruby
73
- [[1, 2, 3], [4, 5, 6]]
89
+ Rover.read_csv("data.csv")
74
90
  ```
75
91
 
76
92
  Or a Numo array
@@ -94,6 +110,14 @@ gem uninstall isotree --force
94
110
  bundle install
95
111
  ```
96
112
 
113
+ ## Reference
114
+
115
+ Get the average isolation depth
116
+
117
+ ```ruby
118
+ model.predict(data, output: "avg_depth")
119
+ ```
120
+
97
121
  ## History
98
122
 
99
123
  View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)
data/ext/isotree/ext.cpp CHANGED
@@ -33,12 +33,22 @@ void Init_ext()
33
33
 
34
34
  // data
35
35
  size_t nrows = options.get<size_t, Symbol>("nrows");
36
- size_t ncols = options.get<size_t, Symbol>("ncols");
37
- double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
38
- size_t ncols_numeric = ncols;
39
- int* categ_data = NULL;
40
- size_t ncols_categ = 0;
41
- int* ncat = NULL;
36
+ size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
37
+ size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
38
+
39
+ double *restrict numeric_data = NULL;
40
+ if (ncols_numeric > 0) {
41
+ numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
42
+ }
43
+
44
+ int *restrict categorical_data = NULL;
45
+ int *restrict ncat = NULL;
46
+ if (ncols_categ > 0) {
47
+ categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
48
+ ncat = (int*) options.get<String, Symbol>("ncat").c_str();
49
+ }
50
+
51
+ // not used (sparse matrices)
42
52
  double* Xc = NULL;
43
53
  sparse_ix* Xc_ind = NULL;
44
54
  sparse_ix* Xc_indptr = NULL;
@@ -86,7 +96,7 @@ void Init_ext()
86
96
  &iso,
87
97
  numeric_data,
88
98
  ncols_numeric,
89
- categ_data,
99
+ categorical_data,
90
100
  ncols_categ,
91
101
  ncat,
92
102
  Xc,
@@ -136,8 +146,20 @@ void Init_ext()
136
146
  *[](ExtIsoForest& iso, Hash options) {
137
147
  // data
138
148
  size_t nrows = options.get<size_t, Symbol>("nrows");
139
- double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
140
- int* categ_data = NULL;
149
+ size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
150
+ size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
151
+
152
+ double *restrict numeric_data = NULL;
153
+ if (ncols_numeric > 0) {
154
+ numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
155
+ }
156
+
157
+ int *restrict categorical_data = NULL;
158
+ if (ncols_categ > 0) {
159
+ categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
160
+ }
161
+
162
+ // not used (sparse matrices)
141
163
  double* Xc = NULL;
142
164
  sparse_ix* Xc_ind = NULL;
143
165
  sparse_ix* Xc_indptr = NULL;
@@ -147,13 +169,13 @@ void Init_ext()
147
169
 
148
170
  // options
149
171
  int nthreads = options.get<int, Symbol>("nthreads");
150
- bool standardize = true;
172
+ bool standardize = options.get<bool, Symbol>("standardize");
151
173
  std::vector<double> outlier_scores(nrows);
152
174
  sparse_ix* tree_num = NULL;
153
175
 
154
176
  predict_iforest(
155
177
  numeric_data,
156
- categ_data,
178
+ categorical_data,
157
179
  Xc,
158
180
  Xc_ind,
159
181
  Xc_indptr,
data/lib/isotree.rb CHANGED
@@ -5,5 +5,6 @@ require "isotree/ext"
5
5
  require "etc"
6
6
 
7
7
  # modules
8
+ require "isotree/dataset"
8
9
  require "isotree/isolation_forest"
9
10
  require "isotree/version"
data/lib/isotree/dataset.rb ADDED
@@ -0,0 +1,73 @@
1
+ module IsoTree
2
+ class Dataset
3
+ attr_reader :numeric_columns, :categorical_columns, :array_type
4
+
5
+ def initialize(data)
6
+ @data = data
7
+
8
+ if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
9
+ @vectors = data.vectors
10
+ @numeric_columns, @categorical_columns = data.keys.partition { |k, v| ![:object, :bool].include?(data[k].type) }
11
+ @array_type = false
12
+ elsif defined?(Numo::NArray) && data.is_a?(Numo::NArray)
13
+ raise ArgumentError, "Input must have 2 dimensions" if data.ndim != 2
14
+
15
+ data = data.cast_to(Numo::DFloat)
16
+ ncols = data.shape[1]
17
+
18
+ @numeric_columns = ncols.times.to_a
19
+ @categorical_columns = []
20
+
21
+ @vectors = {}
22
+ @numeric_columns.each do |k|
23
+ @vectors[k] = data[true, k]
24
+ end
25
+ @array_type = true
26
+ else
27
+ data = data.to_a
28
+
29
+ hashes = data.all? { |d| d.is_a?(Hash) }
30
+ arrays = !hashes && data.all? { |d| d.is_a?(Array) }
31
+ unless hashes || arrays
32
+ raise ArgumentError, "Array elements must be all hashes or arrays"
33
+ end
34
+
35
+ nrows = data.size
36
+ ncols = data.first ? data.first.size : 0
37
+ if data.any? { |r| r.size != ncols }
38
+ raise ArgumentError, "All rows must have the same number of columns"
39
+ end
40
+
41
+ keys =
42
+ if hashes
43
+ data.flat_map(&:keys).uniq
44
+ else
45
+ ncols.times.to_a
46
+ end
47
+
48
+ @vectors = {}
49
+ keys.each do |k|
50
+ @vectors[k] = []
51
+ end
52
+ data.each do |d|
53
+ keys.each do |k|
54
+ @vectors[k] << d[k]
55
+ end
56
+ end
57
+
58
+ @numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
59
+ @array_type = arrays
60
+ end
61
+
62
+ raise ArgumentError, "No data" if size == 0
63
+ end
64
+
65
+ def [](k)
66
+ @vectors[k]
67
+ end
68
+
69
+ def size
70
+ @vectors.any? ? @vectors.values.first.size : 0
71
+ end
72
+ end
73
+ end
data/lib/isotree/isolation_forest.rb CHANGED
@@ -32,52 +32,105 @@ module IsoTree
32
32
  end
33
33
 
34
34
  def fit(x)
35
+ x = Dataset.new(x)
36
+ prep_fit(x)
35
37
  options = data_options(x).merge(fit_options)
36
38
  options[:sample_size] ||= options[:nrows]
37
- @ncols = options[:ncols]
38
39
  @ext_iso_forest = Ext.fit_iforest(options)
39
40
  end
40
41
 
41
- def predict(x)
42
+ def predict(x, output: "score")
42
43
  raise "Not fit" unless @ext_iso_forest
44
+
45
+ x = Dataset.new(x)
46
+ prep_predict(x)
47
+
43
48
  options = data_options(x).merge(nthreads: @nthreads)
44
- if options[:ncols] != @ncols
45
- raise ArgumentError, "Input must have #{@ncols} columns for this model"
49
+ case output
50
+ when "score"
51
+ options[:standardize] = true
52
+ when "avg_depth"
53
+ options[:standardize] = false
54
+ else
55
+ raise ArgumentError, "Unknown output"
46
56
  end
57
+
47
58
  Ext.predict_iforest(@ext_iso_forest, options)
48
59
  end
49
60
 
50
61
  private
51
62
 
52
- # TODO support categorical data
53
- def data_options(x)
54
- if defined?(Numo::NArray) && x.is_a?(Numo::NArray)
55
- raise ArgumentError, "Input must have 2 dimensions" if x.ndim != 2
56
- x = x.cast_to(Numo::DFloat)
57
- nrows, ncols = x.shape
58
- numeric_data = String.new
59
- ncols.times do |i|
60
- numeric_data << x[true, i].to_binary
61
- end
62
- else
63
- x = x.to_a
64
- nrows = x.size
65
- ncols = x.first ? x.first.size : 0
66
- if x.any? { |r| r.size != ncols }
67
- raise ArgumentError, "All rows must have the same number of columns"
63
+ def prep_fit(df)
64
+ @numeric_columns = df.numeric_columns
65
+ @categorical_columns = df.categorical_columns
66
+ @categories = {}
67
+ @categorical_columns.each do |k|
68
+ @categories[k] = df[k].uniq.to_a.compact.map.with_index.to_h
69
+ end
70
+ end
71
+
72
+ # TODO handle column type mismatches
73
+ def prep_predict(df)
74
+ expected_columns = @numeric_columns + @categorical_columns
75
+ if df.array_type
76
+ if df.numeric_columns.size + df.categorical_columns.size != expected_columns.size
77
+ raise ArgumentError, "Input must have #{expected_columns.size} columns for this model"
68
78
  end
69
- numeric_data = String.new
70
- ncols.times do |i|
71
- numeric_data << x.map { |v| v[i] }.pack("d*")
79
+ end
80
+ expected_columns.each do |k|
81
+ raise ArgumentError, "Missing column: #{k}" unless df[k]
82
+ end
83
+ end
84
+
85
+ def data_options(df)
86
+ options = {}
87
+
88
+ # numeric
89
+ numeric_data = String.new
90
+ @numeric_columns.each do |k|
91
+ v = df[k]
92
+ v = v.to_numo if v.respond_to?(:to_numo) # Rover
93
+ binary_str =
94
+ if v.respond_to?(:to_binary) # Rover and Numo
95
+ v.cast_to(Numo::DFloat).to_binary
96
+ else
97
+ v.pack("d*")
98
+ end
99
+ numeric_data << binary_str
100
+ end
101
+ options[:numeric_data] = numeric_data
102
+ options[:ncols_numeric] = @numeric_columns.size
103
+
104
+ # categorical
105
+ categorical_data = String.new
106
+ ncat = String.new
107
+ @categorical_columns.each do |k|
108
+ categories = @categories[k]
109
+ # for unseen values, set to categories.size
110
+ categories_size = categories.size
111
+ values = df[k].map { |v| v.nil? ? -1 : (categories[v] || categories_size) }
112
+ # TODO make more efficient
113
+ if values.any? { |v| v == categories_size }
114
+ warn "[isotree] Unseen values in column: #{k}"
72
115
  end
116
+
117
+ v = values
118
+ v = v.to_numo if v.respond_to?(:to_numo) # Rover
119
+ binary_str =
120
+ if v.respond_to?(:to_binary) # Rover and Numo
121
+ v.cast_to(Numo::Int32).to_binary
122
+ else
123
+ v.pack("i*")
124
+ end
125
+ categorical_data << binary_str
126
+ ncat << [categories.size].pack("i")
73
127
  end
74
- raise ArgumentError, "No data" if nrows == 0
128
+ options[:categorical_data] = categorical_data
129
+ options[:ncols_categ] = @categorical_columns.size
130
+ options[:ncat] = ncat
75
131
 
76
- {
77
- nrows: nrows,
78
- ncols: ncols,
79
- numeric_data: numeric_data
80
- }
132
+ options[:nrows] = df.size
133
+ options
81
134
  end
82
135
 
83
136
  def fit_options
data/lib/isotree/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module IsoTree
2
- VERSION = "0.1.2"
2
+ VERSION = "0.1.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isotree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.2
4
+ version: 0.1.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-11 00:00:00.000000000 Z
11
+ date: 2020-08-13 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -94,6 +94,20 @@ dependencies:
94
94
  - - ">="
95
95
  - !ruby/object:Gem::Version
96
96
  version: '0'
97
+ - !ruby/object:Gem::Dependency
98
+ name: rover-df
99
+ requirement: !ruby/object:Gem::Requirement
100
+ requirements:
101
+ - - ">="
102
+ - !ruby/object:Gem::Version
103
+ version: '0'
104
+ type: :development
105
+ prerelease: false
106
+ version_requirements: !ruby/object:Gem::Requirement
107
+ requirements:
108
+ - - ">="
109
+ - !ruby/object:Gem::Version
110
+ version: '0'
97
111
  description:
98
112
  email: andrew@chartkick.com
99
113
  executables: []
@@ -107,6 +121,7 @@ files:
107
121
  - ext/isotree/ext.cpp
108
122
  - ext/isotree/extconf.rb
109
123
  - lib/isotree.rb
124
+ - lib/isotree/dataset.rb
110
125
  - lib/isotree/isolation_forest.rb
111
126
  - lib/isotree/version.rb
112
127
  - vendor/isotree/LICENSE