RubyGems - isotree - Versions diffs - 0.1.2 → 0.1.3 - Mend

isotree 0.1.2 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (10)

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/LICENSE.txt +1 -0
data/README.md +29 -5
data/ext/isotree/ext.cpp +33 -11
data/lib/isotree.rb +1 -0
data/lib/isotree/dataset.rb +73 -0
data/lib/isotree/isolation_forest.rb +82 -29
data/lib/isotree/version.rb +1 -1
metadata +17 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: b15de55d1a752d14cc97e2b5372308b2d4cb6a1e6fcfce0a05da6f769708b189
-  data.tar.gz: af21414cea40a26b2e291230e5d48bf4f804e1c77837a3132921b896bc617961
+  metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
+  data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
 SHA512:
-  metadata.gz: 8127b5402c9c9f03bd2bd475b01a5cc8fbd3900ac1517d401ff4647d634e1f1049c8de51086095b132f30217f3571f8aa9e84c5fd18a0d3ac420a84203da85b7
-  data.tar.gz: 63b26ee19d8c49ce33d61891110db56597221a776830eb2aaad84c6d46038cb30822431a6f30b1051289f6becab0b652d968fbd4cf065c0925d50d5ef769c89a
+  metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
+  data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d

data/CHANGELOG.md CHANGED

@@ -1,3 +1,9 @@
+## 0.1.3 (2020-08-13)
+- Added support for categorical data
+- Added support for Rover data frames
+- Added `output` option to `predict` method
 ## 0.1.2 (2020-08-11)
 - Fixed outlier scores

data/LICENSE.txt CHANGED

@@ -1,5 +1,6 @@
 BSD 2-Clause License
+Copyright (c) 2019, David Cortes
 Copyright (c) 2020, Andrew Kane
 All rights reserved.

data/README.md CHANGED

@@ -4,6 +4,8 @@
 Learn how [Isolation Forest](https://www.youtube.com/watch?v=RyFQXQf4w4w) works
+:deciduous_tree: Check out [OutlierTree](https://github.com/ankane/outliertree) for human-readable explanations of outliers
 [![Build Status](https://travis-ci.org/ankane/isotree.svg?branch=master)](https://travis-ci.org/ankane/isotree)
 ## Installation
@@ -19,20 +21,24 @@ gem 'isotree'
 Prep your data
 ```ruby
-x = [[1, 2], [3, 4], [5, 6], [7, 8]]
+data = [
+  {department: "Books",  sale: false, price: 2.50},
+  {department: "Books",  sale: true,  price: 3.00},
+  {department: "Movies", sale: false, price: 5.00}
+]
 ```
 Train a model
 ```ruby
 model = IsoTree::IsolationForest.new
-model.fit(x)
+model.fit(data)
 ```
 Get outlier scores
 ```ruby
-model.predict(x)
+model.predict(data)
 ```
 Scores are between 0 and 1, with higher scores indicating outliers
@@ -67,10 +73,20 @@ See a [detailed explanation](https://isotree.readthedocs.io/en/latest/#isotree.I
 ## Data
-Data can be an array of arrays
+Data can be an array of hashes
+```ruby
+[
+  {department: "Books",  sale: false, price: 2.50},
+  {department: "Books",  sale: true,  price: 3.00},
+  {department: "Movies", sale: false, price: 5.00}
+]
+```
+Or a Rover data frame
 ```ruby
-[[1, 2, 3], [4, 5, 6]]
+Rover.read_csv("data.csv")
 ```
 Or a Numo array
@@ -94,6 +110,14 @@ gem uninstall isotree --force
 bundle install
 ```
+## Reference
+Get the average isolation depth
+```ruby
+model.predict(data, output: "avg_depth")
+```
 ## History
 View the [changelog](https://github.com/ankane/isotree/blob/master/CHANGELOG.md)

data/ext/isotree/ext.cpp CHANGED

@@ -33,12 +33,22 @@ void Init_ext()
         // data
         size_t nrows = options.get<size_t, Symbol>("nrows");
-        size_t ncols = options.get<size_t, Symbol>("ncols");
-        double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
-        size_t ncols_numeric = ncols;
-        int* categ_data = NULL;
-        size_t ncols_categ = 0;
-        int* ncat = NULL;
+        size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
+        size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
+        double *restrict numeric_data = NULL;
+        if (ncols_numeric > 0) {
+          numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
+        }
+        int *restrict categorical_data = NULL;
+        int *restrict ncat = NULL;
+        if (ncols_categ > 0) {
+          categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
+          ncat = (int*) options.get<String, Symbol>("ncat").c_str();
+        }
+        // not used (sparse matrices)
         double* Xc = NULL;
         sparse_ix* Xc_ind = NULL;
         sparse_ix* Xc_indptr = NULL;
@@ -86,7 +96,7 @@ void Init_ext()
           &iso,
           numeric_data,
           ncols_numeric,
-          categ_data,
+          categorical_data,
           ncols_categ,
           ncat,
           Xc,
@@ -136,8 +146,20 @@ void Init_ext()
       *[](ExtIsoForest& iso, Hash options) {
         // data
         size_t nrows = options.get<size_t, Symbol>("nrows");
-        double* numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
-        int* categ_data = NULL;
+        size_t ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
+        size_t ncols_categ = options.get<size_t, Symbol>("ncols_categ");
+        double *restrict numeric_data = NULL;
+        if (ncols_numeric > 0) {
+          numeric_data = (double*) options.get<String, Symbol>("numeric_data").c_str();
+        }
+        int *restrict categorical_data = NULL;
+        if (ncols_categ > 0) {
+          categorical_data = (int*) options.get<String, Symbol>("categorical_data").c_str();
+        }
+        // not used (sparse matrices)
         double* Xc = NULL;
         sparse_ix* Xc_ind = NULL;
         sparse_ix* Xc_indptr = NULL;
@@ -147,13 +169,13 @@ void Init_ext()
         // options
         int nthreads = options.get<int, Symbol>("nthreads");
-        bool standardize = true;
+        bool standardize = options.get<bool, Symbol>("standardize");
         std::vector<double> outlier_scores(nrows);
         sparse_ix* tree_num = NULL;
         predict_iforest(
           numeric_data,
-          categ_data,
+          categorical_data,
           Xc,
           Xc_ind,
           Xc_indptr,

data/lib/isotree.rb CHANGED

@@ -5,5 +5,6 @@ require "isotree/ext"
 require "etc"
 # modules
+require "isotree/dataset"
 require "isotree/isolation_forest"
 require "isotree/version"

data/lib/isotree/dataset.rb ADDED

@@ -0,0 +1,73 @@
+module IsoTree
+  class Dataset
+    attr_reader :numeric_columns, :categorical_columns, :array_type
+    def initialize(data)
+      @data = data
+      if defined?(Rover::DataFrame) && data.is_a?(Rover::DataFrame)
+        @vectors = data.vectors
+        @numeric_columns, @categorical_columns = data.keys.partition { |k, v| ![:object, :bool].include?(data[k].type) }
+        @array_type = false
+      elsif defined?(Numo::NArray) && data.is_a?(Numo::NArray)
+        raise ArgumentError, "Input must have 2 dimensions" if data.ndim != 2
+        data = data.cast_to(Numo::DFloat)
+        ncols = data.shape[1]
+        @numeric_columns = ncols.times.to_a
+        @categorical_columns = []
+        @vectors = {}
+        @numeric_columns.each do |k|
+          @vectors[k] = data[true, k]
+        end
+        @array_type = true
+      else
+        data = data.to_a
+        hashes = data.all? { |d| d.is_a?(Hash) }
+        arrays = !hashes && data.all? { |d| d.is_a?(Array) }
+        unless hashes || arrays
+          raise ArgumentError, "Array elements must be all hashes or arrays"
+        end
+        nrows = data.size
+        ncols = data.first ? data.first.size : 0
+        if data.any? { |r| r.size != ncols }
+          raise ArgumentError, "All rows must have the same number of columns"
+        end
+        keys =
+          if hashes
+            data.flat_map(&:keys).uniq
+          else
+            ncols.times.to_a
+          end
+        @vectors = {}
+        keys.each do |k|
+          @vectors[k] = []
+        end
+        data.each do |d|
+          keys.each do |k|
+            @vectors[k] << d[k]
+          end
+        end
+        @numeric_columns, @categorical_columns = keys.partition { |k| @vectors[k].all? { |v| v.nil? || v.is_a?(Numeric) } }
+        @array_type = arrays
+      end
+      raise ArgumentError, "No data" if size == 0
+    end
+    def [](k)
+      @vectors[k]
+    end
+    def size
+      @vectors.any? ? @vectors.values.first.size : 0
+    end
+  end
+end

data/lib/isotree/isolation_forest.rb CHANGED

@@ -32,52 +32,105 @@ module IsoTree
     end
     def fit(x)
+      x = Dataset.new(x)
+      prep_fit(x)
       options = data_options(x).merge(fit_options)
       options[:sample_size] ||= options[:nrows]
-      @ncols = options[:ncols]
       @ext_iso_forest = Ext.fit_iforest(options)
     end
-    def predict(x)
+    def predict(x, output: "score")
       raise "Not fit" unless @ext_iso_forest
+      x = Dataset.new(x)
+      prep_predict(x)
       options = data_options(x).merge(nthreads: @nthreads)
-      if options[:ncols] != @ncols
-        raise ArgumentError, "Input must have #{@ncols} columns for this model"
+      case output
+      when "score"
+        options[:standardize] = true
+      when "avg_depth"
+        options[:standardize] = false
+      else
+        raise ArgumentError, "Unknown output"
       end
       Ext.predict_iforest(@ext_iso_forest, options)
     end
     private
-    # TODO support categorical data
-    def data_options(x)
-      if defined?(Numo::NArray) && x.is_a?(Numo::NArray)
-        raise ArgumentError, "Input must have 2 dimensions" if x.ndim != 2
-        x = x.cast_to(Numo::DFloat)
-        nrows, ncols = x.shape
-        numeric_data = String.new
-        ncols.times do |i|
-          numeric_data << x[true, i].to_binary
-        end
-      else
-        x = x.to_a
-        nrows = x.size
-        ncols = x.first ? x.first.size : 0
-        if x.any? { |r| r.size != ncols }
-          raise ArgumentError, "All rows must have the same number of columns"
+    def prep_fit(df)
+      @numeric_columns = df.numeric_columns
+      @categorical_columns = df.categorical_columns
+      @categories = {}
+      @categorical_columns.each do |k|
+        @categories[k] = df[k].uniq.to_a.compact.map.with_index.to_h
+      end
+    end
+    # TODO handle column type mismatches
+    def prep_predict(df)
+      expected_columns = @numeric_columns + @categorical_columns
+      if df.array_type
+        if df.numeric_columns.size + df.categorical_columns.size != expected_columns.size
+          raise ArgumentError, "Input must have #{expected_columns.size} columns for this model"
         end
-        numeric_data = String.new
-        ncols.times do |i|
-          numeric_data << x.map { |v| v[i] }.pack("d*")
+      end
+      expected_columns.each do |k|
+        raise ArgumentError, "Missing column: #{k}" unless df[k]
+      end
+    end
+    def data_options(df)
+      options = {}
+      # numeric
+      numeric_data = String.new
+      @numeric_columns.each do |k|
+        v = df[k]
+        v = v.to_numo if v.respond_to?(:to_numo) # Rover
+        binary_str =
+          if v.respond_to?(:to_binary) # Rover and Numo
+            v.cast_to(Numo::DFloat).to_binary
+          else
+            v.pack("d*")
+          end
+        numeric_data << binary_str
+      end
+      options[:numeric_data] = numeric_data
+      options[:ncols_numeric] = @numeric_columns.size
+      # categorical
+      categorical_data = String.new
+      ncat = String.new
+      @categorical_columns.each do |k|
+        categories = @categories[k]
+        # for unseen values, set to categories.size
+        categories_size = categories.size
+        values = df[k].map { |v| v.nil? ? -1 : (categories[v] || categories_size) }
+        # TODO make more efficient
+        if values.any? { |v| v == categories_size }
+          warn "[isotree] Unseen values in column: #{k}"
         end
+        v = values
+        v = v.to_numo if v.respond_to?(:to_numo) # Rover
+        binary_str =
+          if v.respond_to?(:to_binary) # Rover and Numo
+            v.cast_to(Numo::Int32).to_binary
+          else
+            v.pack("i*")
+          end
+        categorical_data << binary_str
+        ncat << [categories.size].pack("i")
       end
-      raise ArgumentError, "No data" if nrows == 0
+      options[:categorical_data] = categorical_data
+      options[:ncols_categ] = @categorical_columns.size
+      options[:ncat] = ncat
-      {
-        nrows: nrows,
-        ncols: ncols,
-        numeric_data: numeric_data
-      }
+      options[:nrows] = df.size
+      options
     end
     def fit_options

data/lib/isotree/version.rb CHANGED

@@ -1,3 +1,3 @@
 module IsoTree
-  VERSION = "0.1.2"
+  VERSION = "0.1.3"
 end

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: isotree
 version: !ruby/object:Gem::Version
-  version: 0.1.2
+  version: 0.1.3
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-08-11 00:00:00.000000000 Z
+date: 2020-08-13 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice
@@ -94,6 +94,20 @@ dependencies:
     - - ">="
       - !ruby/object:Gem::Version
         version: '0'
+- !ruby/object:Gem::Dependency
+  name: rover-df
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :development
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description:
 email: andrew@chartkick.com
 executables: []
@@ -107,6 +121,7 @@ files:
 - ext/isotree/ext.cpp
 - ext/isotree/extconf.rb
 - lib/isotree.rb
+- lib/isotree/dataset.rb
 - lib/isotree/isolation_forest.rb
 - lib/isotree/version.rb
 - vendor/isotree/LICENSE