RubyGems - isotree - Versions diffs - 0.1.3 → 0.1.4 - Mend

isotree 0.1.3 → 0.1.4

Files changed (9) hide show

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +6 -0
data/ext/isotree/ext.cpp +80 -19
data/lib/isotree/isolation_forest.rb +14 -5
data/lib/isotree/version.rb +1 -1
data/vendor/isotree/README.md +5 -1
data/vendor/isotree/src/fit_model.cpp +19 -8
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
-  data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
+  metadata.gz: 6121fa6837526e05ea0124e69c189dea6e0c94a303a003ed8f5db5e8469e54b3
+  data.tar.gz: eb4a955ffa47876af4ece7ce7006635af4b43e29475055cf0046d61cbb54d443
 SHA512:
-  metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
-  data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d
+  metadata.gz: 22eb873ac7cff7cd5a1e0e7c04e6c47e895c06d8e0e52cd5ee2c4ab31b652203737186fe2dfd84298ea9cdf5327bed0c4d22e67ec9fb787179eae978f922b127
+  data.tar.gz: bcc62a52fa71bf6e3175108a473e834b8a730ea0a94dd8adc017057a23a9bf56d451c3b955b3aafacd2cd9e6f4c237c31ce4f7a3eefd4d11aaed25183def3b92

data/CHANGELOG.md CHANGED

@@ -1,3 +1,8 @@
+## 0.1.4 (2020-08-22)
+- Added `missing_action`, `new_categ_action`, `categ_split_type`, `coefs`, `depth_imp`, and `weigh_imp_rows` options
+- Fixed signal handling
 ## 0.1.3 (2020-08-13)
 - Added support for categorical data

data/README.md CHANGED

@@ -58,12 +58,18 @@ IsoTree::IsolationForest.new(
   prob_split_avg_gain: 0,
   prob_split_pooled_gain: 0,
   min_gain: 0,
+  missing_action: "impute",
+  new_categ_action: "smallest",
+  categ_split_type: "subset",
   all_perm: false,
   coef_by_prop: false,
   sample_with_replacement: false,
   penalize_range: true,
   weigh_by_kurtosis: false,
+  coefs: "normal",
   min_imp_obs: 3,
+  depth_imp: "higher",
+  weigh_imp_rows: "inverse",
   random_seed: 1,
   nthreads: -1
 )

data/ext/isotree/ext.cpp CHANGED

@@ -5,17 +5,77 @@
 #include <rice/Array.hpp>
 #include <rice/Hash.hpp>
 #include <rice/Module.hpp>
+#include <rice/Object.hpp>
 #include <rice/String.hpp>
 #include <rice/Symbol.hpp>
 using Rice::Array;
 using Rice::Hash;
 using Rice::Module;
+using Rice::Object;
 using Rice::String;
 using Rice::Symbol;
 using Rice::define_class_under;
 using Rice::define_module;
+template<>
+NewCategAction from_ruby<NewCategAction>(Object x)
+{
+  auto value = x.to_s().str();
+  if (value == "weighted") return Weighted;
+  if (value == "smallest") return Smallest;
+  if (value == "random") return Random;
+  throw std::runtime_error("Unknown new categ action: " + value);
+}
+template<>
+MissingAction from_ruby<MissingAction>(Object x)
+{
+  auto value = x.to_s().str();
+  if (value == "divide") return Divide;
+  if (value == "impute") return Impute;
+  if (value == "fail") return Fail;
+  throw std::runtime_error("Unknown missing action: " + value);
+}
+template<>
+CategSplit from_ruby<CategSplit>(Object x)
+{
+  auto value = x.to_s().str();
+  if (value == "subset") return SubSet;
+  if (value == "single_categ") return SingleCateg;
+  throw std::runtime_error("Unknown categ split: " + value);
+}
+template<>
+CoefType from_ruby<CoefType>(Object x)
+{
+  auto value = x.to_s().str();
+  if (value == "uniform") return Uniform;
+  if (value == "normal") return Normal;
+  throw std::runtime_error("Unknown coef type: " + value);
+}
+template<>
+UseDepthImp from_ruby<UseDepthImp>(Object x)
+{
+  auto value = x.to_s().str();
+  if (value == "lower") return Lower;
+  if (value == "higher") return Higher;
+  if (value == "same") return Same;
+  throw std::runtime_error("Unknown depth imp: " + value);
+}
+template<>
+WeighImpRows from_ruby<WeighImpRows>(Object x)
+{
+  auto value = x.to_s().str();
+  if (value == "inverse") return Inverse;
+  if (value == "prop") return Prop;
+  if (value == "flat") return Flat;
+  throw std::runtime_error("Unknown weight imp rows: " + value);
+}
 extern "C"
 void Init_ext()
 {
@@ -54,25 +114,7 @@ void Init_ext()
         sparse_ix* Xc_indptr = NULL;
         // options
-        CoefType coef_type = Normal;
-        double* sample_weights = NULL;
-        bool weight_as_sample = false;
-        size_t max_depth = 0;
-        bool limit_depth = true;
-        bool standardize_dist = false;
-        double* tmat = NULL;
-        double* output_depths = NULL;
-        bool standardize_depth = false;
-        double* col_weights = NULL;
-        MissingAction missing_action = Impute;
-        CategSplit cat_split_type = SubSet;
-        NewCategAction new_cat_action = Smallest;
-        Imputer *imputer = NULL;
-        UseDepthImp depth_imp = Higher;
-        WeighImpRows weigh_imp_rows = Inverse;
-        bool impute_at_fit = false;
-        // Rice has limit of 14 arguments, so use hash for options
+        // Rice has limit of 14 arguments, so use hash
         size_t sample_size = options.get<size_t, Symbol>("sample_size");
         size_t ndim = options.get<size_t, Symbol>("ndim");
         size_t ntrees = options.get<size_t, Symbol>("ntrees");
@@ -82,15 +124,34 @@ void Init_ext()
         double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
         double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
         double min_gain = options.get<double, Symbol>("min_gain");
+        MissingAction missing_action = options.get<MissingAction, Symbol>("missing_action");
+        CategSplit cat_split_type = options.get<CategSplit, Symbol>("categ_split_type");
+        NewCategAction new_cat_action = options.get<NewCategAction, Symbol>("new_categ_action");
         bool all_perm = options.get<bool, Symbol>("all_perm");
         bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
         bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
         bool penalize_range = options.get<bool, Symbol>("penalize_range");
         bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
+        CoefType coef_type = options.get<CoefType, Symbol>("coefs");
         size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
+        UseDepthImp depth_imp = options.get<UseDepthImp, Symbol>("depth_imp");
+        WeighImpRows weigh_imp_rows = options.get<WeighImpRows, Symbol>("weigh_imp_rows");
         uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
         int nthreads = options.get<int, Symbol>("nthreads");
+        // TODO options
+        double* sample_weights = NULL;
+        bool weight_as_sample = false;
+        size_t max_depth = 0;
+        bool limit_depth = true;
+        bool standardize_dist = false;
+        double* tmat = NULL;
+        double* output_depths = NULL;
+        bool standardize_depth = false;
+        double* col_weights = NULL;
+        Imputer *imputer = NULL;
+        bool impute_at_fit = false;
         fit_iforest(
           NULL,
           &iso,

data/lib/isotree/isolation_forest.rb CHANGED

@@ -4,9 +4,11 @@ module IsoTree
       sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
       prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
       prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
-      min_gain: 0, all_perm: false, coef_by_prop: false,
+      min_gain: 0, missing_action: "impute", new_categ_action: "smallest",
+      categ_split_type: "subset", all_perm: false, coef_by_prop: false,
       sample_with_replacement: false, penalize_range: true,
-      weigh_by_kurtosis: false, min_imp_obs: 3, random_seed: 1, nthreads: -1
+      weigh_by_kurtosis: false, coefs: "normal", min_imp_obs: 3, depth_imp: "higher",
+      weigh_imp_rows: "inverse", random_seed: 1, nthreads: -1
     )
       @sample_size = sample_size
@@ -18,12 +20,18 @@ module IsoTree
       @prob_split_avg_gain = prob_split_avg_gain
       @prob_split_pooled_gain = prob_split_pooled_gain
       @min_gain = min_gain
+      @missing_action = missing_action
+      @new_categ_action = new_categ_action
+      @categ_split_type = categ_split_type
       @all_perm = all_perm
       @coef_by_prop = coef_by_prop
       @sample_with_replacement = sample_with_replacement
       @penalize_range = penalize_range
       @weigh_by_kurtosis = weigh_by_kurtosis
+      @coefs = coefs
       @min_imp_obs = min_imp_obs
+      @depth_imp = depth_imp
+      @weigh_imp_rows = weigh_imp_rows
       @random_seed = random_seed
       # etc module returns virtual cores
@@ -138,10 +146,11 @@ module IsoTree
         sample_size ntrees ndim ntry
         prob_pick_avg_gain prob_pick_pooled_gain
         prob_split_avg_gain prob_split_pooled_gain
-        min_gain all_perm coef_by_prop
+        min_gain missing_action new_categ_action
+        categ_split_type all_perm coef_by_prop
         sample_with_replacement penalize_range
-        weigh_by_kurtosis min_imp_obs
-        random_seed nthreads
+        weigh_by_kurtosis coefs min_imp_obs depth_imp
+        weigh_imp_rows random_seed nthreads
       )
       options = {}
       keys.each do |k|

data/lib/isotree/version.rb CHANGED

@@ -1,3 +1,3 @@
 module IsoTree
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end

data/vendor/isotree/README.md CHANGED

@@ -1,6 +1,6 @@
 # IsoTree
-Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R.
+Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R. An additional wrapper for Ruby can be found [here](https://github.com/ankane/isotree).
 The new concepts in this software are described in:
 * [Distance approximation using Isolation Forests](https://arxiv.org/abs/1910.12362)
@@ -82,6 +82,10 @@ sudo ldconfig
 (Will build as a shared object - linkage is then done with `-lisotree`)
+* Ruby
+See [external repository with wrapper](https://github.com/ankane/isotree).
 # Sample usage
 **Warning: default parameters in this implementation are very different from default parameters in others such as SciKit-Learn's, and these defaults won't scale to large datasets (see documentation for details).**

data/vendor/isotree/src/fit_model.cpp CHANGED

@@ -1,7 +1,7 @@
 /*    Isolation forests and variations thereof, with adjustments for incorporation
 *     of categorical variables and missing values.
 *     Written for C++11 standard and aimed at being used in R and Python.
-*
+*
 *     This library is based on the following works:
 *     [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
 *         "Isolation forest."
@@ -20,7 +20,7 @@
 *     [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
 *     [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
 *     [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
-*
+*
 *     BSD 2-Clause License
 *     Copyright (c) 2019, David Cortes
 *     All rights reserved.
@@ -47,7 +47,7 @@
 bool interrupt_switch;
 /*  Fit Isolation Forest model, or variant of it such as SCiForest
-*
+*
 * Parameters:
 * ===========
 * - model_outputs (out)
@@ -291,7 +291,7 @@ bool interrupt_switch;
 *       Number of parallel threads to use. Note that, the more threads, the more memory will be
 *       allocated, even if the thread does not end up being used. Ignored when not building with
 *       OpenMP support.
-*
+*
 * Returns
 * =======
 * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
@@ -300,7 +300,7 @@ bool interrupt_switch;
 * what these values correspond to, you can use the functions
 * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
 * as integers.
-*
+*
 * References
 * ==========
 * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
@@ -418,6 +418,13 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
     /* Global variable that determines if the procedure receives a stop signal */
     interrupt_switch = false;
+    /* TODO: find a better way of handling interrupt signals when calling in Python/R.
+       The following will still change the behavior of interrupts when called through e.g. Flask */
+    #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
+    struct sigaction sig_handle;
+    sig_handle.sa_flags = SA_RESETHAND;
+    sig_handle.sa_handler = set_interrup_global_variable;
+    #endif
     /* grow trees */
     #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
@@ -461,7 +468,11 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
         else
             model_outputs_ext->hplanes[tree].shrink_to_fit();
-        signal(SIGINT, set_interrup_global_variable);
+        #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
+        // sigaction(SIGINT, &sig_handle, NULL);
+        #else
+        // signal(SIGINT, set_interrup_global_variable);
+        #endif
     }
     /* check if the procedure got interrupted */
@@ -545,7 +556,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
 /* Add additional trees to already-fitted isolation forest model
-*
+*
 * Parameters
 * ==========
 * - model_outputs
@@ -1001,7 +1012,7 @@ void fit_itree(std::vector<IsoTree>    *tree_root,
         if (
             model_params.cat_split_type == SubSet &&
             (
-                model_params.prob_pick_by_gain_avg  ||
+                model_params.prob_pick_by_gain_avg  ||
                 model_params.prob_pick_by_gain_pl
             )
            )

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: isotree
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-08-13 00:00:00.000000000 Z
+date: 2020-08-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice