RubyGems - isotree - Versions diffs - 0.1.3 → 0.1.4 - Mend

isotree 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +5 -0
data/README.md +6 -0
data/ext/isotree/ext.cpp +80 -19
data/lib/isotree/isolation_forest.rb +14 -5
data/lib/isotree/version.rb +1 -1
data/vendor/isotree/README.md +5 -1
data/vendor/isotree/src/fit_model.cpp +19 -8
metadata +2 -2

checksums.yaml CHANGED

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
-  data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
+  metadata.gz: 6121fa6837526e05ea0124e69c189dea6e0c94a303a003ed8f5db5e8469e54b3
+  data.tar.gz: eb4a955ffa47876af4ece7ce7006635af4b43e29475055cf0046d61cbb54d443
 SHA512:
-  metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
-  data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d
+  metadata.gz: 22eb873ac7cff7cd5a1e0e7c04e6c47e895c06d8e0e52cd5ee2c4ab31b652203737186fe2dfd84298ea9cdf5327bed0c4d22e67ec9fb787179eae978f922b127
+  data.tar.gz: bcc62a52fa71bf6e3175108a473e834b8a730ea0a94dd8adc017057a23a9bf56d451c3b955b3aafacd2cd9e6f4c237c31ce4f7a3eefd4d11aaed25183def3b92

data/CHANGELOG.md CHANGED

@@ -1,3 +1,8 @@
+## 0.1.4 (2020-08-22)
+- Added `missing_action`, `new_categ_action`, `categ_split_type`, `coefs`, `depth_imp`, and `weigh_imp_rows` options
+- Fixed signal handling
 ## 0.1.3 (2020-08-13)
 - Added support for categorical data

data/README.md CHANGED

@@ -58,12 +58,18 @@ IsoTree::IsolationForest.new(
   prob_split_avg_gain: 0,
   prob_split_pooled_gain: 0,
   min_gain: 0,
+  missing_action: "impute",
+  new_categ_action: "smallest",
+  categ_split_type: "subset",
   all_perm: false,
   coef_by_prop: false,
   sample_with_replacement: false,
   penalize_range: true,
   weigh_by_kurtosis: false,
+  coefs: "normal",
   min_imp_obs: 3,
+  depth_imp: "higher",
+  weigh_imp_rows: "inverse",
   random_seed: 1,
   nthreads: -1
 )

data/ext/isotree/ext.cpp CHANGED

@@ -5,17 +5,77 @@
 #include <rice/Array.hpp>
 #include <rice/Hash.hpp>
 #include <rice/Module.hpp>
+#include <rice/Object.hpp>
 #include <rice/String.hpp>
 #include <rice/Symbol.hpp>
 using Rice::Array;
 using Rice::Hash;
 using Rice::Module;
+using Rice::Object;
 using Rice::String;
 using Rice::Symbol;
 using Rice::define_class_under;
 using Rice::define_module;
+template<>  // explicit specialization of from_ruby: maps a Ruby object to a NewCategAction enumerator
+NewCategAction from_ruby<NewCategAction>(Object x)  // presumably what options.get<NewCategAction, Symbol>(...) relies on; confirm against Rice docs
+{
+  auto value = x.to_s().str();  // text of x.to_s(); matching below is exact and case-sensitive
+  if (value == "weighted") return Weighted;
+  if (value == "smallest") return Smallest;
+  if (value == "random") return Random;
+  throw std::runtime_error("Unknown new categ action: " + value);  // any other text is rejected, echoing the offending value
+}
+template<>  // explicit specialization of from_ruby: maps a Ruby object to a MissingAction enumerator
+MissingAction from_ruby<MissingAction>(Object x)  // presumably what options.get<MissingAction, Symbol>(...) relies on; confirm against Rice docs
+{
+  auto value = x.to_s().str();  // text of x.to_s(); matching below is exact and case-sensitive
+  if (value == "divide") return Divide;
+  if (value == "impute") return Impute;
+  if (value == "fail") return Fail;
+  throw std::runtime_error("Unknown missing action: " + value);  // any other text is rejected, echoing the offending value
+}
+template<>  // explicit specialization of from_ruby: maps a Ruby object to a CategSplit enumerator
+CategSplit from_ruby<CategSplit>(Object x)  // presumably what options.get<CategSplit, Symbol>(...) relies on; confirm against Rice docs
+{
+  auto value = x.to_s().str();  // text of x.to_s(); matching below is exact and case-sensitive
+  if (value == "subset") return SubSet;
+  if (value == "single_categ") return SingleCateg;
+  throw std::runtime_error("Unknown categ split: " + value);  // any other text is rejected, echoing the offending value
+}
+template<>  // explicit specialization of from_ruby: maps a Ruby object to a CoefType enumerator
+CoefType from_ruby<CoefType>(Object x)  // presumably what options.get<CoefType, Symbol>(...) relies on; confirm against Rice docs
+{
+  auto value = x.to_s().str();  // text of x.to_s(); matching below is exact and case-sensitive
+  if (value == "uniform") return Uniform;
+  if (value == "normal") return Normal;
+  throw std::runtime_error("Unknown coef type: " + value);  // any other text is rejected, echoing the offending value
+}
+template<>  // explicit specialization of from_ruby: maps a Ruby object to a UseDepthImp enumerator
+UseDepthImp from_ruby<UseDepthImp>(Object x)  // presumably what options.get<UseDepthImp, Symbol>(...) relies on; confirm against Rice docs
+{
+  auto value = x.to_s().str();  // text of x.to_s(); matching below is exact and case-sensitive
+  if (value == "lower") return Lower;
+  if (value == "higher") return Higher;
+  if (value == "same") return Same;
+  throw std::runtime_error("Unknown depth imp: " + value);  // any other text is rejected, echoing the offending value
+}
+template<>  // explicit specialization of from_ruby: maps a Ruby object to a WeighImpRows enumerator
+WeighImpRows from_ruby<WeighImpRows>(Object x)  // presumably what options.get<WeighImpRows, Symbol>(...) relies on; confirm against Rice docs
+{
+  auto value = x.to_s().str();  // text of x.to_s(); matching below is exact and case-sensitive
+  if (value == "inverse") return Inverse;
+  if (value == "prop") return Prop;
+  if (value == "flat") return Flat;
+  throw std::runtime_error("Unknown weight imp rows: " + value);  // NOTE(review): message says "weight" while the option key is weigh_imp_rows; confirm intended wording
+}
 extern "C"
 void Init_ext()
 {
@@ -54,25 +114,7 @@ void Init_ext()
         sparse_ix* Xc_indptr = NULL;
         // options
-        CoefType coef_type = Normal;
-        double* sample_weights = NULL;
-        bool weight_as_sample = false;
-        size_t max_depth = 0;
-        bool limit_depth = true;
-        bool standardize_dist = false;
-        double* tmat = NULL;
-        double* output_depths = NULL;
-        bool standardize_depth = false;
-        double* col_weights = NULL;
-        MissingAction missing_action = Impute;
-        CategSplit cat_split_type = SubSet;
-        NewCategAction new_cat_action = Smallest;
-        Imputer *imputer = NULL;
-        UseDepthImp depth_imp = Higher;
-        WeighImpRows weigh_imp_rows = Inverse;
-        bool impute_at_fit = false;
-        // Rice has limit of 14 arguments, so use hash for options
+        // Rice has limit of 14 arguments, so use hash
         size_t sample_size = options.get<size_t, Symbol>("sample_size");
         size_t ndim = options.get<size_t, Symbol>("ndim");
         size_t ntrees = options.get<size_t, Symbol>("ntrees");
@@ -82,15 +124,34 @@ void Init_ext()
         double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
         double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
         double min_gain = options.get<double, Symbol>("min_gain");
+        MissingAction missing_action = options.get<MissingAction, Symbol>("missing_action");
+        CategSplit cat_split_type = options.get<CategSplit, Symbol>("categ_split_type");
+        NewCategAction new_cat_action = options.get<NewCategAction, Symbol>("new_categ_action");
         bool all_perm = options.get<bool, Symbol>("all_perm");
         bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
         bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
         bool penalize_range = options.get<bool, Symbol>("penalize_range");
         bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
+        CoefType coef_type = options.get<CoefType, Symbol>("coefs");
         size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
+        UseDepthImp depth_imp = options.get<UseDepthImp, Symbol>("depth_imp");
+        WeighImpRows weigh_imp_rows = options.get<WeighImpRows, Symbol>("weigh_imp_rows");
         uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
         int nthreads = options.get<int, Symbol>("nthreads");
+        // TODO options
+        double* sample_weights = NULL;
+        bool weight_as_sample = false;
+        size_t max_depth = 0;
+        bool limit_depth = true;
+        bool standardize_dist = false;
+        double* tmat = NULL;
+        double* output_depths = NULL;
+        bool standardize_depth = false;
+        double* col_weights = NULL;
+        Imputer *imputer = NULL;
+        bool impute_at_fit = false;
         fit_iforest(
           NULL,
           &iso,

data/lib/isotree/isolation_forest.rb CHANGED

@@ -4,9 +4,11 @@ module IsoTree
       sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
       prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
       prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
-      min_gain: 0, all_perm: false, coef_by_prop: false,
+      min_gain: 0, missing_action: "impute", new_categ_action: "smallest",
+      categ_split_type: "subset", all_perm: false, coef_by_prop: false,
       sample_with_replacement: false, penalize_range: true,
-      weigh_by_kurtosis: false, min_imp_obs: 3, random_seed: 1, nthreads: -1
+      weigh_by_kurtosis: false, coefs: "normal", min_imp_obs: 3, depth_imp: "higher",
+      weigh_imp_rows: "inverse", random_seed: 1, nthreads: -1
     )
       @sample_size = sample_size
@@ -18,12 +20,18 @@ module IsoTree
       @prob_split_avg_gain = prob_split_avg_gain
       @prob_split_pooled_gain = prob_split_pooled_gain
       @min_gain = min_gain
+      @missing_action = missing_action
+      @new_categ_action = new_categ_action
+      @categ_split_type = categ_split_type
       @all_perm = all_perm
       @coef_by_prop = coef_by_prop
       @sample_with_replacement = sample_with_replacement
       @penalize_range = penalize_range
       @weigh_by_kurtosis = weigh_by_kurtosis
+      @coefs = coefs
       @min_imp_obs = min_imp_obs
+      @depth_imp = depth_imp
+      @weigh_imp_rows = weigh_imp_rows
       @random_seed = random_seed
       # etc module returns virtual cores
@@ -138,10 +146,11 @@ module IsoTree
         sample_size ntrees ndim ntry
         prob_pick_avg_gain prob_pick_pooled_gain
         prob_split_avg_gain prob_split_pooled_gain
-        min_gain all_perm coef_by_prop
+        min_gain missing_action new_categ_action
+        categ_split_type all_perm coef_by_prop
         sample_with_replacement penalize_range
-        weigh_by_kurtosis min_imp_obs
-        random_seed nthreads
+        weigh_by_kurtosis coefs min_imp_obs depth_imp
+        weigh_imp_rows random_seed nthreads
       )
       options = {}
       keys.each do |k|

data/lib/isotree/version.rb CHANGED

@@ -1,3 +1,3 @@
 module IsoTree
-  VERSION = "0.1.3"
+  VERSION = "0.1.4"
 end

data/vendor/isotree/README.md CHANGED

@@ -1,6 +1,6 @@
 # IsoTree
-Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R.
+Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R. An additional wrapper for Ruby can be found [here](https://github.com/ankane/isotree).
 The new concepts in this software are described in:
 * [Distance approximation using Isolation Forests](https://arxiv.org/abs/1910.12362)
@@ -82,6 +82,10 @@ sudo ldconfig
 (Will build as a shared object - linkage is then done with `-lisotree`)
+* Ruby
+See [external repository with wrapper](https://github.com/ankane/isotree).
 # Sample usage
 **Warning: default parameters in this implementation are very different from default parameters in others such as SciKit-Learn's, and these defaults won't scale to large datasets (see documentation for details).**

data/vendor/isotree/src/fit_model.cpp CHANGED

@@ -1,7 +1,7 @@
 /*    Isolation forests and variations thereof, with adjustments for incorporation
 *     of categorical variables and missing values.
 *     Writen for C++11 standard and aimed at being used in R and Python.
-*
+*
 *     This library is based on the following works:
 *     [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
 *         "Isolation forest."
@@ -20,7 +20,7 @@
 *     [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
 *     [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
 *     [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
-*
+*
 *     BSD 2-Clause License
 *     Copyright (c) 2019, David Cortes
 *     All rights reserved.
@@ -47,7 +47,7 @@
 bool interrupt_switch;
 /*  Fit Isolation Forest model, or variant of it such as SCiForest
-*
+*
 * Parameters:
 * ===========
 * - model_outputs (out)
@@ -291,7 +291,7 @@ bool interrupt_switch;
 *       Number of parallel threads to use. Note that, the more threads, the more memory will be
 *       allocated, even if the thread does not end up being used. Ignored when not building with
 *       OpenMP support.
-*
+*
 * Returns
 * =======
 * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
@@ -300,7 +300,7 @@ bool interrupt_switch;
 * what these values correspond to, you can use the functions
 * 'return_EXIT_SUCESS' and 'return_EXIT_FAILURE', which will return them
 * as integers.
-*
+*
 * References
 * ==========
 * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
@@ -418,6 +418,13 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
     /* Global variable that determines if the procedure receives a stop signal */
     interrupt_switch = false;
+    /* TODO: find a better way of handling interrupt signals when calling in Python/R.
+       The following will still change the behavior of interrupts when called through e.g. Flask */
+    #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
+    struct sigaction sig_handle;
+    sig_handle.sa_flags = SA_RESETHAND;
+    sig_handle.sa_handler = set_interrup_global_variable;
+    #endif
     /* grow trees */
     #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
@@ -461,7 +468,11 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
         else
             model_outputs_ext->hplanes[tree].shrink_to_fit();
-        signal(SIGINT, set_interrup_global_variable);
+        #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
+        // sigaction(SIGINT, &sig_handle, NULL);
+        #else
+        // signal(SIGINT, set_interrup_global_variable);
+        #endif
     }
     /* check if the procedure got interrupted */
@@ -545,7 +556,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
 /* Add additional trees to already-fitted isolation forest model
-*
+*
 * Parameters
 * ==========
 * - model_outputs
@@ -1001,7 +1012,7 @@ void fit_itree(std::vector<IsoTree>    *tree_root,
         if (
             model_params.cat_split_type == SubSet &&
             (
-                model_params.prob_pick_by_gain_avg  ||
+                model_params.prob_pick_by_gain_avg  ||
                 model_params.prob_pick_by_gain_pl
             )
            )

metadata CHANGED

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: isotree
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.1.4
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2020-08-13 00:00:00.000000000 Z
+date: 2020-08-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice