isotree 0.1.3 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
4
- data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
3
+ metadata.gz: 6121fa6837526e05ea0124e69c189dea6e0c94a303a003ed8f5db5e8469e54b3
4
+ data.tar.gz: eb4a955ffa47876af4ece7ce7006635af4b43e29475055cf0046d61cbb54d443
5
5
  SHA512:
6
- metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
7
- data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d
6
+ metadata.gz: 22eb873ac7cff7cd5a1e0e7c04e6c47e895c06d8e0e52cd5ee2c4ab31b652203737186fe2dfd84298ea9cdf5327bed0c4d22e67ec9fb787179eae978f922b127
7
+ data.tar.gz: bcc62a52fa71bf6e3175108a473e834b8a730ea0a94dd8adc017057a23a9bf56d451c3b955b3aafacd2cd9e6f4c237c31ce4f7a3eefd4d11aaed25183def3b92
@@ -1,3 +1,8 @@
1
+ ## 0.1.4 (2020-08-22)
2
+
3
+ - Added `missing_action`, `new_categ_action`, `categ_split_type`, `coefs`, `depth_imp`, and `weigh_imp_rows` options
4
+ - Fixed signal handling
5
+
1
6
  ## 0.1.3 (2020-08-13)
2
7
 
3
8
  - Added support for categorical data
data/README.md CHANGED
@@ -58,12 +58,18 @@ IsoTree::IsolationForest.new(
58
58
  prob_split_avg_gain: 0,
59
59
  prob_split_pooled_gain: 0,
60
60
  min_gain: 0,
61
+ missing_action: "impute",
62
+ new_categ_action: "smallest",
63
+ categ_split_type: "subset",
61
64
  all_perm: false,
62
65
  coef_by_prop: false,
63
66
  sample_with_replacement: false,
64
67
  penalize_range: true,
65
68
  weigh_by_kurtosis: false,
69
+ coefs: "normal",
66
70
  min_imp_obs: 3,
71
+ depth_imp: "higher",
72
+ weigh_imp_rows: "inverse",
67
73
  random_seed: 1,
68
74
  nthreads: -1
69
75
  )
@@ -5,17 +5,77 @@
5
5
  #include <rice/Array.hpp>
6
6
  #include <rice/Hash.hpp>
7
7
  #include <rice/Module.hpp>
8
+ #include <rice/Object.hpp>
8
9
  #include <rice/String.hpp>
9
10
  #include <rice/Symbol.hpp>
10
11
 
11
12
  using Rice::Array;
12
13
  using Rice::Hash;
13
14
  using Rice::Module;
15
+ using Rice::Object;
14
16
  using Rice::String;
15
17
  using Rice::Symbol;
16
18
  using Rice::define_class_under;
17
19
  using Rice::define_module;
18
20
 
21
+ template<>
22
+ NewCategAction from_ruby<NewCategAction>(Object x)
23
+ {
24
+ auto value = x.to_s().str();
25
+ if (value == "weighted") return Weighted;
26
+ if (value == "smallest") return Smallest;
27
+ if (value == "random") return Random;
28
+ throw std::runtime_error("Unknown new categ action: " + value);
29
+ }
30
+
31
+ template<>
32
+ MissingAction from_ruby<MissingAction>(Object x)
33
+ {
34
+ auto value = x.to_s().str();
35
+ if (value == "divide") return Divide;
36
+ if (value == "impute") return Impute;
37
+ if (value == "fail") return Fail;
38
+ throw std::runtime_error("Unknown missing action: " + value);
39
+ }
40
+
41
+ template<>
42
+ CategSplit from_ruby<CategSplit>(Object x)
43
+ {
44
+ auto value = x.to_s().str();
45
+ if (value == "subset") return SubSet;
46
+ if (value == "single_categ") return SingleCateg;
47
+ throw std::runtime_error("Unknown categ split: " + value);
48
+ }
49
+
50
+ template<>
51
+ CoefType from_ruby<CoefType>(Object x)
52
+ {
53
+ auto value = x.to_s().str();
54
+ if (value == "uniform") return Uniform;
55
+ if (value == "normal") return Normal;
56
+ throw std::runtime_error("Unknown coef type: " + value);
57
+ }
58
+
59
+ template<>
60
+ UseDepthImp from_ruby<UseDepthImp>(Object x)
61
+ {
62
+ auto value = x.to_s().str();
63
+ if (value == "lower") return Lower;
64
+ if (value == "higher") return Higher;
65
+ if (value == "same") return Same;
66
+ throw std::runtime_error("Unknown depth imp: " + value);
67
+ }
68
+
69
+ template<>
70
+ WeighImpRows from_ruby<WeighImpRows>(Object x)
71
+ {
72
+ auto value = x.to_s().str();
73
+ if (value == "inverse") return Inverse;
74
+ if (value == "prop") return Prop;
75
+ if (value == "flat") return Flat;
76
+ throw std::runtime_error("Unknown weight imp rows: " + value);
77
+ }
78
+
19
79
  extern "C"
20
80
  void Init_ext()
21
81
  {
@@ -54,25 +114,7 @@ void Init_ext()
54
114
  sparse_ix* Xc_indptr = NULL;
55
115
 
56
116
  // options
57
- CoefType coef_type = Normal;
58
- double* sample_weights = NULL;
59
- bool weight_as_sample = false;
60
- size_t max_depth = 0;
61
- bool limit_depth = true;
62
- bool standardize_dist = false;
63
- double* tmat = NULL;
64
- double* output_depths = NULL;
65
- bool standardize_depth = false;
66
- double* col_weights = NULL;
67
- MissingAction missing_action = Impute;
68
- CategSplit cat_split_type = SubSet;
69
- NewCategAction new_cat_action = Smallest;
70
- Imputer *imputer = NULL;
71
- UseDepthImp depth_imp = Higher;
72
- WeighImpRows weigh_imp_rows = Inverse;
73
- bool impute_at_fit = false;
74
-
75
- // Rice has limit of 14 arguments, so use hash for options
117
+ // Rice has limit of 14 arguments, so use hash
76
118
  size_t sample_size = options.get<size_t, Symbol>("sample_size");
77
119
  size_t ndim = options.get<size_t, Symbol>("ndim");
78
120
  size_t ntrees = options.get<size_t, Symbol>("ntrees");
@@ -82,15 +124,34 @@ void Init_ext()
82
124
  double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
83
125
  double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
84
126
  double min_gain = options.get<double, Symbol>("min_gain");
127
+ MissingAction missing_action = options.get<MissingAction, Symbol>("missing_action");
128
+ CategSplit cat_split_type = options.get<CategSplit, Symbol>("categ_split_type");
129
+ NewCategAction new_cat_action = options.get<NewCategAction, Symbol>("new_categ_action");
85
130
  bool all_perm = options.get<bool, Symbol>("all_perm");
86
131
  bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
87
132
  bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
88
133
  bool penalize_range = options.get<bool, Symbol>("penalize_range");
89
134
  bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
135
+ CoefType coef_type = options.get<CoefType, Symbol>("coefs");
90
136
  size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
137
+ UseDepthImp depth_imp = options.get<UseDepthImp, Symbol>("depth_imp");
138
+ WeighImpRows weigh_imp_rows = options.get<WeighImpRows, Symbol>("weigh_imp_rows");
91
139
  uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
92
140
  int nthreads = options.get<int, Symbol>("nthreads");
93
141
 
142
+ // TODO options
143
+ double* sample_weights = NULL;
144
+ bool weight_as_sample = false;
145
+ size_t max_depth = 0;
146
+ bool limit_depth = true;
147
+ bool standardize_dist = false;
148
+ double* tmat = NULL;
149
+ double* output_depths = NULL;
150
+ bool standardize_depth = false;
151
+ double* col_weights = NULL;
152
+ Imputer *imputer = NULL;
153
+ bool impute_at_fit = false;
154
+
94
155
  fit_iforest(
95
156
  NULL,
96
157
  &iso,
@@ -4,9 +4,11 @@ module IsoTree
4
4
  sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
5
5
  prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
6
6
  prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
7
- min_gain: 0, all_perm: false, coef_by_prop: false,
7
+ min_gain: 0, missing_action: "impute", new_categ_action: "smallest",
8
+ categ_split_type: "subset", all_perm: false, coef_by_prop: false,
8
9
  sample_with_replacement: false, penalize_range: true,
9
- weigh_by_kurtosis: false, min_imp_obs: 3, random_seed: 1, nthreads: -1
10
+ weigh_by_kurtosis: false, coefs: "normal", min_imp_obs: 3, depth_imp: "higher",
11
+ weigh_imp_rows: "inverse", random_seed: 1, nthreads: -1
10
12
  )
11
13
 
12
14
  @sample_size = sample_size
@@ -18,12 +20,18 @@ module IsoTree
18
20
  @prob_split_avg_gain = prob_split_avg_gain
19
21
  @prob_split_pooled_gain = prob_split_pooled_gain
20
22
  @min_gain = min_gain
23
+ @missing_action = missing_action
24
+ @new_categ_action = new_categ_action
25
+ @categ_split_type = categ_split_type
21
26
  @all_perm = all_perm
22
27
  @coef_by_prop = coef_by_prop
23
28
  @sample_with_replacement = sample_with_replacement
24
29
  @penalize_range = penalize_range
25
30
  @weigh_by_kurtosis = weigh_by_kurtosis
31
+ @coefs = coefs
26
32
  @min_imp_obs = min_imp_obs
33
+ @depth_imp = depth_imp
34
+ @weigh_imp_rows = weigh_imp_rows
27
35
  @random_seed = random_seed
28
36
 
29
37
  # etc module returns virtual cores
@@ -138,10 +146,11 @@ module IsoTree
138
146
  sample_size ntrees ndim ntry
139
147
  prob_pick_avg_gain prob_pick_pooled_gain
140
148
  prob_split_avg_gain prob_split_pooled_gain
141
- min_gain all_perm coef_by_prop
149
+ min_gain missing_action new_categ_action
150
+ categ_split_type all_perm coef_by_prop
142
151
  sample_with_replacement penalize_range
143
- weigh_by_kurtosis min_imp_obs
144
- random_seed nthreads
152
+ weigh_by_kurtosis coefs min_imp_obs depth_imp
153
+ weigh_imp_rows random_seed nthreads
145
154
  )
146
155
  options = {}
147
156
  keys.each do |k|
@@ -1,3 +1,3 @@
1
1
  module IsoTree
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -1,6 +1,6 @@
1
1
  # IsoTree
2
2
 
3
- Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R.
3
+ Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R. An additional wrapper for Ruby can be found [here](https://github.com/ankane/isotree).
4
4
 
5
5
  The new concepts in this software are described in:
6
6
  * [Distance approximation using Isolation Forests](https://arxiv.org/abs/1910.12362)
@@ -82,6 +82,10 @@ sudo ldconfig
82
82
 
83
83
  (Will build as a shared object - linkage is then done with `-lisotree`)
84
84
 
85
+ * Ruby
86
+
87
+ See [external repository with wrapper](https://github.com/ankane/isotree).
88
+
85
89
  # Sample usage
86
90
 
87
91
  **Warning: default parameters in this implementation are very different from default parameters in others such as SciKit-Learn's, and these defaults won't scale to large datasets (see documentation for details).**
@@ -1,7 +1,7 @@
1
1
  /* Isolation forests and variations thereof, with adjustments for incorporation
2
2
  * of categorical variables and missing values.
3
3
  * Written for C++11 standard and aimed at being used in R and Python.
4
- *
4
+ *
5
5
  * This library is based on the following works:
6
6
  * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
7
  * "Isolation forest."
@@ -20,7 +20,7 @@
20
20
  * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
21
  * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
- *
23
+ *
24
24
  * BSD 2-Clause License
25
25
  * Copyright (c) 2019, David Cortes
26
26
  * All rights reserved.
@@ -47,7 +47,7 @@
47
47
  bool interrupt_switch;
48
48
 
49
49
  /* Fit Isolation Forest model, or variant of it such as SCiForest
50
- *
50
+ *
51
51
  * Parameters:
52
52
  * ===========
53
53
  * - model_outputs (out)
@@ -291,7 +291,7 @@ bool interrupt_switch;
291
291
  * Number of parallel threads to use. Note that, the more threads, the more memory will be
292
292
  * allocated, even if the thread does not end up being used. Ignored when not building with
293
293
  * OpenMP support.
294
- *
294
+ *
295
295
  * Returns
296
296
  * =======
297
297
  * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
@@ -300,7 +300,7 @@ bool interrupt_switch;
300
300
  * what these values correspond to, you can use the functions
301
301
  * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
302
302
  * as integers.
303
- *
303
+ *
304
304
  * References
305
305
  * ==========
306
306
  * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
@@ -418,6 +418,13 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
418
418
 
419
419
  /* Global variable that determines if the procedure receives a stop signal */
420
420
  interrupt_switch = false;
421
+ /* TODO: find a better way of handling interrupt signals when calling in Python/R.
422
+ The following will still change the behavior of interrupts when called through e.g. Flask */
423
+ #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
424
+ struct sigaction sig_handle;
425
+ sig_handle.sa_flags = SA_RESETHAND;
426
+ sig_handle.sa_handler = set_interrup_global_variable;
427
+ #endif
421
428
 
422
429
  /* grow trees */
423
430
  #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
@@ -461,7 +468,11 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
461
468
  else
462
469
  model_outputs_ext->hplanes[tree].shrink_to_fit();
463
470
 
464
- signal(SIGINT, set_interrup_global_variable);
471
+ #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
472
+ // sigaction(SIGINT, &sig_handle, NULL);
473
+ #else
474
+ // signal(SIGINT, set_interrup_global_variable);
475
+ #endif
465
476
  }
466
477
 
467
478
  /* check if the procedure got interrupted */
@@ -545,7 +556,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
545
556
 
546
557
 
547
558
  /* Add additional trees to already-fitted isolation forest model
548
- *
559
+ *
549
560
  * Parameters
550
561
  * ==========
551
562
  * - model_outputs
@@ -1001,7 +1012,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
1001
1012
  if (
1002
1013
  model_params.cat_split_type == SubSet &&
1003
1014
  (
1004
- model_params.prob_pick_by_gain_avg ||
1015
+ model_params.prob_pick_by_gain_avg ||
1005
1016
  model_params.prob_pick_by_gain_pl
1006
1017
  )
1007
1018
  )
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isotree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-13 00:00:00.000000000 Z
11
+ date: 2020-08-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice