isotree 0.1.3 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 2e1eee89fa5df77e8f659e270c1d73ebb658baf8e4e52756e4ba17c2b1efd502
4
- data.tar.gz: d0c1725819661ae742febd10decd4ff3d3c3bd8717e59127784488dd1e0ae890
3
+ metadata.gz: 6121fa6837526e05ea0124e69c189dea6e0c94a303a003ed8f5db5e8469e54b3
4
+ data.tar.gz: eb4a955ffa47876af4ece7ce7006635af4b43e29475055cf0046d61cbb54d443
5
5
  SHA512:
6
- metadata.gz: b887f8c29061c3577614fe3a267901498852c3de5fe50c6281465722d4b30acd49b68338de96a8a92690558e18cdbe34339e761f7d93d3c828dba40e8eef1d21
7
- data.tar.gz: cc396e69aac246653bb45692a2097e0d8ca345a6cc7089c3d76cb8df0afc08feff44d79a2705fb99d6fee0df34cfe22e98ff85b914f3d0e4b87c2e8f1bbcec6d
6
+ metadata.gz: 22eb873ac7cff7cd5a1e0e7c04e6c47e895c06d8e0e52cd5ee2c4ab31b652203737186fe2dfd84298ea9cdf5327bed0c4d22e67ec9fb787179eae978f922b127
7
+ data.tar.gz: bcc62a52fa71bf6e3175108a473e834b8a730ea0a94dd8adc017057a23a9bf56d451c3b955b3aafacd2cd9e6f4c237c31ce4f7a3eefd4d11aaed25183def3b92
@@ -1,3 +1,8 @@
1
+ ## 0.1.4 (2020-08-22)
2
+
3
+ - Added `missing_action`, `new_categ_action`, `categ_split_type`, `coefs`, `depth_imp`, and `weigh_imp_rows` options
4
+ - Fixed signal handling
5
+
1
6
  ## 0.1.3 (2020-08-13)
2
7
 
3
8
  - Added support for categorical data
data/README.md CHANGED
@@ -58,12 +58,18 @@ IsoTree::IsolationForest.new(
58
58
  prob_split_avg_gain: 0,
59
59
  prob_split_pooled_gain: 0,
60
60
  min_gain: 0,
61
+ missing_action: "impute",
62
+ new_categ_action: "smallest",
63
+ categ_split_type: "subset",
61
64
  all_perm: false,
62
65
  coef_by_prop: false,
63
66
  sample_with_replacement: false,
64
67
  penalize_range: true,
65
68
  weigh_by_kurtosis: false,
69
+ coefs: "normal",
66
70
  min_imp_obs: 3,
71
+ depth_imp: "higher",
72
+ weigh_imp_rows: "inverse",
67
73
  random_seed: 1,
68
74
  nthreads: -1
69
75
  )
@@ -5,17 +5,77 @@
5
5
  #include <rice/Array.hpp>
6
6
  #include <rice/Hash.hpp>
7
7
  #include <rice/Module.hpp>
8
+ #include <rice/Object.hpp>
8
9
  #include <rice/String.hpp>
9
10
  #include <rice/Symbol.hpp>
10
11
 
11
12
  using Rice::Array;
12
13
  using Rice::Hash;
13
14
  using Rice::Module;
15
+ using Rice::Object;
14
16
  using Rice::String;
15
17
  using Rice::Symbol;
16
18
  using Rice::define_class_under;
17
19
  using Rice::define_module;
18
20
 
21
+ template<>
22
+ NewCategAction from_ruby<NewCategAction>(Object x)
23
+ {
24
+ auto value = x.to_s().str();
25
+ if (value == "weighted") return Weighted;
26
+ if (value == "smallest") return Smallest;
27
+ if (value == "random") return Random;
28
+ throw std::runtime_error("Unknown new categ action: " + value);
29
+ }
30
+
31
+ template<>
32
+ MissingAction from_ruby<MissingAction>(Object x)
33
+ {
34
+ auto value = x.to_s().str();
35
+ if (value == "divide") return Divide;
36
+ if (value == "impute") return Impute;
37
+ if (value == "fail") return Fail;
38
+ throw std::runtime_error("Unknown missing action: " + value);
39
+ }
40
+
41
+ template<>
42
+ CategSplit from_ruby<CategSplit>(Object x)
43
+ {
44
+ auto value = x.to_s().str();
45
+ if (value == "subset") return SubSet;
46
+ if (value == "single_categ") return SingleCateg;
47
+ throw std::runtime_error("Unknown categ split: " + value);
48
+ }
49
+
50
+ template<>
51
+ CoefType from_ruby<CoefType>(Object x)
52
+ {
53
+ auto value = x.to_s().str();
54
+ if (value == "uniform") return Uniform;
55
+ if (value == "normal") return Normal;
56
+ throw std::runtime_error("Unknown coef type: " + value);
57
+ }
58
+
59
+ template<>
60
+ UseDepthImp from_ruby<UseDepthImp>(Object x)
61
+ {
62
+ auto value = x.to_s().str();
63
+ if (value == "lower") return Lower;
64
+ if (value == "higher") return Higher;
65
+ if (value == "same") return Same;
66
+ throw std::runtime_error("Unknown depth imp: " + value);
67
+ }
68
+
69
+ template<>
70
+ WeighImpRows from_ruby<WeighImpRows>(Object x)
71
+ {
72
+ auto value = x.to_s().str();
73
+ if (value == "inverse") return Inverse;
74
+ if (value == "prop") return Prop;
75
+ if (value == "flat") return Flat;
76
+ throw std::runtime_error("Unknown weight imp rows: " + value);
77
+ }
78
+
19
79
  extern "C"
20
80
  void Init_ext()
21
81
  {
@@ -54,25 +114,7 @@ void Init_ext()
54
114
  sparse_ix* Xc_indptr = NULL;
55
115
 
56
116
  // options
57
- CoefType coef_type = Normal;
58
- double* sample_weights = NULL;
59
- bool weight_as_sample = false;
60
- size_t max_depth = 0;
61
- bool limit_depth = true;
62
- bool standardize_dist = false;
63
- double* tmat = NULL;
64
- double* output_depths = NULL;
65
- bool standardize_depth = false;
66
- double* col_weights = NULL;
67
- MissingAction missing_action = Impute;
68
- CategSplit cat_split_type = SubSet;
69
- NewCategAction new_cat_action = Smallest;
70
- Imputer *imputer = NULL;
71
- UseDepthImp depth_imp = Higher;
72
- WeighImpRows weigh_imp_rows = Inverse;
73
- bool impute_at_fit = false;
74
-
75
- // Rice has limit of 14 arguments, so use hash for options
117
+ // Rice has limit of 14 arguments, so use hash
76
118
  size_t sample_size = options.get<size_t, Symbol>("sample_size");
77
119
  size_t ndim = options.get<size_t, Symbol>("ndim");
78
120
  size_t ntrees = options.get<size_t, Symbol>("ntrees");
@@ -82,15 +124,34 @@ void Init_ext()
82
124
  double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
83
125
  double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
84
126
  double min_gain = options.get<double, Symbol>("min_gain");
127
+ MissingAction missing_action = options.get<MissingAction, Symbol>("missing_action");
128
+ CategSplit cat_split_type = options.get<CategSplit, Symbol>("categ_split_type");
129
+ NewCategAction new_cat_action = options.get<NewCategAction, Symbol>("new_categ_action");
85
130
  bool all_perm = options.get<bool, Symbol>("all_perm");
86
131
  bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
87
132
  bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
88
133
  bool penalize_range = options.get<bool, Symbol>("penalize_range");
89
134
  bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
135
+ CoefType coef_type = options.get<CoefType, Symbol>("coefs");
90
136
  size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
137
+ UseDepthImp depth_imp = options.get<UseDepthImp, Symbol>("depth_imp");
138
+ WeighImpRows weigh_imp_rows = options.get<WeighImpRows, Symbol>("weigh_imp_rows");
91
139
  uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
92
140
  int nthreads = options.get<int, Symbol>("nthreads");
93
141
 
142
+ // TODO options
143
+ double* sample_weights = NULL;
144
+ bool weight_as_sample = false;
145
+ size_t max_depth = 0;
146
+ bool limit_depth = true;
147
+ bool standardize_dist = false;
148
+ double* tmat = NULL;
149
+ double* output_depths = NULL;
150
+ bool standardize_depth = false;
151
+ double* col_weights = NULL;
152
+ Imputer *imputer = NULL;
153
+ bool impute_at_fit = false;
154
+
94
155
  fit_iforest(
95
156
  NULL,
96
157
  &iso,
@@ -4,9 +4,11 @@ module IsoTree
4
4
  sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
5
5
  prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
6
6
  prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
7
- min_gain: 0, all_perm: false, coef_by_prop: false,
7
+ min_gain: 0, missing_action: "impute", new_categ_action: "smallest",
8
+ categ_split_type: "subset", all_perm: false, coef_by_prop: false,
8
9
  sample_with_replacement: false, penalize_range: true,
9
- weigh_by_kurtosis: false, min_imp_obs: 3, random_seed: 1, nthreads: -1
10
+ weigh_by_kurtosis: false, coefs: "normal", min_imp_obs: 3, depth_imp: "higher",
11
+ weigh_imp_rows: "inverse", random_seed: 1, nthreads: -1
10
12
  )
11
13
 
12
14
  @sample_size = sample_size
@@ -18,12 +20,18 @@ module IsoTree
18
20
  @prob_split_avg_gain = prob_split_avg_gain
19
21
  @prob_split_pooled_gain = prob_split_pooled_gain
20
22
  @min_gain = min_gain
23
+ @missing_action = missing_action
24
+ @new_categ_action = new_categ_action
25
+ @categ_split_type = categ_split_type
21
26
  @all_perm = all_perm
22
27
  @coef_by_prop = coef_by_prop
23
28
  @sample_with_replacement = sample_with_replacement
24
29
  @penalize_range = penalize_range
25
30
  @weigh_by_kurtosis = weigh_by_kurtosis
31
+ @coefs = coefs
26
32
  @min_imp_obs = min_imp_obs
33
+ @depth_imp = depth_imp
34
+ @weigh_imp_rows = weigh_imp_rows
27
35
  @random_seed = random_seed
28
36
 
29
37
  # etc module returns virtual cores
@@ -138,10 +146,11 @@ module IsoTree
138
146
  sample_size ntrees ndim ntry
139
147
  prob_pick_avg_gain prob_pick_pooled_gain
140
148
  prob_split_avg_gain prob_split_pooled_gain
141
- min_gain all_perm coef_by_prop
149
+ min_gain missing_action new_categ_action
150
+ categ_split_type all_perm coef_by_prop
142
151
  sample_with_replacement penalize_range
143
- weigh_by_kurtosis min_imp_obs
144
- random_seed nthreads
152
+ weigh_by_kurtosis coefs min_imp_obs depth_imp
153
+ weigh_imp_rows random_seed nthreads
145
154
  )
146
155
  options = {}
147
156
  keys.each do |k|
@@ -1,3 +1,3 @@
1
1
  module IsoTree
2
- VERSION = "0.1.3"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -1,6 +1,6 @@
1
1
  # IsoTree
2
2
 
3
- Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R.
3
+ Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R. An additional wrapper for Ruby can be found [here](https://github.com/ankane/isotree).
4
4
 
5
5
  The new concepts in this software are described in:
6
6
  * [Distance approximation using Isolation Forests](https://arxiv.org/abs/1910.12362)
@@ -82,6 +82,10 @@ sudo ldconfig
82
82
 
83
83
  (Will build as a shared object - linkage is then done with `-lisotree`)
84
84
 
85
+ * Ruby
86
+
87
+ See [external repository with wrapper](https://github.com/ankane/isotree).
88
+
85
89
  # Sample usage
86
90
 
87
91
  **Warning: default parameters in this implementation are very different from default parameters in others such as SciKit-Learn's, and these defaults won't scale to large datasets (see documentation for details).**
@@ -1,7 +1,7 @@
1
1
  /* Isolation forests and variations thereof, with adjustments for incorporation
2
2
  * of categorical variables and missing values.
3
3
  * Written for C++11 standard and aimed at being used in R and Python.
4
- *
4
+ *
5
5
  * This library is based on the following works:
6
6
  * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
7
7
  * "Isolation forest."
@@ -20,7 +20,7 @@
20
20
  * [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
21
21
  * [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
22
22
  * [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
23
- *
23
+ *
24
24
  * BSD 2-Clause License
25
25
  * Copyright (c) 2019, David Cortes
26
26
  * All rights reserved.
@@ -47,7 +47,7 @@
47
47
  bool interrupt_switch;
48
48
 
49
49
  /* Fit Isolation Forest model, or variant of it such as SCiForest
50
- *
50
+ *
51
51
  * Parameters:
52
52
  * ===========
53
53
  * - model_outputs (out)
@@ -291,7 +291,7 @@ bool interrupt_switch;
291
291
  * Number of parallel threads to use. Note that, the more threads, the more memory will be
292
292
  * allocated, even if the thread does not end up being used. Ignored when not building with
293
293
  * OpenMP support.
294
- *
294
+ *
295
295
  * Returns
296
296
  * =======
297
297
  * Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
@@ -300,7 +300,7 @@ bool interrupt_switch;
300
300
  * what these values correspond to, you can use the functions
301
301
  * 'return_EXIT_SUCCESS' and 'return_EXIT_FAILURE', which will return them
302
302
  * as integers.
303
- *
303
+ *
304
304
  * References
305
305
  * ==========
306
306
  * [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
@@ -418,6 +418,13 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
418
418
 
419
419
  /* Global variable that determines if the procedure receives a stop signal */
420
420
  interrupt_switch = false;
421
+ /* TODO: find a better way of handling interrupt signals when calling in Python/R.
422
+ The following will still change the behavior of interrupts when called through e.g. Flask */
423
+ #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
424
+ struct sigaction sig_handle;
425
+ sig_handle.sa_flags = SA_RESETHAND;
426
+ sig_handle.sa_handler = set_interrup_global_variable;
427
+ #endif
421
428
 
422
429
  /* grow trees */
423
430
  #pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
@@ -461,7 +468,11 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
461
468
  else
462
469
  model_outputs_ext->hplanes[tree].shrink_to_fit();
463
470
 
464
- signal(SIGINT, set_interrup_global_variable);
471
+ #if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
472
+ // sigaction(SIGINT, &sig_handle, NULL);
473
+ #else
474
+ // signal(SIGINT, set_interrup_global_variable);
475
+ #endif
465
476
  }
466
477
 
467
478
  /* check if the procedure got interrupted */
@@ -545,7 +556,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
545
556
 
546
557
 
547
558
  /* Add additional trees to already-fitted isolation forest model
548
- *
559
+ *
549
560
  * Parameters
550
561
  * ==========
551
562
  * - model_outputs
@@ -1001,7 +1012,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
1001
1012
  if (
1002
1013
  model_params.cat_split_type == SubSet &&
1003
1014
  (
1004
- model_params.prob_pick_by_gain_avg ||
1015
+ model_params.prob_pick_by_gain_avg ||
1005
1016
  model_params.prob_pick_by_gain_pl
1006
1017
  )
1007
1018
  )
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: isotree
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.1.4
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-08-13 00:00:00.000000000 Z
11
+ date: 2020-08-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice