isotree 0.1.3 → 0.1.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/README.md +6 -0
- data/ext/isotree/ext.cpp +80 -19
- data/lib/isotree/isolation_forest.rb +14 -5
- data/lib/isotree/version.rb +1 -1
- data/vendor/isotree/README.md +5 -1
- data/vendor/isotree/src/fit_model.cpp +19 -8
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 6121fa6837526e05ea0124e69c189dea6e0c94a303a003ed8f5db5e8469e54b3
|
4
|
+
data.tar.gz: eb4a955ffa47876af4ece7ce7006635af4b43e29475055cf0046d61cbb54d443
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 22eb873ac7cff7cd5a1e0e7c04e6c47e895c06d8e0e52cd5ee2c4ab31b652203737186fe2dfd84298ea9cdf5327bed0c4d22e67ec9fb787179eae978f922b127
|
7
|
+
data.tar.gz: bcc62a52fa71bf6e3175108a473e834b8a730ea0a94dd8adc017057a23a9bf56d451c3b955b3aafacd2cd9e6f4c237c31ce4f7a3eefd4d11aaed25183def3b92
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -58,12 +58,18 @@ IsoTree::IsolationForest.new(
|
|
58
58
|
prob_split_avg_gain: 0,
|
59
59
|
prob_split_pooled_gain: 0,
|
60
60
|
min_gain: 0,
|
61
|
+
missing_action: "impute",
|
62
|
+
new_categ_action: "smallest",
|
63
|
+
categ_split_type: "subset",
|
61
64
|
all_perm: false,
|
62
65
|
coef_by_prop: false,
|
63
66
|
sample_with_replacement: false,
|
64
67
|
penalize_range: true,
|
65
68
|
weigh_by_kurtosis: false,
|
69
|
+
coefs: "normal",
|
66
70
|
min_imp_obs: 3,
|
71
|
+
depth_imp: "higher",
|
72
|
+
weigh_imp_rows: "inverse",
|
67
73
|
random_seed: 1,
|
68
74
|
nthreads: -1
|
69
75
|
)
|
data/ext/isotree/ext.cpp
CHANGED
@@ -5,17 +5,77 @@
|
|
5
5
|
#include <rice/Array.hpp>
|
6
6
|
#include <rice/Hash.hpp>
|
7
7
|
#include <rice/Module.hpp>
|
8
|
+
#include <rice/Object.hpp>
|
8
9
|
#include <rice/String.hpp>
|
9
10
|
#include <rice/Symbol.hpp>
|
10
11
|
|
11
12
|
using Rice::Array;
|
12
13
|
using Rice::Hash;
|
13
14
|
using Rice::Module;
|
15
|
+
using Rice::Object;
|
14
16
|
using Rice::String;
|
15
17
|
using Rice::Symbol;
|
16
18
|
using Rice::define_class_under;
|
17
19
|
using Rice::define_module;
|
18
20
|
|
21
|
+
template<>
|
22
|
+
NewCategAction from_ruby<NewCategAction>(Object x)
|
23
|
+
{
|
24
|
+
auto value = x.to_s().str();
|
25
|
+
if (value == "weighted") return Weighted;
|
26
|
+
if (value == "smallest") return Smallest;
|
27
|
+
if (value == "random") return Random;
|
28
|
+
throw std::runtime_error("Unknown new categ action: " + value);
|
29
|
+
}
|
30
|
+
|
31
|
+
template<>
|
32
|
+
MissingAction from_ruby<MissingAction>(Object x)
|
33
|
+
{
|
34
|
+
auto value = x.to_s().str();
|
35
|
+
if (value == "divide") return Divide;
|
36
|
+
if (value == "impute") return Impute;
|
37
|
+
if (value == "fail") return Fail;
|
38
|
+
throw std::runtime_error("Unknown missing action: " + value);
|
39
|
+
}
|
40
|
+
|
41
|
+
template<>
|
42
|
+
CategSplit from_ruby<CategSplit>(Object x)
|
43
|
+
{
|
44
|
+
auto value = x.to_s().str();
|
45
|
+
if (value == "subset") return SubSet;
|
46
|
+
if (value == "single_categ") return SingleCateg;
|
47
|
+
throw std::runtime_error("Unknown categ split: " + value);
|
48
|
+
}
|
49
|
+
|
50
|
+
template<>
|
51
|
+
CoefType from_ruby<CoefType>(Object x)
|
52
|
+
{
|
53
|
+
auto value = x.to_s().str();
|
54
|
+
if (value == "uniform") return Uniform;
|
55
|
+
if (value == "normal") return Normal;
|
56
|
+
throw std::runtime_error("Unknown coef type: " + value);
|
57
|
+
}
|
58
|
+
|
59
|
+
template<>
|
60
|
+
UseDepthImp from_ruby<UseDepthImp>(Object x)
|
61
|
+
{
|
62
|
+
auto value = x.to_s().str();
|
63
|
+
if (value == "lower") return Lower;
|
64
|
+
if (value == "higher") return Higher;
|
65
|
+
if (value == "same") return Same;
|
66
|
+
throw std::runtime_error("Unknown depth imp: " + value);
|
67
|
+
}
|
68
|
+
|
69
|
+
template<>
|
70
|
+
WeighImpRows from_ruby<WeighImpRows>(Object x)
|
71
|
+
{
|
72
|
+
auto value = x.to_s().str();
|
73
|
+
if (value == "inverse") return Inverse;
|
74
|
+
if (value == "prop") return Prop;
|
75
|
+
if (value == "flat") return Flat;
|
76
|
+
throw std::runtime_error("Unknown weight imp rows: " + value);
|
77
|
+
}
|
78
|
+
|
19
79
|
extern "C"
|
20
80
|
void Init_ext()
|
21
81
|
{
|
@@ -54,25 +114,7 @@ void Init_ext()
|
|
54
114
|
sparse_ix* Xc_indptr = NULL;
|
55
115
|
|
56
116
|
// options
|
57
|
-
|
58
|
-
double* sample_weights = NULL;
|
59
|
-
bool weight_as_sample = false;
|
60
|
-
size_t max_depth = 0;
|
61
|
-
bool limit_depth = true;
|
62
|
-
bool standardize_dist = false;
|
63
|
-
double* tmat = NULL;
|
64
|
-
double* output_depths = NULL;
|
65
|
-
bool standardize_depth = false;
|
66
|
-
double* col_weights = NULL;
|
67
|
-
MissingAction missing_action = Impute;
|
68
|
-
CategSplit cat_split_type = SubSet;
|
69
|
-
NewCategAction new_cat_action = Smallest;
|
70
|
-
Imputer *imputer = NULL;
|
71
|
-
UseDepthImp depth_imp = Higher;
|
72
|
-
WeighImpRows weigh_imp_rows = Inverse;
|
73
|
-
bool impute_at_fit = false;
|
74
|
-
|
75
|
-
// Rice has limit of 14 arguments, so use hash for options
|
117
|
+
// Rice has limit of 14 arguments, so use hash
|
76
118
|
size_t sample_size = options.get<size_t, Symbol>("sample_size");
|
77
119
|
size_t ndim = options.get<size_t, Symbol>("ndim");
|
78
120
|
size_t ntrees = options.get<size_t, Symbol>("ntrees");
|
@@ -82,15 +124,34 @@ void Init_ext()
|
|
82
124
|
double prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
|
83
125
|
double prob_split_by_gain_pl = options.get<double, Symbol>("prob_split_pooled_gain");
|
84
126
|
double min_gain = options.get<double, Symbol>("min_gain");
|
127
|
+
MissingAction missing_action = options.get<MissingAction, Symbol>("missing_action");
|
128
|
+
CategSplit cat_split_type = options.get<CategSplit, Symbol>("categ_split_type");
|
129
|
+
NewCategAction new_cat_action = options.get<NewCategAction, Symbol>("new_categ_action");
|
85
130
|
bool all_perm = options.get<bool, Symbol>("all_perm");
|
86
131
|
bool coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
|
87
132
|
bool with_replacement = options.get<bool, Symbol>("sample_with_replacement");
|
88
133
|
bool penalize_range = options.get<bool, Symbol>("penalize_range");
|
89
134
|
bool weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
|
135
|
+
CoefType coef_type = options.get<CoefType, Symbol>("coefs");
|
90
136
|
size_t min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
|
137
|
+
UseDepthImp depth_imp = options.get<UseDepthImp, Symbol>("depth_imp");
|
138
|
+
WeighImpRows weigh_imp_rows = options.get<WeighImpRows, Symbol>("weigh_imp_rows");
|
91
139
|
uint64_t random_seed = options.get<uint64_t, Symbol>("random_seed");
|
92
140
|
int nthreads = options.get<int, Symbol>("nthreads");
|
93
141
|
|
142
|
+
// TODO options
|
143
|
+
double* sample_weights = NULL;
|
144
|
+
bool weight_as_sample = false;
|
145
|
+
size_t max_depth = 0;
|
146
|
+
bool limit_depth = true;
|
147
|
+
bool standardize_dist = false;
|
148
|
+
double* tmat = NULL;
|
149
|
+
double* output_depths = NULL;
|
150
|
+
bool standardize_depth = false;
|
151
|
+
double* col_weights = NULL;
|
152
|
+
Imputer *imputer = NULL;
|
153
|
+
bool impute_at_fit = false;
|
154
|
+
|
94
155
|
fit_iforest(
|
95
156
|
NULL,
|
96
157
|
&iso,
|
@@ -4,9 +4,11 @@ module IsoTree
|
|
4
4
|
sample_size: nil, ntrees: 500, ndim: 3, ntry: 3,
|
5
5
|
prob_pick_avg_gain: 0, prob_pick_pooled_gain: 0,
|
6
6
|
prob_split_avg_gain: 0, prob_split_pooled_gain: 0,
|
7
|
-
min_gain: 0,
|
7
|
+
min_gain: 0, missing_action: "impute", new_categ_action: "smallest",
|
8
|
+
categ_split_type: "subset", all_perm: false, coef_by_prop: false,
|
8
9
|
sample_with_replacement: false, penalize_range: true,
|
9
|
-
weigh_by_kurtosis: false,
|
10
|
+
weigh_by_kurtosis: false, coefs: "normal", min_imp_obs: 3, depth_imp: "higher",
|
11
|
+
weigh_imp_rows: "inverse", random_seed: 1, nthreads: -1
|
10
12
|
)
|
11
13
|
|
12
14
|
@sample_size = sample_size
|
@@ -18,12 +20,18 @@ module IsoTree
|
|
18
20
|
@prob_split_avg_gain = prob_split_avg_gain
|
19
21
|
@prob_split_pooled_gain = prob_split_pooled_gain
|
20
22
|
@min_gain = min_gain
|
23
|
+
@missing_action = missing_action
|
24
|
+
@new_categ_action = new_categ_action
|
25
|
+
@categ_split_type = categ_split_type
|
21
26
|
@all_perm = all_perm
|
22
27
|
@coef_by_prop = coef_by_prop
|
23
28
|
@sample_with_replacement = sample_with_replacement
|
24
29
|
@penalize_range = penalize_range
|
25
30
|
@weigh_by_kurtosis = weigh_by_kurtosis
|
31
|
+
@coefs = coefs
|
26
32
|
@min_imp_obs = min_imp_obs
|
33
|
+
@depth_imp = depth_imp
|
34
|
+
@weigh_imp_rows = weigh_imp_rows
|
27
35
|
@random_seed = random_seed
|
28
36
|
|
29
37
|
# etc module returns virtual cores
|
@@ -138,10 +146,11 @@ module IsoTree
|
|
138
146
|
sample_size ntrees ndim ntry
|
139
147
|
prob_pick_avg_gain prob_pick_pooled_gain
|
140
148
|
prob_split_avg_gain prob_split_pooled_gain
|
141
|
-
min_gain
|
149
|
+
min_gain missing_action new_categ_action
|
150
|
+
categ_split_type all_perm coef_by_prop
|
142
151
|
sample_with_replacement penalize_range
|
143
|
-
weigh_by_kurtosis min_imp_obs
|
144
|
-
random_seed nthreads
|
152
|
+
weigh_by_kurtosis coefs min_imp_obs depth_imp
|
153
|
+
weigh_imp_rows random_seed nthreads
|
145
154
|
)
|
146
155
|
options = {}
|
147
156
|
keys.each do |k|
|
data/lib/isotree/version.rb
CHANGED
data/vendor/isotree/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# IsoTree
|
2
2
|
|
3
|
-
Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R.
|
3
|
+
Fast and multi-threaded implementation of Extended Isolation Forest, Fair-Cut Forest, SCiForest (a.k.a. Split-Criterion iForest), and regular Isolation Forest, for outlier/anomaly detection, plus additions for imputation of missing values, distance/similarity calculation between observations, and handling of categorical data. Written in C++ with interfaces for Python and R. An additional wrapper for Ruby can be found [here](https://github.com/ankane/isotree).
|
4
4
|
|
5
5
|
The new concepts in this software are described in:
|
6
6
|
* [Distance approximation using Isolation Forests](https://arxiv.org/abs/1910.12362)
|
@@ -82,6 +82,10 @@ sudo ldconfig
|
|
82
82
|
|
83
83
|
(Will build as a shared object - linkage is then done with `-lisotree`)
|
84
84
|
|
85
|
+
* Ruby
|
86
|
+
|
87
|
+
See [external repository with wrapper](https://github.com/ankane/isotree).
|
88
|
+
|
85
89
|
# Sample usage
|
86
90
|
|
87
91
|
**Warning: default parameters in this implementation are very different from default parameters in others such as SciKit-Learn's, and these defaults won't scale to large datasets (see documentation for details).**
|
@@ -1,7 +1,7 @@
|
|
1
1
|
/* Isolation forests and variations thereof, with adjustments for incorporation
|
2
2
|
* of categorical variables and missing values.
|
3
3
|
* Written for C++11 standard and aimed at being used in R and Python.
|
4
|
-
*
|
4
|
+
*
|
5
5
|
* This library is based on the following works:
|
6
6
|
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
7
7
|
* "Isolation forest."
|
@@ -20,7 +20,7 @@
|
|
20
20
|
* [7] Quinlan, J. Ross. C4. 5: programs for machine learning. Elsevier, 2014.
|
21
21
|
* [8] Cortes, David. "Distance approximation using Isolation Forests." arXiv preprint arXiv:1910.12362 (2019).
|
22
22
|
* [9] Cortes, David. "Imputing missing values with unsupervised random trees." arXiv preprint arXiv:1911.06646 (2019).
|
23
|
-
*
|
23
|
+
*
|
24
24
|
* BSD 2-Clause License
|
25
25
|
* Copyright (c) 2019, David Cortes
|
26
26
|
* All rights reserved.
|
@@ -47,7 +47,7 @@
|
|
47
47
|
bool interrupt_switch;
|
48
48
|
|
49
49
|
/* Fit Isolation Forest model, or variant of it such as SCiForest
|
50
|
-
*
|
50
|
+
*
|
51
51
|
* Parameters:
|
52
52
|
* ===========
|
53
53
|
* - model_outputs (out)
|
@@ -291,7 +291,7 @@ bool interrupt_switch;
|
|
291
291
|
* Number of parallel threads to use. Note that, the more threads, the more memory will be
|
292
292
|
* allocated, even if the thread does not end up being used. Ignored when not building with
|
293
293
|
* OpenMP support.
|
294
|
-
*
|
294
|
+
*
|
295
295
|
* Returns
|
296
296
|
* =======
|
297
297
|
* Will return macro 'EXIT_SUCCESS' (typically =0) upon completion.
|
@@ -300,7 +300,7 @@ bool interrupt_switch;
|
|
300
300
|
* what these values correspond to, you can use the functions
|
301
301
|
* 'return_EXIT_SUCESS' and 'return_EXIT_FAILURE', which will return them
|
302
302
|
* as integers.
|
303
|
-
*
|
303
|
+
*
|
304
304
|
* References
|
305
305
|
* ==========
|
306
306
|
* [1] Liu, Fei Tony, Kai Ming Ting, and Zhi-Hua Zhou.
|
@@ -418,6 +418,13 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
418
418
|
|
419
419
|
/* Global variable that determines if the procedure receives a stop signal */
|
420
420
|
interrupt_switch = false;
|
421
|
+
/* TODO: find a better way of handling interrupt signals when calling in Python/R.
|
422
|
+
The following will still change the behavior of interrupts when called through e.g. Flask */
|
423
|
+
#if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
|
424
|
+
struct sigaction sig_handle;
|
425
|
+
sig_handle.sa_flags = SA_RESETHAND;
|
426
|
+
sig_handle.sa_handler = set_interrup_global_variable;
|
427
|
+
#endif
|
421
428
|
|
422
429
|
/* grow trees */
|
423
430
|
#pragma omp parallel for num_threads(nthreads) schedule(dynamic) shared(model_outputs, model_outputs_ext, worker_memory, input_data, model_params)
|
@@ -461,7 +468,11 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
461
468
|
else
|
462
469
|
model_outputs_ext->hplanes[tree].shrink_to_fit();
|
463
470
|
|
464
|
-
|
471
|
+
#if !defined(_WIN32) && !defined(_WIN64) && !defined(_MSC_VER)
|
472
|
+
// sigaction(SIGINT, &sig_handle, NULL);
|
473
|
+
#else
|
474
|
+
// signal(SIGINT, set_interrup_global_variable);
|
475
|
+
#endif
|
465
476
|
}
|
466
477
|
|
467
478
|
/* check if the procedure got interrupted */
|
@@ -545,7 +556,7 @@ int fit_iforest(IsoForest *model_outputs, ExtIsoForest *model_outputs_ext,
|
|
545
556
|
|
546
557
|
|
547
558
|
/* Add additional trees to already-fitted isolation forest model
|
548
|
-
*
|
559
|
+
*
|
549
560
|
* Parameters
|
550
561
|
* ==========
|
551
562
|
* - model_outputs
|
@@ -1001,7 +1012,7 @@ void fit_itree(std::vector<IsoTree> *tree_root,
|
|
1001
1012
|
if (
|
1002
1013
|
model_params.cat_split_type == SubSet &&
|
1003
1014
|
(
|
1004
|
-
model_params.prob_pick_by_gain_avg ||
|
1015
|
+
model_params.prob_pick_by_gain_avg ||
|
1005
1016
|
model_params.prob_pick_by_gain_pl
|
1006
1017
|
)
|
1007
1018
|
)
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: isotree
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.1.4
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2020-08-
|
11
|
+
date: 2020-08-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|