isotree 0.4.1 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/LICENSE.txt +1 -1
- data/ext/isotree/ext.cpp +89 -90
- data/lib/isotree/version.rb +1 -1
- metadata +3 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: fc614bbd4c6751f60bb22751881a739965950eb4bc8fe225521d6d0caa67e0c1
|
|
4
|
+
data.tar.gz: ef7305b765fc240173173f9926057bef019347a2769f3f91cafd048c9493957b
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 641b9f59f1ec699dfdcbc19f882173306f677156ddfd9f8f6b49754c4fc703ef1d873ec30db152a5a252937f01b9a233d849645406346bed780e2f87ba827763
|
|
7
|
+
data.tar.gz: 8be8a5643ecd5dfde93aef878d643e3e47cf84943062137c66dbb27c7fd1501a3df746520764b420568bc6e8d162cbd8f3b8d37a94aa8f2e64292a35d5a8e6fa
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
data/ext/isotree/ext.cpp
CHANGED
|
@@ -1,8 +1,12 @@
|
|
|
1
1
|
// stdlib
|
|
2
2
|
#include <cmath>
|
|
3
|
+
#include <cstddef>
|
|
4
|
+
#include <cstdint>
|
|
3
5
|
#include <fstream>
|
|
4
6
|
#include <iostream>
|
|
7
|
+
#include <stdexcept>
|
|
5
8
|
#include <string>
|
|
9
|
+
#include <string_view>
|
|
6
10
|
#include <vector>
|
|
7
11
|
|
|
8
12
|
// isotree
|
|
@@ -120,77 +124,77 @@ void Init_ext() {
|
|
|
120
124
|
ExtIsoForest iso;
|
|
121
125
|
|
|
122
126
|
// data
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
127
|
+
auto nrows = options.get<size_t, Symbol>("nrows");
|
|
128
|
+
auto ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
|
129
|
+
auto ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
|
126
130
|
|
|
127
|
-
real_t* numeric_data =
|
|
131
|
+
real_t* numeric_data = nullptr;
|
|
128
132
|
if (ncols_numeric > 0) {
|
|
129
|
-
numeric_data = (
|
|
133
|
+
numeric_data = reinterpret_cast<real_t*>(const_cast<char*>(options.get<String, Symbol>("numeric_data").c_str()));
|
|
130
134
|
}
|
|
131
135
|
|
|
132
|
-
int* categorical_data =
|
|
133
|
-
int* ncat =
|
|
136
|
+
int* categorical_data = nullptr;
|
|
137
|
+
int* ncat = nullptr;
|
|
134
138
|
if (ncols_categ > 0) {
|
|
135
|
-
categorical_data = (
|
|
136
|
-
ncat = (
|
|
139
|
+
categorical_data = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("categorical_data").c_str()));
|
|
140
|
+
ncat = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("ncat").c_str()));
|
|
137
141
|
}
|
|
138
142
|
|
|
139
143
|
// not used (sparse matrices)
|
|
140
|
-
real_t* Xc =
|
|
141
|
-
sparse_ix* Xc_ind =
|
|
142
|
-
sparse_ix* Xc_indptr =
|
|
144
|
+
real_t* Xc = nullptr;
|
|
145
|
+
sparse_ix* Xc_ind = nullptr;
|
|
146
|
+
sparse_ix* Xc_indptr = nullptr;
|
|
143
147
|
|
|
144
148
|
// options
|
|
145
149
|
// Rice has limit of 14 arguments, so use hash
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
150
|
+
auto sample_size = options.get<size_t, Symbol>("sample_size");
|
|
151
|
+
auto ndim = options.get<size_t, Symbol>("ndim");
|
|
152
|
+
auto ntrees = options.get<size_t, Symbol>("ntrees");
|
|
153
|
+
auto ntry = options.get<size_t, Symbol>("ntry");
|
|
154
|
+
auto prob_pick_by_gain_avg = options.get<double, Symbol>("prob_pick_avg_gain");
|
|
155
|
+
auto prob_pick_by_gain_pl = options.get<double, Symbol>("prob_pick_pooled_gain");
|
|
156
|
+
auto min_gain = options.get<double, Symbol>("min_gain");
|
|
157
|
+
auto missing_action = options.get<MissingAction, Symbol>("missing_action");
|
|
158
|
+
auto cat_split_type = options.get<CategSplit, Symbol>("categ_split_type");
|
|
159
|
+
auto new_cat_action = options.get<NewCategAction, Symbol>("new_categ_action");
|
|
160
|
+
auto all_perm = options.get<bool, Symbol>("all_perm");
|
|
161
|
+
auto coef_by_prop = options.get<bool, Symbol>("coef_by_prop");
|
|
162
|
+
auto with_replacement = options.get<bool, Symbol>("sample_with_replacement");
|
|
163
|
+
auto penalize_range = options.get<bool, Symbol>("penalize_range");
|
|
164
|
+
auto weigh_by_kurt = options.get<bool, Symbol>("weigh_by_kurtosis");
|
|
165
|
+
auto coef_type = options.get<CoefType, Symbol>("coefs");
|
|
166
|
+
auto min_imp_obs = options.get<size_t, Symbol>("min_imp_obs");
|
|
167
|
+
auto depth_imp = options.get<UseDepthImp, Symbol>("depth_imp");
|
|
168
|
+
auto weigh_imp_rows = options.get<WeighImpRows, Symbol>("weigh_imp_rows");
|
|
169
|
+
auto random_seed = options.get<uint64_t, Symbol>("random_seed");
|
|
170
|
+
auto use_long_double = options.get<bool, Symbol>("use_long_double");
|
|
171
|
+
auto nthreads = options.get<int, Symbol>("nthreads");
|
|
168
172
|
|
|
169
173
|
// TODO options
|
|
170
|
-
double* sample_weights =
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
+
double* sample_weights = nullptr;
|
|
175
|
+
auto weight_as_sample = options.get<bool, Symbol>("weights_as_sample_prob");
|
|
176
|
+
auto max_depth = options.get<size_t, Symbol>("max_depth");
|
|
177
|
+
auto limit_depth = options.get<bool, Symbol>("limit_depth");
|
|
174
178
|
bool standardize_dist = false;
|
|
175
|
-
double* tmat =
|
|
176
|
-
double* output_depths =
|
|
179
|
+
double* tmat = nullptr;
|
|
180
|
+
double* output_depths = nullptr;
|
|
177
181
|
bool standardize_depth = false;
|
|
178
|
-
real_t* col_weights =
|
|
179
|
-
Imputer* imputer =
|
|
182
|
+
real_t* col_weights = nullptr;
|
|
183
|
+
Imputer* imputer = nullptr;
|
|
180
184
|
bool impute_at_fit = false;
|
|
181
185
|
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
186
|
+
auto ncols_per_tree = options.get<int, Symbol>("ncols_per_tree");
|
|
187
|
+
auto standardize_data = options.get<bool, Symbol>("standardize_data");
|
|
188
|
+
auto scoring_metric = options.get<ScoringMetric, Symbol>("scoring_metric");
|
|
189
|
+
auto fast_bratio = options.get<bool, Symbol>("fast_bratio");
|
|
190
|
+
auto prob_pick_by_full_gain = options.get<double, Symbol>("prob_pick_full_gain");
|
|
191
|
+
auto prob_pick_by_dens = options.get<double, Symbol>("prob_pick_dens");
|
|
192
|
+
auto prob_pick_col_by_range = options.get<double, Symbol>("prob_pick_col_by_range");
|
|
193
|
+
auto prob_pick_col_by_var = options.get<double, Symbol>("prob_pick_col_by_var");
|
|
194
|
+
auto prob_pick_col_by_kurt = options.get<double, Symbol>("prob_pick_col_by_kurt");
|
|
191
195
|
|
|
192
196
|
fit_iforest(
|
|
193
|
-
|
|
197
|
+
nullptr,
|
|
194
198
|
&iso,
|
|
195
199
|
numeric_data,
|
|
196
200
|
ncols_numeric,
|
|
@@ -251,37 +255,37 @@ void Init_ext() {
|
|
|
251
255
|
"predict_iforest",
|
|
252
256
|
[](ExtIsoForest& iso, Hash options) {
|
|
253
257
|
// data
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
258
|
+
auto nrows = options.get<size_t, Symbol>("nrows");
|
|
259
|
+
auto ncols_numeric = options.get<size_t, Symbol>("ncols_numeric");
|
|
260
|
+
auto ncols_categ = options.get<size_t, Symbol>("ncols_categ");
|
|
257
261
|
|
|
258
|
-
real_t* numeric_data =
|
|
262
|
+
real_t* numeric_data = nullptr;
|
|
259
263
|
if (ncols_numeric > 0) {
|
|
260
|
-
numeric_data = (
|
|
264
|
+
numeric_data = reinterpret_cast<real_t*>(const_cast<char*>(options.get<String, Symbol>("numeric_data").c_str()));
|
|
261
265
|
}
|
|
262
266
|
|
|
263
|
-
int* categorical_data =
|
|
267
|
+
int* categorical_data = nullptr;
|
|
264
268
|
if (ncols_categ > 0) {
|
|
265
|
-
categorical_data = (
|
|
269
|
+
categorical_data = reinterpret_cast<int*>(const_cast<char*>(options.get<String, Symbol>("categorical_data").c_str()));
|
|
266
270
|
}
|
|
267
271
|
|
|
268
272
|
// not used (sparse matrices)
|
|
269
|
-
real_t* Xc =
|
|
270
|
-
sparse_ix* Xc_ind =
|
|
271
|
-
sparse_ix* Xc_indptr =
|
|
272
|
-
real_t* Xr =
|
|
273
|
-
sparse_ix* Xr_ind =
|
|
274
|
-
sparse_ix* Xr_indptr =
|
|
273
|
+
real_t* Xc = nullptr;
|
|
274
|
+
sparse_ix* Xc_ind = nullptr;
|
|
275
|
+
sparse_ix* Xc_indptr = nullptr;
|
|
276
|
+
real_t* Xr = nullptr;
|
|
277
|
+
sparse_ix* Xr_ind = nullptr;
|
|
278
|
+
sparse_ix* Xr_indptr = nullptr;
|
|
275
279
|
|
|
276
280
|
// options
|
|
277
|
-
|
|
278
|
-
|
|
281
|
+
auto nthreads = options.get<int, Symbol>("nthreads");
|
|
282
|
+
auto standardize = options.get<bool, Symbol>("standardize");
|
|
279
283
|
std::vector<double> outlier_scores(nrows);
|
|
280
|
-
sparse_ix* tree_num =
|
|
284
|
+
sparse_ix* tree_num = nullptr;
|
|
281
285
|
bool is_col_major = true;
|
|
282
286
|
size_t ld_numeric = 0;
|
|
283
287
|
size_t ld_categ = 0;
|
|
284
|
-
double* per_tree_depths =
|
|
288
|
+
double* per_tree_depths = nullptr;
|
|
285
289
|
|
|
286
290
|
predict_iforest(
|
|
287
291
|
numeric_data,
|
|
@@ -298,49 +302,49 @@ void Init_ext() {
|
|
|
298
302
|
nrows,
|
|
299
303
|
nthreads,
|
|
300
304
|
standardize,
|
|
301
|
-
|
|
305
|
+
nullptr,
|
|
302
306
|
&iso,
|
|
303
307
|
outlier_scores.data(),
|
|
304
308
|
tree_num,
|
|
305
309
|
per_tree_depths,
|
|
306
|
-
|
|
310
|
+
nullptr
|
|
307
311
|
);
|
|
308
312
|
|
|
309
313
|
Array ret;
|
|
310
|
-
for (
|
|
311
|
-
ret.push(
|
|
314
|
+
for (auto v : outlier_scores) {
|
|
315
|
+
ret.push(v, false);
|
|
312
316
|
}
|
|
313
317
|
return ret;
|
|
314
318
|
})
|
|
315
319
|
.define_singleton_function(
|
|
316
320
|
"serialize_combined",
|
|
317
321
|
[](ExtIsoForest& iso, String path, String metadata) {
|
|
318
|
-
|
|
322
|
+
#ifdef _MSC_VER
|
|
319
323
|
// TODO convert to wchar_t
|
|
320
324
|
throw std::runtime_error("Not supported on Windows yet");
|
|
321
|
-
|
|
325
|
+
#else
|
|
322
326
|
std::ofstream file;
|
|
323
327
|
file.open(path.c_str());
|
|
324
328
|
serialize_combined(
|
|
325
|
-
|
|
329
|
+
nullptr,
|
|
326
330
|
&iso,
|
|
327
|
-
|
|
328
|
-
|
|
331
|
+
nullptr,
|
|
332
|
+
nullptr,
|
|
329
333
|
metadata.c_str(),
|
|
330
334
|
// returns bytesize (RSTRING_LEN)
|
|
331
335
|
metadata.length(),
|
|
332
336
|
file
|
|
333
337
|
);
|
|
334
338
|
file.close();
|
|
335
|
-
|
|
339
|
+
#endif
|
|
336
340
|
})
|
|
337
341
|
.define_singleton_function(
|
|
338
342
|
"deserialize_combined",
|
|
339
343
|
[](String path) {
|
|
340
|
-
|
|
344
|
+
#ifdef _MSC_VER
|
|
341
345
|
// TODO convert to wchar_t
|
|
342
346
|
throw std::runtime_error("Not supported on Windows yet");
|
|
343
|
-
|
|
347
|
+
#else
|
|
344
348
|
Array ret;
|
|
345
349
|
|
|
346
350
|
std::ifstream file;
|
|
@@ -386,20 +390,15 @@ void Init_ext() {
|
|
|
386
390
|
ExtIsoForest model_ext = ExtIsoForest();
|
|
387
391
|
Imputer imputer = Imputer();
|
|
388
392
|
TreesIndexer indexer = TreesIndexer();
|
|
389
|
-
char
|
|
390
|
-
if (optional_metadata == NULL) {
|
|
391
|
-
throw std::runtime_error("Cannot allocate memory");
|
|
392
|
-
}
|
|
393
|
+
std::vector<char> optional_metadata(size_metadata, 0);
|
|
393
394
|
|
|
394
|
-
deserialize_combined(file, &model, &model_ext, &imputer, &indexer, optional_metadata);
|
|
395
|
+
deserialize_combined(file, &model, &model_ext, &imputer, &indexer, optional_metadata.data());
|
|
395
396
|
file.close();
|
|
396
397
|
|
|
397
398
|
ret.push(Object(Rice::detail::To_Ruby<ExtIsoForest>().convert(model_ext)), false);
|
|
398
|
-
ret.push(String(std::
|
|
399
|
-
|
|
400
|
-
free(optional_metadata);
|
|
399
|
+
ret.push(String(std::string_view{optional_metadata.data(), optional_metadata.size()}), false);
|
|
401
400
|
|
|
402
401
|
return ret;
|
|
403
|
-
|
|
402
|
+
#endif
|
|
404
403
|
});
|
|
405
404
|
}
|
data/lib/isotree/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: isotree
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.4.1
|
|
4
|
+
version: 0.5.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
@@ -93,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
93
93
|
requirements:
|
|
94
94
|
- - ">="
|
|
95
95
|
- !ruby/object:Gem::Version
|
|
96
|
-
version: '3.
|
|
96
|
+
version: '3.3'
|
|
97
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
98
98
|
requirements:
|
|
99
99
|
- - ">="
|
|
100
100
|
- !ruby/object:Gem::Version
|
|
101
101
|
version: '0'
|
|
102
102
|
requirements: []
|
|
103
|
-
rubygems_version:
|
|
103
|
+
rubygems_version: 4.0.6
|
|
104
104
|
specification_version: 4
|
|
105
105
|
summary: Outlier/anomaly detection for Ruby using Isolation Forest
|
|
106
106
|
test_files: []
|