anomaly_detection 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +3 -2
- data/ext/anomaly_detection/anomaly_detection.cpp +44 -47
- data/ext/anomaly_detection/anomaly_detection.hpp +12 -0
- data/ext/anomaly_detection/ext.cpp +16 -4
- data/lib/anomaly_detection/version.rb +1 -1
- data/lib/anomaly_detection.rb +2 -2
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e8882d64f5b2fe33406fa583404a3efead153d47a434ac6f5a1ec21d10666389
|
4
|
+
data.tar.gz: f0544946e1cb9011e32c7f3f0a8ae4fb90f9d1a2c7acca67c5f9e8f27f8780af
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d30c25bf7a1a7069b7ba7c430e928677f5636261cdb1bc4ff24745103bbb2769ad065a2f368b32440eb1b5fce6483e206667b462542912f44f60a9b61494f360
|
7
|
+
data.tar.gz: 4a3aba5aeed9e5488c77448d38f331cb3241db6cbf518fc5c4f8d00ca480ce13e53819e65b9a66970e8ca175349db00a4ae6b5b6bc8833f82a44219f840419b9
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# AnomalyDetection.rb
|
2
2
|
|
3
|
-
:fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
|
3
|
+
:fire: Time series [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
|
4
4
|
|
5
5
|
Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
|
6
6
|
|
@@ -53,7 +53,8 @@ AnomalyDetection.detect(
|
|
53
53
|
period: 7, # number of observations in a single period
|
54
54
|
alpha: 0.05, # level of statistical significance
|
55
55
|
max_anoms: 0.1, # maximum number of anomalies as percent of data
|
56
|
-
direction: "both"
|
56
|
+
direction: "both", # pos, neg, or both
|
57
|
+
verbose: false # show progress
|
57
58
|
)
|
58
59
|
```
|
59
60
|
|
@@ -1,24 +1,27 @@
|
|
1
|
-
#include <
|
1
|
+
#include <functional>
|
2
2
|
#include <iostream>
|
3
3
|
#include <iterator>
|
4
|
+
#include <numeric>
|
5
|
+
#include <string>
|
4
6
|
#include <vector>
|
5
7
|
|
8
|
+
#include "anomaly_detection.hpp"
|
6
9
|
#include "cdflib.hpp"
|
7
10
|
#include "stl.hpp"
|
8
11
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
+
namespace anomaly_detection {
|
13
|
+
|
14
|
+
float median(const std::vector<float>& sorted) {
|
12
15
|
return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
|
13
16
|
}
|
14
17
|
|
15
|
-
float mad(const std::vector<float>& data) {
|
16
|
-
auto med = median(data);
|
18
|
+
float mad(const std::vector<float>& data, float med) {
|
17
19
|
std::vector<float> res;
|
18
20
|
res.reserve(data.size());
|
19
21
|
for (auto v : data) {
|
20
22
|
res.push_back(fabs(v - med));
|
21
23
|
}
|
24
|
+
std::sort(res.begin(), res.end());
|
22
25
|
return 1.4826 * median(res);
|
23
26
|
}
|
24
27
|
|
@@ -36,45 +39,51 @@ float qt(double p, double df) {
|
|
36
39
|
return t;
|
37
40
|
}
|
38
41
|
|
39
|
-
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
|
40
|
-
auto
|
42
|
+
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> interrupt) {
|
43
|
+
auto n = data.size();
|
41
44
|
|
42
45
|
// Check to make sure we have at least two periods worth of data for anomaly context
|
43
|
-
|
46
|
+
if (n < num_obs_per_period * 2) {
|
47
|
+
throw std::invalid_argument("series must contain at least 2 periods");
|
48
|
+
}
|
44
49
|
|
45
50
|
// Handle NANs
|
46
51
|
auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
|
47
52
|
if (nan > 0) {
|
48
|
-
throw std::invalid_argument("
|
53
|
+
throw std::invalid_argument("series contains NANs");
|
49
54
|
}
|
50
55
|
|
51
56
|
// Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
|
52
|
-
auto seasonal_length =
|
57
|
+
auto seasonal_length = n * 10 + 1;
|
53
58
|
auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
|
54
59
|
|
55
60
|
auto seasonal = data_decomp.seasonal;
|
56
61
|
auto med = median(data);
|
57
62
|
std::vector<float> data2;
|
58
|
-
data2.reserve(
|
59
|
-
for (auto i = 0; i <
|
63
|
+
data2.reserve(n);
|
64
|
+
for (auto i = 0; i < n; i++) {
|
60
65
|
data2.push_back(data[i] - seasonal[i] - med);
|
61
66
|
}
|
62
67
|
|
63
|
-
auto max_outliers = (size_t) num_obs * k;
|
64
|
-
assert(max_outliers > 0);
|
65
|
-
|
66
|
-
auto n = data2.size();
|
67
|
-
|
68
68
|
std::vector<size_t> r_idx;
|
69
|
+
auto num_anoms = 0;
|
70
|
+
auto max_outliers = (size_t) n * k;
|
69
71
|
|
70
|
-
|
71
|
-
indexes
|
72
|
-
|
73
|
-
|
74
|
-
|
72
|
+
// Sort data for fast median
|
73
|
+
std::vector<size_t> indexes(n);
|
74
|
+
std::iota(indexes.begin(), indexes.end(), 0);
|
75
|
+
std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
|
76
|
+
std::sort(data2.begin(), data2.end());
|
75
77
|
|
76
78
|
// Compute test statistic until r=max_outliers values have been removed from the sample
|
77
79
|
for (auto i = 1; i <= max_outliers; i++) {
|
80
|
+
// Check for interrupts
|
81
|
+
interrupt();
|
82
|
+
|
83
|
+
if (verbose) {
|
84
|
+
std::cout << i << " / " << max_outliers << " completed" << std::endl;
|
85
|
+
}
|
86
|
+
|
78
87
|
// TODO Improve performance between loop iterations
|
79
88
|
auto ma = median(data2);
|
80
89
|
std::vector<float> ares;
|
@@ -96,19 +105,18 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
|
|
96
105
|
}
|
97
106
|
|
98
107
|
// Protect against constant time series
|
99
|
-
auto data_sigma = mad(data2);
|
108
|
+
auto data_sigma = mad(data2, ma);
|
100
109
|
if (data_sigma == 0.0) {
|
101
110
|
break;
|
102
111
|
}
|
103
112
|
|
104
113
|
auto iter = std::max_element(ares.begin(), ares.end());
|
105
114
|
auto r_idx_i = std::distance(ares.begin(), iter);
|
106
|
-
auto r_idx_i2 = indexes[r_idx_i];
|
107
115
|
|
108
116
|
// Only need to take sigma of r for performance
|
109
117
|
auto r = ares[r_idx_i] / data_sigma;
|
110
118
|
|
111
|
-
|
119
|
+
r_idx.push_back(indexes[r_idx_i]);
|
112
120
|
data2.erase(data2.begin() + r_idx_i);
|
113
121
|
indexes.erase(indexes.begin() + r_idx_i);
|
114
122
|
|
@@ -124,33 +132,22 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
|
|
124
132
|
auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
|
125
133
|
|
126
134
|
if (r > lam) {
|
127
|
-
|
128
|
-
} else {
|
129
|
-
break;
|
135
|
+
num_anoms = i;
|
130
136
|
}
|
131
137
|
}
|
132
138
|
|
139
|
+
std::vector<size_t> anomalies(r_idx.begin(), r_idx.begin() + num_anoms);
|
140
|
+
|
133
141
|
// Sort like R version
|
134
|
-
std::sort(
|
142
|
+
std::sort(anomalies.begin(), anomalies.end());
|
135
143
|
|
136
|
-
return
|
144
|
+
return anomalies;
|
137
145
|
}
|
138
146
|
|
139
|
-
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha,
|
140
|
-
bool one_tail;
|
141
|
-
bool upper_tail;
|
142
|
-
|
143
|
-
|
144
|
-
upper_tail = true;
|
145
|
-
} else if (direction == "neg") {
|
146
|
-
one_tail = true;
|
147
|
-
upper_tail = false;
|
148
|
-
} else if (direction == "both") {
|
149
|
-
one_tail = false;
|
150
|
-
upper_tail = true; // not used
|
151
|
-
} else {
|
152
|
-
throw std::invalid_argument("Bad direction");
|
153
|
-
}
|
147
|
+
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt) {
|
148
|
+
bool one_tail = direction != Direction::Both;
|
149
|
+
bool upper_tail = direction == Direction::Positive;
|
150
|
+
return detect_anoms(x, period, k, alpha, one_tail, upper_tail, verbose, interrupt);
|
151
|
+
}
|
154
152
|
|
155
|
-
return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
|
156
153
|
}
|
@@ -0,0 +1,12 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include <string>
|
4
|
+
#include <vector>
|
5
|
+
|
6
|
+
namespace anomaly_detection {
|
7
|
+
|
8
|
+
enum Direction { Positive, Negative, Both };
|
9
|
+
|
10
|
+
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
|
11
|
+
|
12
|
+
}
|
@@ -1,8 +1,9 @@
|
|
1
|
-
// rice
|
2
1
|
#include <rice/rice.hpp>
|
3
2
|
#include <rice/stl.hpp>
|
4
3
|
|
5
|
-
|
4
|
+
#include "anomaly_detection.hpp"
|
5
|
+
|
6
|
+
using anomaly_detection::Direction;
|
6
7
|
|
7
8
|
extern "C"
|
8
9
|
void Init_ext() {
|
@@ -11,8 +12,19 @@ void Init_ext() {
|
|
11
12
|
rb_mAnomalyDetection
|
12
13
|
.define_singleton_function(
|
13
14
|
"_detect",
|
14
|
-
[](std::vector<float> x, int period, float k, float alpha, const std::string& direction) {
|
15
|
-
|
15
|
+
[](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
|
16
|
+
Direction dir;
|
17
|
+
if (direction == "pos") {
|
18
|
+
dir = Direction::Positive;
|
19
|
+
} else if (direction == "neg") {
|
20
|
+
dir = Direction::Negative;
|
21
|
+
} else if (direction == "both") {
|
22
|
+
dir = Direction::Both;
|
23
|
+
} else {
|
24
|
+
throw std::invalid_argument("direction must be pos, neg, or both");
|
25
|
+
}
|
26
|
+
|
27
|
+
auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
|
16
28
|
|
17
29
|
auto a = Rice::Array();
|
18
30
|
for (auto v : res) {
|
data/lib/anomaly_detection.rb
CHANGED
@@ -5,7 +5,7 @@ require "anomaly_detection/ext"
|
|
5
5
|
require "anomaly_detection/version"
|
6
6
|
|
7
7
|
module AnomalyDetection
|
8
|
-
def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both")
|
8
|
+
def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", verbose: false)
|
9
9
|
raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
|
10
10
|
|
11
11
|
if series.is_a?(Hash)
|
@@ -15,7 +15,7 @@ module AnomalyDetection
|
|
15
15
|
x = series
|
16
16
|
end
|
17
17
|
|
18
|
-
res = _detect(x, period, max_anoms, alpha, direction)
|
18
|
+
res = _detect(x, period, max_anoms, alpha, direction, verbose)
|
19
19
|
res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
|
20
20
|
res
|
21
21
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anomaly_detection
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-10-
|
11
|
+
date: 2021-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -36,6 +36,7 @@ files:
|
|
36
36
|
- NOTICE.txt
|
37
37
|
- README.md
|
38
38
|
- ext/anomaly_detection/anomaly_detection.cpp
|
39
|
+
- ext/anomaly_detection/anomaly_detection.hpp
|
39
40
|
- ext/anomaly_detection/cdflib.cpp
|
40
41
|
- ext/anomaly_detection/cdflib.hpp
|
41
42
|
- ext/anomaly_detection/ext.cpp
|
@@ -68,5 +69,5 @@ requirements: []
|
|
68
69
|
rubygems_version: 3.2.22
|
69
70
|
signing_key:
|
70
71
|
specification_version: 4
|
71
|
-
summary:
|
72
|
+
summary: Time series anomaly detection for Ruby
|
72
73
|
test_files: []
|