anomaly_detection 0.1.0 → 0.1.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/README.md +3 -2
- data/ext/anomaly_detection/anomaly_detection.cpp +44 -47
- data/ext/anomaly_detection/anomaly_detection.hpp +12 -0
- data/ext/anomaly_detection/ext.cpp +16 -4
- data/lib/anomaly_detection/version.rb +1 -1
- data/lib/anomaly_detection.rb +2 -2
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e8882d64f5b2fe33406fa583404a3efead153d47a434ac6f5a1ec21d10666389
|
4
|
+
data.tar.gz: f0544946e1cb9011e32c7f3f0a8ae4fb90f9d1a2c7acca67c5f9e8f27f8780af
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: d30c25bf7a1a7069b7ba7c430e928677f5636261cdb1bc4ff24745103bbb2769ad065a2f368b32440eb1b5fce6483e206667b462542912f44f60a9b61494f360
|
7
|
+
data.tar.gz: 4a3aba5aeed9e5488c77448d38f331cb3241db6cbf518fc5c4f8d00ca480ce13e53819e65b9a66970e8ca175349db00a4ae6b5b6bc8833f82a44219f840419b9
|
data/CHANGELOG.md
CHANGED
data/README.md
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
# AnomalyDetection.rb
|
2
2
|
|
3
|
-
:fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
|
3
|
+
:fire: Time series [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
|
4
4
|
|
5
5
|
Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
|
6
6
|
|
@@ -53,7 +53,8 @@ AnomalyDetection.detect(
|
|
53
53
|
period: 7, # number of observations in a single period
|
54
54
|
alpha: 0.05, # level of statistical significance
|
55
55
|
max_anoms: 0.1, # maximum number of anomalies as percent of data
|
56
|
-
direction: "both"
|
56
|
+
direction: "both", # pos, neg, or both
|
57
|
+
verbose: false # show progress
|
57
58
|
)
|
58
59
|
```
|
59
60
|
|
@@ -1,24 +1,27 @@
|
|
1
|
-
#include <
|
1
|
+
#include <functional>
|
2
2
|
#include <iostream>
|
3
3
|
#include <iterator>
|
4
|
+
#include <numeric>
|
5
|
+
#include <string>
|
4
6
|
#include <vector>
|
5
7
|
|
8
|
+
#include "anomaly_detection.hpp"
|
6
9
|
#include "cdflib.hpp"
|
7
10
|
#include "stl.hpp"
|
8
11
|
|
9
|
-
|
10
|
-
|
11
|
-
|
12
|
+
namespace anomaly_detection {
|
13
|
+
|
14
|
+
float median(const std::vector<float>& sorted) {
|
12
15
|
return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
|
13
16
|
}
|
14
17
|
|
15
|
-
float mad(const std::vector<float>& data) {
|
16
|
-
auto med = median(data);
|
18
|
+
float mad(const std::vector<float>& data, float med) {
|
17
19
|
std::vector<float> res;
|
18
20
|
res.reserve(data.size());
|
19
21
|
for (auto v : data) {
|
20
22
|
res.push_back(fabs(v - med));
|
21
23
|
}
|
24
|
+
std::sort(res.begin(), res.end());
|
22
25
|
return 1.4826 * median(res);
|
23
26
|
}
|
24
27
|
|
@@ -36,45 +39,51 @@ float qt(double p, double df) {
|
|
36
39
|
return t;
|
37
40
|
}
|
38
41
|
|
39
|
-
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
|
40
|
-
auto
|
42
|
+
std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> interrupt) {
|
43
|
+
auto n = data.size();
|
41
44
|
|
42
45
|
// Check to make sure we have at least two periods worth of data for anomaly context
|
43
|
-
|
46
|
+
if (n < num_obs_per_period * 2) {
|
47
|
+
throw std::invalid_argument("series must contain at least 2 periods");
|
48
|
+
}
|
44
49
|
|
45
50
|
// Handle NANs
|
46
51
|
auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
|
47
52
|
if (nan > 0) {
|
48
|
-
throw std::invalid_argument("
|
53
|
+
throw std::invalid_argument("series contains NANs");
|
49
54
|
}
|
50
55
|
|
51
56
|
// Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
|
52
|
-
auto seasonal_length =
|
57
|
+
auto seasonal_length = n * 10 + 1;
|
53
58
|
auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
|
54
59
|
|
55
60
|
auto seasonal = data_decomp.seasonal;
|
56
61
|
auto med = median(data);
|
57
62
|
std::vector<float> data2;
|
58
|
-
data2.reserve(
|
59
|
-
for (auto i = 0; i <
|
63
|
+
data2.reserve(n);
|
64
|
+
for (auto i = 0; i < n; i++) {
|
60
65
|
data2.push_back(data[i] - seasonal[i] - med);
|
61
66
|
}
|
62
67
|
|
63
|
-
auto max_outliers = (size_t) num_obs * k;
|
64
|
-
assert(max_outliers > 0);
|
65
|
-
|
66
|
-
auto n = data2.size();
|
67
|
-
|
68
68
|
std::vector<size_t> r_idx;
|
69
|
+
auto num_anoms = 0;
|
70
|
+
auto max_outliers = (size_t) n * k;
|
69
71
|
|
70
|
-
|
71
|
-
indexes
|
72
|
-
|
73
|
-
|
74
|
-
|
72
|
+
// Sort data for fast median
|
73
|
+
std::vector<size_t> indexes(n);
|
74
|
+
std::iota(indexes.begin(), indexes.end(), 0);
|
75
|
+
std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
|
76
|
+
std::sort(data2.begin(), data2.end());
|
75
77
|
|
76
78
|
// Compute test statistic until r=max_outliers values have been removed from the sample
|
77
79
|
for (auto i = 1; i <= max_outliers; i++) {
|
80
|
+
// Check for interrupts
|
81
|
+
interrupt();
|
82
|
+
|
83
|
+
if (verbose) {
|
84
|
+
std::cout << i << " / " << max_outliers << " completed" << std::endl;
|
85
|
+
}
|
86
|
+
|
78
87
|
// TODO Improve performance between loop iterations
|
79
88
|
auto ma = median(data2);
|
80
89
|
std::vector<float> ares;
|
@@ -96,19 +105,18 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
|
|
96
105
|
}
|
97
106
|
|
98
107
|
// Protect against constant time series
|
99
|
-
auto data_sigma = mad(data2);
|
108
|
+
auto data_sigma = mad(data2, ma);
|
100
109
|
if (data_sigma == 0.0) {
|
101
110
|
break;
|
102
111
|
}
|
103
112
|
|
104
113
|
auto iter = std::max_element(ares.begin(), ares.end());
|
105
114
|
auto r_idx_i = std::distance(ares.begin(), iter);
|
106
|
-
auto r_idx_i2 = indexes[r_idx_i];
|
107
115
|
|
108
116
|
// Only need to take sigma of r for performance
|
109
117
|
auto r = ares[r_idx_i] / data_sigma;
|
110
118
|
|
111
|
-
|
119
|
+
r_idx.push_back(indexes[r_idx_i]);
|
112
120
|
data2.erase(data2.begin() + r_idx_i);
|
113
121
|
indexes.erase(indexes.begin() + r_idx_i);
|
114
122
|
|
@@ -124,33 +132,22 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
|
|
124
132
|
auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
|
125
133
|
|
126
134
|
if (r > lam) {
|
127
|
-
|
128
|
-
} else {
|
129
|
-
break;
|
135
|
+
num_anoms = i;
|
130
136
|
}
|
131
137
|
}
|
132
138
|
|
139
|
+
std::vector<size_t> anomalies(r_idx.begin(), r_idx.begin() + num_anoms);
|
140
|
+
|
133
141
|
// Sort like R version
|
134
|
-
std::sort(
|
142
|
+
std::sort(anomalies.begin(), anomalies.end());
|
135
143
|
|
136
|
-
return
|
144
|
+
return anomalies;
|
137
145
|
}
|
138
146
|
|
139
|
-
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha,
|
140
|
-
bool one_tail;
|
141
|
-
bool upper_tail;
|
142
|
-
|
143
|
-
|
144
|
-
upper_tail = true;
|
145
|
-
} else if (direction == "neg") {
|
146
|
-
one_tail = true;
|
147
|
-
upper_tail = false;
|
148
|
-
} else if (direction == "both") {
|
149
|
-
one_tail = false;
|
150
|
-
upper_tail = true; // not used
|
151
|
-
} else {
|
152
|
-
throw std::invalid_argument("Bad direction");
|
153
|
-
}
|
147
|
+
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt) {
|
148
|
+
bool one_tail = direction != Direction::Both;
|
149
|
+
bool upper_tail = direction == Direction::Positive;
|
150
|
+
return detect_anoms(x, period, k, alpha, one_tail, upper_tail, verbose, interrupt);
|
151
|
+
}
|
154
152
|
|
155
|
-
return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
|
156
153
|
}
|
@@ -0,0 +1,12 @@
|
|
1
|
+
#pragma once
|
2
|
+
|
3
|
+
#include <string>
|
4
|
+
#include <vector>
|
5
|
+
|
6
|
+
namespace anomaly_detection {
|
7
|
+
|
8
|
+
enum Direction { Positive, Negative, Both };
|
9
|
+
|
10
|
+
std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
|
11
|
+
|
12
|
+
}
|
@@ -1,8 +1,9 @@
|
|
1
|
-
// rice
|
2
1
|
#include <rice/rice.hpp>
|
3
2
|
#include <rice/stl.hpp>
|
4
3
|
|
5
|
-
|
4
|
+
#include "anomaly_detection.hpp"
|
5
|
+
|
6
|
+
using anomaly_detection::Direction;
|
6
7
|
|
7
8
|
extern "C"
|
8
9
|
void Init_ext() {
|
@@ -11,8 +12,19 @@ void Init_ext() {
|
|
11
12
|
rb_mAnomalyDetection
|
12
13
|
.define_singleton_function(
|
13
14
|
"_detect",
|
14
|
-
[](std::vector<float> x, int period, float k, float alpha, const std::string& direction) {
|
15
|
-
|
15
|
+
[](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
|
16
|
+
Direction dir;
|
17
|
+
if (direction == "pos") {
|
18
|
+
dir = Direction::Positive;
|
19
|
+
} else if (direction == "neg") {
|
20
|
+
dir = Direction::Negative;
|
21
|
+
} else if (direction == "both") {
|
22
|
+
dir = Direction::Both;
|
23
|
+
} else {
|
24
|
+
throw std::invalid_argument("direction must be pos, neg, or both");
|
25
|
+
}
|
26
|
+
|
27
|
+
auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
|
16
28
|
|
17
29
|
auto a = Rice::Array();
|
18
30
|
for (auto v : res) {
|
data/lib/anomaly_detection.rb
CHANGED
@@ -5,7 +5,7 @@ require "anomaly_detection/ext"
|
|
5
5
|
require "anomaly_detection/version"
|
6
6
|
|
7
7
|
module AnomalyDetection
|
8
|
-
def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both")
|
8
|
+
def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", verbose: false)
|
9
9
|
raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
|
10
10
|
|
11
11
|
if series.is_a?(Hash)
|
@@ -15,7 +15,7 @@ module AnomalyDetection
|
|
15
15
|
x = series
|
16
16
|
end
|
17
17
|
|
18
|
-
res = _detect(x, period, max_anoms, alpha, direction)
|
18
|
+
res = _detect(x, period, max_anoms, alpha, direction, verbose)
|
19
19
|
res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
|
20
20
|
res
|
21
21
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anomaly_detection
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2021-10-
|
11
|
+
date: 2021-10-18 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -36,6 +36,7 @@ files:
|
|
36
36
|
- NOTICE.txt
|
37
37
|
- README.md
|
38
38
|
- ext/anomaly_detection/anomaly_detection.cpp
|
39
|
+
- ext/anomaly_detection/anomaly_detection.hpp
|
39
40
|
- ext/anomaly_detection/cdflib.cpp
|
40
41
|
- ext/anomaly_detection/cdflib.hpp
|
41
42
|
- ext/anomaly_detection/ext.cpp
|
@@ -68,5 +69,5 @@ requirements: []
|
|
68
69
|
rubygems_version: 3.2.22
|
69
70
|
signing_key:
|
70
71
|
specification_version: 4
|
71
|
-
summary:
|
72
|
+
summary: Time series anomaly detection for Ruby
|
72
73
|
test_files: []
|