anomaly_detection 0.1.1 → 0.1.4

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e8882d64f5b2fe33406fa583404a3efead153d47a434ac6f5a1ec21d10666389
4
- data.tar.gz: f0544946e1cb9011e32c7f3f0a8ae4fb90f9d1a2c7acca67c5f9e8f27f8780af
3
+ metadata.gz: c60bb6d75cb8523ecd0926f391d79413a1cb2eb131cd579fd381bb6683f82da3
4
+ data.tar.gz: '01594d0f0a97ad8cbb7b0b50cb30894bd0d773d4db45b3158345567ce1732efb'
5
5
  SHA512:
6
- metadata.gz: d30c25bf7a1a7069b7ba7c430e928677f5636261cdb1bc4ff24745103bbb2769ad065a2f368b32440eb1b5fce6483e206667b462542912f44f60a9b61494f360
7
- data.tar.gz: 4a3aba5aeed9e5488c77448d38f331cb3241db6cbf518fc5c4f8d00ca480ce13e53819e65b9a66970e8ca175349db00a4ae6b5b6bc8833f82a44219f840419b9
6
+ metadata.gz: fe09cc140a5d6543f3b00983a754861f6a3a3a436f8a8afecc80d202f1112bb6ea180df794072ee4711c044508a559a015c885cfd61d2c5be9378fc7b6590d96
7
+ data.tar.gz: 5616e6075888b4521355e6c0fb33f7a94361c971c7f58f9ae6a61e5d8529a3e1938deba10be22c24ec5849f1909f48cba07b97fd1e6ea8b47ae4f66626eb703e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 0.1.4 (2022-03-19)
2
+
3
+ - Fixed initial median calculation
4
+
5
+ ## 0.1.3 (2022-01-03)
6
+
7
+ - Switched to dist.h
8
+
9
+ ## 0.1.2 (2021-10-20)
10
+
11
+ - Added `plot` method
12
+
1
13
  ## 0.1.1 (2021-10-17)
2
14
 
3
15
  - Added `verbose` option
data/README.md CHANGED
@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
11
11
  Add this line to your application’s Gemfile:
12
12
 
13
13
  ```ruby
14
- gem 'anomaly_detection'
14
+ gem "anomaly_detection"
15
15
  ```
16
16
 
17
17
  ## Getting Started
@@ -58,9 +58,23 @@ AnomalyDetection.detect(
58
58
  )
59
59
  ```
60
60
 
61
+ ## Plotting
62
+
63
+ Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
64
+
65
+ ```ruby
66
+ gem "vega"
67
+ ```
68
+
69
+ And use:
70
+
71
+ ```ruby
72
+ AnomalyDetection.plot(series, anomalies)
73
+ ```
74
+
61
75
  ## Credits
62
76
 
63
- This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [cdflib](https://people.sc.fsu.edu/~jburkardt/cpp_src/cdflib/cdflib.html) for the quantile function.
77
+ This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [stl-cpp](https://github.com/ankane/stl-cpp) for seasonal-trend decomposition and [dist.h](https://github.com/ankane/dist.h) for the quantile function.
64
78
 
65
79
  ## References
66
80
 
@@ -1,12 +1,203 @@
1
+ /*!
2
+ * AnomalyDetection.cpp v0.1.0
3
+ * https://github.com/ankane/AnomalyDetection.cpp
4
+ * GPL-3.0-or-later License
5
+ */
6
+
1
7
  #pragma once
2
8
 
3
- #include <string>
9
#include <algorithm>
#include <cmath>
#include <functional>
#include <iostream>
#include <iterator>
#include <numeric>
#include <stdexcept>
#include <vector>

#include "dist.h"
#include "stl.hpp"
17
+
6
18
  namespace anomaly_detection {
7
19
 
8
20
// Which side(s) of the median an observation must fall on to be
// considered as an anomaly candidate.
enum Direction {
    Positive,  // one-tailed test on values above the median
    Negative,  // one-tailed test on values below the median
    Both       // two-tailed test on the absolute deviation
};
9
21
 
10
- std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
22
// Returns the median of a sample that is already sorted in ascending order.
// For an even number of elements the two middle values are averaged.
//
// Throws std::invalid_argument when `sorted` is empty, because there is no
// middle element to read (indexing would otherwise run out of bounds).
//
// Declared inline so this header can be included from several translation
// units without multiple-definition link errors.
inline float median_sorted(const std::vector<float>& sorted) {
    if (sorted.empty()) {
        throw std::invalid_argument("cannot take the median of an empty series");
    }

    const auto count = sorted.size();
    return (sorted[(count - 1) / 2] + sorted[count / 2]) / 2.0;
}
25
+
26
+ float median(const std::vector<float>& data) {
27
+ std::vector<float> sorted(data);
28
+ std::sort(sorted.begin(), sorted.end());
29
+ return median_sorted(sorted);
30
+ }
31
+
32
+ float mad(const std::vector<float>& data, float med) {
33
+ std::vector<float> res;
34
+ res.reserve(data.size());
35
+ for (auto v : data) {
36
+ res.push_back(fabs(v - med));
37
+ }
38
+ std::sort(res.begin(), res.end());
39
+ return 1.4826 * median_sorted(res);
40
+ }
41
+
42
+ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
43
+ auto n = data.size();
44
+
45
+ // Check to make sure we have at least two periods worth of data for anomaly context
46
+ if (n < num_obs_per_period * 2) {
47
+ throw std::invalid_argument("series must contain at least 2 periods");
48
+ }
49
+
50
+ // Handle NANs
51
+ auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
52
+ if (nan > 0) {
53
+ throw std::invalid_argument("series contains NANs");
54
+ }
55
+
56
+ // Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
57
+ auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
58
+ auto seasonal = data_decomp.seasonal;
59
+
60
+ std::vector<float> data2;
61
+ data2.reserve(n);
62
+ auto med = median(data);
63
+ for (auto i = 0; i < n; i++) {
64
+ data2.push_back(data[i] - seasonal[i] - med);
65
+ }
66
+
67
+ auto num_anoms = 0;
68
+ auto max_outliers = (size_t) n * k;
69
+ std::vector<size_t> anomalies;
70
+ anomalies.reserve(max_outliers);
71
+
72
+ // Sort data for fast median
73
+ // Use stable sort for indexes for deterministic results
74
+ std::vector<size_t> indexes(n);
75
+ std::iota(indexes.begin(), indexes.end(), 0);
76
+ std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
77
+ std::sort(data2.begin(), data2.end());
78
+
79
+ // Compute test statistic until r=max_outliers values have been removed from the sample
80
+ for (auto i = 1; i <= max_outliers; i++) {
81
+ if (verbose) {
82
+ std::cout << i << " / " << max_outliers << " completed" << std::endl;
83
+ }
84
+
85
+ // TODO Improve performance between loop iterations
86
+ auto ma = median_sorted(data2);
87
+ std::vector<float> ares;
88
+ ares.reserve(data2.size());
89
+ if (one_tail) {
90
+ if (upper_tail) {
91
+ for (auto v : data2) {
92
+ ares.push_back(v - ma);
93
+ }
94
+ } else {
95
+ for (auto v : data2) {
96
+ ares.push_back(ma - v);
97
+ }
98
+ }
99
+ } else {
100
+ for (auto v : data2) {
101
+ ares.push_back(fabs(v - ma));
102
+ }
103
+ }
104
+
105
+ // Protect against constant time series
106
+ auto data_sigma = mad(data2, ma);
107
+ if (data_sigma == 0.0) {
108
+ break;
109
+ }
110
+
111
+ auto iter = std::max_element(ares.begin(), ares.end());
112
+ auto r_idx_i = std::distance(ares.begin(), iter);
113
+
114
+ // Only need to take sigma of r for performance
115
+ auto r = ares[r_idx_i] / data_sigma;
116
+
117
+ anomalies.push_back(indexes[r_idx_i]);
118
+ data2.erase(data2.begin() + r_idx_i);
119
+ indexes.erase(indexes.begin() + r_idx_i);
120
+
121
+ // Compute critical value
122
+ float p;
123
+ if (one_tail) {
124
+ p = 1.0 - alpha / (n - i + 1);
125
+ } else {
126
+ p = 1.0 - alpha / (2.0 * (n - i + 1));
127
+ }
128
+
129
+ auto t = students_t_ppf(p, n - i - 1);
130
+ auto lam = t * (n - i) / sqrt(((n - i - 1) + t * t) * (n - i + 1));
131
+
132
+ if (r > lam) {
133
+ num_anoms = i;
134
+ }
135
+
136
+ if (callback != nullptr) {
137
+ callback();
138
+ }
139
+ }
140
+
141
+ anomalies.resize(num_anoms);
142
+
143
+ // Sort like R version
144
+ std::sort(anomalies.begin(), anomalies.end());
145
+
146
+ return anomalies;
147
+ }
148
+
149
// Value returned by AnomalyDetectionParams::fit.
class AnomalyDetectionResult {
public:
    // Positions in the input series that were flagged as anomalies.
    std::vector<size_t> anomalies;
};
153
+
154
+ class AnomalyDetectionParams {
155
+ float alpha_ = 0.05;
156
+ float max_anoms_ = 0.1;
157
+ Direction direction_ = Direction::Both;
158
+ bool verbose_ = false;
159
+ std::function<void()> callback_ = nullptr;
160
+
161
+ public:
162
+ inline AnomalyDetectionParams alpha(float alpha) {
163
+ this->alpha_ = alpha;
164
+ return *this;
165
+ };
166
+
167
+ inline AnomalyDetectionParams max_anoms(float max_anoms) {
168
+ this->max_anoms_ = max_anoms;
169
+ return *this;
170
+ };
171
+
172
+ inline AnomalyDetectionParams direction(Direction direction) {
173
+ this->direction_ = direction;
174
+ return *this;
175
+ };
176
+
177
+ inline AnomalyDetectionParams verbose(bool verbose) {
178
+ this->verbose_ = verbose;
179
+ return *this;
180
+ };
181
+
182
+ inline AnomalyDetectionParams callback(std::function<void()> callback) {
183
+ this->callback_ = callback;
184
+ return *this;
185
+ };
186
+
187
+ AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
188
+ };
189
+
190
+ AnomalyDetectionParams params() {
191
+ return AnomalyDetectionParams();
192
+ }
193
+
194
+ AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
195
+ bool one_tail = this->direction_ != Direction::Both;
196
+ bool upper_tail = this->direction_ == Direction::Positive;
197
+
198
+ auto res = AnomalyDetectionResult();
199
+ res.anomalies = detect_anoms(series, period, this->max_anoms_, this->alpha_, one_tail, upper_tail, this->verbose_, this->callback_);
200
+ return res;
201
+ }
11
202
 
12
203
  }
@@ -0,0 +1,190 @@
1
+ /*!
2
+ * dist.h v0.1.1
3
+ * https://github.com/ankane/dist.h
4
+ * Unlicense OR MIT License
5
+ */
6
+
7
+ #pragma once
8
+
9
+ #include <assert.h>
10
+ #include <math.h>
11
+
12
+ #ifdef M_E
13
+ #define DIST_E M_E
14
+ #else
15
+ #define DIST_E 2.71828182845904523536
16
+ #endif
17
+
18
+ #ifdef M_PI
19
+ #define DIST_PI M_PI
20
+ #else
21
+ #define DIST_PI 3.14159265358979323846
22
+ #endif
23
+
24
+ // Winitzki, S. (2008).
25
+ // A handy approximation for the error function and its inverse.
26
+ // https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
27
+ // from https://sites.google.com/site/winitzki
28
// Closed-form approximation of the error function.
// The magnitude is computed for |x| and the sign of x is restored at the
// end, so the result is odd in x.
double erf(double x) {
    const double a = 0.14;
    const double magnitude = x < 0 ? -x : x;
    const double squared = magnitude * magnitude;

    const double value = sqrt(1.0 - exp(-squared * (4.0 / DIST_PI + a * squared) / (1.0 + a * squared)));
    if (x < 0) {
        return -value;
    }
    return value;
}
36
+
37
+ // Winitzki, S. (2008).
38
+ // A handy approximation for the error function and its inverse.
39
+ // https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
40
+ // from https://sites.google.com/site/winitzki
41
// Closed-form approximation of the inverse error function.
// As with erf(), the work is done on |x| and the sign is restored last.
double inverse_erf(double x) {
    const double a = 0.147;
    const double magnitude = x < 0 ? -x : x;

    const double log_term = log(1.0 - magnitude * magnitude);
    const double first = 2.0 / (DIST_PI * a);
    const double half_log = log_term / 2.0;
    const double sum = first + half_log;
    const double scaled_log = 1.0 / a * log_term;

    const double value = sqrt(-first - half_log + sqrt(sum * sum - scaled_log));
    if (x < 0) {
        return -value;
    }
    return value;
}
53
+
54
// Probability density of the normal distribution with the given mean and
// standard deviation, evaluated at x.
//
// The density is exp(-z^2 / 2) / (std_dev * sqrt(2 * pi)) with
// z = (x - mean) / std_dev. Both the normalisation and the standardised
// distance use the standard deviation, not the variance.
double normal_pdf(double x, double mean, double std_dev) {
    double z = (x - mean) / std_dev;
    return exp(-0.5 * z * z) / (std_dev * sqrt(2.0 * DIST_PI));
}
58
+
59
// Cumulative distribution function of the normal distribution with the
// given mean and standard deviation, evaluated at x.
//
// Uses 0.5 * (1 + erf(z / sqrt(2))) with z = (x - mean) / std_dev; the
// argument is scaled by the standard deviation, not the variance.
double normal_cdf(double x, double mean, double std_dev) {
    return 0.5 * (1.0 + erf((x - mean) / (std_dev * sqrt(2.0))));
}
62
+
63
+ double normal_ppf(double p, double mean, double std_dev) {
64
+ assert(p >= 0 && p <= 1);
65
+
66
+ return mean + (std_dev * std_dev) * sqrt(2) * inverse_erf(2.0 * p - 1.0);
67
+ }
68
+
69
// Probability density of Student's t distribution with n degrees of
// freedom, evaluated at x. n must be at least 1 (checked with assert).
//
// The ratio gamma((n + 1) / 2) / gamma(n / 2) is formed from log-gammas:
// evaluating the two gamma functions directly overflows to infinity once
// n reaches a few hundred, which turned the density into inf or NaN.
double students_t_pdf(double x, unsigned int n) {
    assert(n >= 1);

    double log_gamma_ratio = lgamma((n + 1.0) / 2.0) - lgamma(n / 2.0);
    return exp(log_gamma_ratio) / sqrt(n * DIST_PI) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
}
74
+
75
+ // Hill, G. W. (1970).
76
+ // Algorithm 395: Student's t-distribution.
77
+ // Communications of the ACM, 13(10), 617-619.
78
+ double students_t_cdf(double x, unsigned int n) {
79
+ assert(n >= 1);
80
+
81
+ double start = x < 0 ? 0 : 1;
82
+ double sign = x < 0 ? 1 : -1;
83
+
84
+ double z = 1.0;
85
+ double t = x * x;
86
+ double y = t / n;
87
+ double b = 1.0 + y;
88
+
89
+ if ((n >= 20 && t < n) || n > 200) {
90
+ // asymptotic series for large or noninteger n
91
+ if (y > 10e-6) {
92
+ y = log(b);
93
+ }
94
+ double a = n - 0.5;
95
+ b = 48.0 * a * a;
96
+ y = a * y;
97
+ y = (((((-0.4 * y - 3.3) * y - 24.0) * y - 85.5) / (0.8 * y * y + 100.0 + b) + y + 3.0) / b + 1.0) * sqrt(y);
98
+ return start + sign * normal_cdf(-y, 0.0, 1.0);
99
+ }
100
+
101
+ if (n < 20 && t < 4.0) {
102
+ // nested summation of cosine series
103
+ y = sqrt(y);
104
+ double a = y;
105
+ if (n == 1) {
106
+ a = 0.0;
107
+ }
108
+
109
+ // loop
110
+ if (n > 1) {
111
+ n -= 2;
112
+ while (n > 1) {
113
+ a = (n - 1) / (b * n) * a + y;
114
+ n -= 2;
115
+ }
116
+ }
117
+ a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
118
+ return start + sign * (z - a) / 2;
119
+ }
120
+
121
+ // tail series expanation for large t-values
122
+ double a = sqrt(b);
123
+ y = a * n;
124
+ int j = 0;
125
+ while (a != z) {
126
+ j += 2;
127
+ z = a;
128
+ y = y * (j - 1) / (b * j);
129
+ a = a + y / (n + j);
130
+ }
131
+ z = 0.0;
132
+ y = 0.0;
133
+ a = -a;
134
+
135
+ // loop (without n + 2 and n - 2)
136
+ while (n > 1) {
137
+ a = (n - 1) / (b * n) * a + y;
138
+ n -= 2;
139
+ }
140
+ a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
141
+ return start + sign * (z - a) / 2;
142
+ }
143
+
144
+ // Hill, G. W. (1970).
145
+ // Algorithm 396: Student's t-quantiles.
146
+ // Communications of the ACM, 13(10), 619-620.
147
+ double students_t_ppf(double p, unsigned int n) {
148
+ assert(p >= 0 && p <= 1);
149
+ assert(n >= 1);
150
+
151
+ // distribution is symmetric
152
+ double sign = p < 0.5 ? -1 : 1;
153
+ p = p < 0.5 ? 1 - p : p;
154
+
155
+ // two-tail to one-tail
156
+ p = 2.0 * (1.0 - p);
157
+
158
+ if (n == 2) {
159
+ return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
160
+ }
161
+
162
+ double half_pi = DIST_PI / 2.0;
163
+
164
+ if (n == 1) {
165
+ p = p * half_pi;
166
+ return sign * cos(p) / sin(p);
167
+ }
168
+
169
+ double a = 1.0 / (n - 0.5);
170
+ double b = 48.0 / (a * a);
171
+ double c = ((20700.0 * a / b - 98.0) * a - 16.0) * a + 96.36;
172
+ double d = ((94.5 / (b + c) - 3.0) / b + 1.0) * sqrt(a * half_pi) * n;
173
+ double x = d * p;
174
+ double y = pow(x, 2.0 / n);
175
+ if (y > 0.05 + a) {
176
+ // asymptotic inverse expansion about normal
177
+ x = normal_ppf(p * 0.5, 0.0, 1.0);
178
+ y = x * x;
179
+ if (n < 5) {
180
+ c += 0.3 * (n - 4.5) * (x + 0.6);
181
+ }
182
+ c = (((0.05 * d * x - 5.0) * x - 7.0) * x - 2.0) * x + b + c;
183
+ y = (((((0.4 * y + 6.3) * y + 36.0) * y + 94.5) / c - y - 3.0) / b + 1.0) * x;
184
+ y = a * y * y;
185
+ y = y > 0.002 ? exp(y) - 1.0 : 0.5 * y * y + y;
186
+ } else {
187
+ y = ((1.0 / (((n + 6.0) / (n * y) - 0.089 * d - 0.822) * (n + 2.0) * 3.0) + 0.5 / (n + 4.0)) * y - 1.0) * (n + 1.0) / (n + 2.0) + 1.0 / y;
188
+ }
189
+ return sign * sqrt(n * y);
190
+ }
@@ -12,7 +12,7 @@ void Init_ext() {
12
12
  rb_mAnomalyDetection
13
13
  .define_singleton_function(
14
14
  "_detect",
15
- [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
15
+ [](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
16
16
  Direction dir;
17
17
  if (direction == "pos") {
18
18
  dir = Direction::Positive;
@@ -24,10 +24,16 @@ void Init_ext() {
24
24
  throw std::invalid_argument("direction must be pos, neg, or both");
25
25
  }
26
26
 
27
- auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
27
+ auto res = anomaly_detection::params()
28
+ .max_anoms(k)
29
+ .alpha(alpha)
30
+ .direction(dir)
31
+ .verbose(verbose)
32
+ .callback(rb_thread_check_ints)
33
+ .fit(series, period);
28
34
 
29
35
  auto a = Rice::Array();
30
- for (auto v : res) {
36
+ for (auto v : res.anomalies) {
31
37
  a.push(v);
32
38
  }
33
39
  return a;
@@ -1,3 +1,3 @@
1
1
  module AnomalyDetection
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -5,18 +5,72 @@ require "anomaly_detection/ext"
5
5
  require "anomaly_detection/version"
6
6
 
7
7
  module AnomalyDetection
8
- def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", verbose: false)
9
- raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
10
-
11
- if series.is_a?(Hash)
12
- sorted = series.sort_by { |k, _| k }
13
- x = sorted.map(&:last)
14
- else
15
- x = series
8
+ class << self
9
# Detects anomalies in +series+.
#
# +series+ is either an array of values or a hash of key => value. A hash
# is sorted by key before detection and the anomalies are returned as keys;
# for an array they are returned as indexes.
#
# Raises ArgumentError when the series holds fewer than two periods.
#
# NOTE(review): the +plot+ keyword is accepted but never read in this
# method - confirm whether it is intentional.
def detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", plot: false, verbose: false)
  raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2

  keyed = series.is_a?(Hash)
  if keyed
    pairs = series.sort_by { |key, _| key }
    values = pairs.map(&:last)
  else
    values = series
  end

  found = _detect(values, period, max_anoms, alpha, direction, verbose)
  # translate positions in the sorted values back to the hash keys
  found.map! { |position| pairs[position][0] } if keyed
  found
end
17
23
 
18
- res = _detect(x, period, max_anoms, alpha, direction, verbose)
19
- res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
20
- res
24
+ # TODO add tooltips
25
# Builds a Vega-Lite chart of +series+ with +anomalies+ highlighted.
#
# The series is drawn as a line and every point listed in +anomalies+
# (hash keys, or indexes for an array) is drawn as a large dot on top.
# The vega gem is loaded on first use.
def plot(series, anomalies)
  require "vega"

  keyed = series.is_a?(Hash)

  points =
    if keyed
      series.map { |key, value| {x: iso8601(key), y: value, anomaly: anomalies.include?(key)} }
    else
      series.each_with_index.map { |value, index| {x: index, y: value, anomaly: anomalies.include?(index)} }
    end

  # hash keys are plotted on a time axis, array indexes on a numeric one
  if keyed
    x_encoding = {field: "x", type: "temporal"}
    x_encoding["scale"] = {type: "utc"} if series.keys.first.is_a?(Date)
  else
    x_encoding = {field: "x", type: "quantitative"}
  end

  line_layer = {
    mark: {type: "line"},
    encoding: {
      x: x_encoding,
      y: {field: "y", type: "quantitative", scale: {zero: false}},
      color: {value: "#fa9088"}
    }
  }

  anomaly_layer = {
    transform: [{filter: "datum.anomaly == true"}],
    mark: {type: "point", size: 200},
    encoding: {
      x: x_encoding,
      y: {field: "y", type: "quantitative"},
      color: {value: "#19c7ca"}
    }
  }

  Vega.lite
    .data(points)
    .layer([line_layer, anomaly_layer])
    .config(axis: {title: nil, labelFontSize: 12})
end
65
+
66
+ private
67
+
68
# Formats a key for the chart: dates as a plain day, anything else as a
# timestamp with milliseconds and UTC offset.
def iso8601(v)
  pattern = v.is_a?(Date) ? "%Y-%m-%d" : "%Y-%m-%dT%H:%M:%S.%L%z"
  v.strftime(pattern)
end
21
75
  end
22
76
  end