anomaly_detection 0.1.1 → 0.1.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e8882d64f5b2fe33406fa583404a3efead153d47a434ac6f5a1ec21d10666389
4
- data.tar.gz: f0544946e1cb9011e32c7f3f0a8ae4fb90f9d1a2c7acca67c5f9e8f27f8780af
3
+ metadata.gz: c60bb6d75cb8523ecd0926f391d79413a1cb2eb131cd579fd381bb6683f82da3
4
+ data.tar.gz: '01594d0f0a97ad8cbb7b0b50cb30894bd0d773d4db45b3158345567ce1732efb'
5
5
  SHA512:
6
- metadata.gz: d30c25bf7a1a7069b7ba7c430e928677f5636261cdb1bc4ff24745103bbb2769ad065a2f368b32440eb1b5fce6483e206667b462542912f44f60a9b61494f360
7
- data.tar.gz: 4a3aba5aeed9e5488c77448d38f331cb3241db6cbf518fc5c4f8d00ca480ce13e53819e65b9a66970e8ca175349db00a4ae6b5b6bc8833f82a44219f840419b9
6
+ metadata.gz: fe09cc140a5d6543f3b00983a754861f6a3a3a436f8a8afecc80d202f1112bb6ea180df794072ee4711c044508a559a015c885cfd61d2c5be9378fc7b6590d96
7
+ data.tar.gz: 5616e6075888b4521355e6c0fb33f7a94361c971c7f58f9ae6a61e5d8529a3e1938deba10be22c24ec5849f1909f48cba07b97fd1e6ea8b47ae4f66626eb703e
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 0.1.4 (2022-03-19)
2
+
3
+ - Fixed initial median calculation
4
+
5
+ ## 0.1.3 (2022-01-03)
6
+
7
+ - Switched to dist.h
8
+
9
+ ## 0.1.2 (2021-10-20)
10
+
11
+ - Added `plot` method
12
+
1
13
  ## 0.1.1 (2021-10-17)
2
14
 
3
15
  - Added `verbose` option
data/README.md CHANGED
@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
11
11
  Add this line to your application’s Gemfile:
12
12
 
13
13
  ```ruby
14
- gem 'anomaly_detection'
14
+ gem "anomaly_detection"
15
15
  ```
16
16
 
17
17
  ## Getting Started
@@ -58,9 +58,23 @@ AnomalyDetection.detect(
58
58
  )
59
59
  ```
60
60
 
61
+ ## Plotting
62
+
63
+ Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
64
+
65
+ ```ruby
66
+ gem "vega"
67
+ ```
68
+
69
+ And use:
70
+
71
+ ```ruby
72
+ AnomalyDetection.plot(series, anomalies)
73
+ ```
74
+
61
75
  ## Credits
62
76
 
63
- This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [cdflib](https://people.sc.fsu.edu/~jburkardt/cpp_src/cdflib/cdflib.html) for the quantile function.
77
+ This library was ported from the [AnomalyDetection](https://github.com/twitter/AnomalyDetection) R package and is available under the same license. It uses [stl-cpp](https://github.com/ankane/stl-cpp) for seasonal-trend decomposition and [dist.h](https://github.com/ankane/dist.h) for the quantile function.
64
78
 
65
79
  ## References
66
80
 
@@ -1,12 +1,203 @@
1
+ /*!
2
+ * AnomalyDetection.cpp v0.1.0
3
+ * https://github.com/ankane/AnomalyDetection.cpp
4
+ * GPL-3.0-or-later License
5
+ */
6
+
1
7
  #pragma once
2
8
 
3
- #include <string>
9
+ #include <functional>
10
+ #include <iostream>
11
+ #include <iterator>
12
+ #include <numeric>
4
13
  #include <vector>
5
14
 
15
+ #include "dist.h"
16
+ #include "stl.hpp"
17
+
6
18
  namespace anomaly_detection {
7
19
 
8
20
  enum Direction { Positive, Negative, Both };
9
21
 
10
- std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
22
+ float median_sorted(const std::vector<float>& sorted) {
23
+ return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
24
+ }
25
+
26
+ float median(const std::vector<float>& data) {
27
+ std::vector<float> sorted(data);
28
+ std::sort(sorted.begin(), sorted.end());
29
+ return median_sorted(sorted);
30
+ }
31
+
32
+ float mad(const std::vector<float>& data, float med) {
33
+ std::vector<float> res;
34
+ res.reserve(data.size());
35
+ for (auto v : data) {
36
+ res.push_back(fabs(v - med));
37
+ }
38
+ std::sort(res.begin(), res.end());
39
+ return 1.4826 * median_sorted(res);
40
+ }
41
+
42
+ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
43
+ auto n = data.size();
44
+
45
+ // Check to make sure we have at least two periods worth of data for anomaly context
46
+ if (n < num_obs_per_period * 2) {
47
+ throw std::invalid_argument("series must contain at least 2 periods");
48
+ }
49
+
50
+ // Handle NANs
51
+ auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
52
+ if (nan > 0) {
53
+ throw std::invalid_argument("series contains NANs");
54
+ }
55
+
56
+ // Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
57
+ auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
58
+ auto seasonal = data_decomp.seasonal;
59
+
60
+ std::vector<float> data2;
61
+ data2.reserve(n);
62
+ auto med = median(data);
63
+ for (auto i = 0; i < n; i++) {
64
+ data2.push_back(data[i] - seasonal[i] - med);
65
+ }
66
+
67
+ auto num_anoms = 0;
68
+ auto max_outliers = (size_t) n * k;
69
+ std::vector<size_t> anomalies;
70
+ anomalies.reserve(max_outliers);
71
+
72
+ // Sort data for fast median
73
+ // Use stable sort for indexes for deterministic results
74
+ std::vector<size_t> indexes(n);
75
+ std::iota(indexes.begin(), indexes.end(), 0);
76
+ std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
77
+ std::sort(data2.begin(), data2.end());
78
+
79
+ // Compute test statistic until r=max_outliers values have been removed from the sample
80
+ for (auto i = 1; i <= max_outliers; i++) {
81
+ if (verbose) {
82
+ std::cout << i << " / " << max_outliers << " completed" << std::endl;
83
+ }
84
+
85
+ // TODO Improve performance between loop iterations
86
+ auto ma = median_sorted(data2);
87
+ std::vector<float> ares;
88
+ ares.reserve(data2.size());
89
+ if (one_tail) {
90
+ if (upper_tail) {
91
+ for (auto v : data2) {
92
+ ares.push_back(v - ma);
93
+ }
94
+ } else {
95
+ for (auto v : data2) {
96
+ ares.push_back(ma - v);
97
+ }
98
+ }
99
+ } else {
100
+ for (auto v : data2) {
101
+ ares.push_back(fabs(v - ma));
102
+ }
103
+ }
104
+
105
+ // Protect against constant time series
106
+ auto data_sigma = mad(data2, ma);
107
+ if (data_sigma == 0.0) {
108
+ break;
109
+ }
110
+
111
+ auto iter = std::max_element(ares.begin(), ares.end());
112
+ auto r_idx_i = std::distance(ares.begin(), iter);
113
+
114
+ // Only need to take sigma of r for performance
115
+ auto r = ares[r_idx_i] / data_sigma;
116
+
117
+ anomalies.push_back(indexes[r_idx_i]);
118
+ data2.erase(data2.begin() + r_idx_i);
119
+ indexes.erase(indexes.begin() + r_idx_i);
120
+
121
+ // Compute critical value
122
+ float p;
123
+ if (one_tail) {
124
+ p = 1.0 - alpha / (n - i + 1);
125
+ } else {
126
+ p = 1.0 - alpha / (2.0 * (n - i + 1));
127
+ }
128
+
129
+ auto t = students_t_ppf(p, n - i - 1);
130
+ auto lam = t * (n - i) / sqrt(((n - i - 1) + t * t) * (n - i + 1));
131
+
132
+ if (r > lam) {
133
+ num_anoms = i;
134
+ }
135
+
136
+ if (callback != nullptr) {
137
+ callback();
138
+ }
139
+ }
140
+
141
+ anomalies.resize(num_anoms);
142
+
143
+ // Sort like R version
144
+ std::sort(anomalies.begin(), anomalies.end());
145
+
146
+ return anomalies;
147
+ }
148
+
149
+ class AnomalyDetectionResult {
150
+ public:
151
+ std::vector<size_t> anomalies;
152
+ };
153
+
154
+ class AnomalyDetectionParams {
155
+ float alpha_ = 0.05;
156
+ float max_anoms_ = 0.1;
157
+ Direction direction_ = Direction::Both;
158
+ bool verbose_ = false;
159
+ std::function<void()> callback_ = nullptr;
160
+
161
+ public:
162
+ inline AnomalyDetectionParams alpha(float alpha) {
163
+ this->alpha_ = alpha;
164
+ return *this;
165
+ };
166
+
167
+ inline AnomalyDetectionParams max_anoms(float max_anoms) {
168
+ this->max_anoms_ = max_anoms;
169
+ return *this;
170
+ };
171
+
172
+ inline AnomalyDetectionParams direction(Direction direction) {
173
+ this->direction_ = direction;
174
+ return *this;
175
+ };
176
+
177
+ inline AnomalyDetectionParams verbose(bool verbose) {
178
+ this->verbose_ = verbose;
179
+ return *this;
180
+ };
181
+
182
+ inline AnomalyDetectionParams callback(std::function<void()> callback) {
183
+ this->callback_ = callback;
184
+ return *this;
185
+ };
186
+
187
+ AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
188
+ };
189
+
190
+ AnomalyDetectionParams params() {
191
+ return AnomalyDetectionParams();
192
+ }
193
+
194
+ AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
195
+ bool one_tail = this->direction_ != Direction::Both;
196
+ bool upper_tail = this->direction_ == Direction::Positive;
197
+
198
+ auto res = AnomalyDetectionResult();
199
+ res.anomalies = detect_anoms(series, period, this->max_anoms_, this->alpha_, one_tail, upper_tail, this->verbose_, this->callback_);
200
+ return res;
201
+ }
11
202
 
12
203
  }
@@ -0,0 +1,190 @@
1
+ /*!
2
+ * dist.h v0.1.1
3
+ * https://github.com/ankane/dist.h
4
+ * Unlicense OR MIT License
5
+ */
6
+
7
+ #pragma once
8
+
9
+ #include <assert.h>
10
+ #include <math.h>
11
+
12
+ #ifdef M_E
13
+ #define DIST_E M_E
14
+ #else
15
+ #define DIST_E 2.71828182845904523536
16
+ #endif
17
+
18
+ #ifdef M_PI
19
+ #define DIST_PI M_PI
20
+ #else
21
+ #define DIST_PI 3.14159265358979323846
22
+ #endif
23
+
24
+ // Winitzki, S. (2008).
25
+ // A handy approximation for the error function and its inverse.
26
+ // https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
27
+ // from https://sites.google.com/site/winitzki
28
+ double erf(double x) {
29
+ double sign = x < 0 ? -1.0 : 1.0;
30
+ x = x < 0 ? -x : x;
31
+
32
+ double a = 0.14;
33
+ double x2 = x * x;
34
+ return sign * sqrt(1.0 - exp(-x2 * (4.0 / DIST_PI + a * x2) / (1.0 + a * x2)));
35
+ }
36
+
37
+ // Winitzki, S. (2008).
38
+ // A handy approximation for the error function and its inverse.
39
+ // https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
40
+ // from https://sites.google.com/site/winitzki
41
+ double inverse_erf(double x) {
42
+ double sign = x < 0 ? -1.0 : 1.0;
43
+ x = x < 0 ? -x : x;
44
+
45
+ double a = 0.147;
46
+ double ln = log(1.0 - x * x);
47
+ double f1 = 2.0 / (DIST_PI * a);
48
+ double f2 = ln / 2.0;
49
+ double f3 = f1 + f2;
50
+ double f4 = 1.0 / a * ln;
51
+ return sign * sqrt(-f1 - f2 + sqrt(f3 * f3 - f4));
52
+ }
53
+
54
+ double normal_pdf(double x, double mean, double std_dev) {
55
+ double var = std_dev * std_dev;
56
+ return (1.0 / (var * sqrt(2.0 * DIST_PI))) * pow(DIST_E, -0.5 * pow((x - mean) / var, 2));
57
+ }
58
+
59
+ double normal_cdf(double x, double mean, double std_dev) {
60
+ return 0.5 * (1.0 + erf((x - mean) / (std_dev * std_dev * sqrt(2))));
61
+ }
62
+
63
+ double normal_ppf(double p, double mean, double std_dev) {
64
+ assert(p >= 0 && p <= 1);
65
+
66
+ return mean + (std_dev * std_dev) * sqrt(2) * inverse_erf(2.0 * p - 1.0);
67
+ }
68
+
69
+ double students_t_pdf(double x, unsigned int n) {
70
+ assert(n >= 1);
71
+
72
+ return tgamma((n + 1.0) / 2.0) / (sqrt(n * DIST_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
73
+ }
74
+
75
+ // Hill, G. W. (1970).
76
+ // Algorithm 395: Student's t-distribution.
77
+ // Communications of the ACM, 13(10), 617-619.
78
+ double students_t_cdf(double x, unsigned int n) {
79
+ assert(n >= 1);
80
+
81
+ double start = x < 0 ? 0 : 1;
82
+ double sign = x < 0 ? 1 : -1;
83
+
84
+ double z = 1.0;
85
+ double t = x * x;
86
+ double y = t / n;
87
+ double b = 1.0 + y;
88
+
89
+ if ((n >= 20 && t < n) || n > 200) {
90
+ // asymptotic series for large or noninteger n
91
+ if (y > 10e-6) {
92
+ y = log(b);
93
+ }
94
+ double a = n - 0.5;
95
+ b = 48.0 * a * a;
96
+ y = a * y;
97
+ y = (((((-0.4 * y - 3.3) * y - 24.0) * y - 85.5) / (0.8 * y * y + 100.0 + b) + y + 3.0) / b + 1.0) * sqrt(y);
98
+ return start + sign * normal_cdf(-y, 0.0, 1.0);
99
+ }
100
+
101
+ if (n < 20 && t < 4.0) {
102
+ // nested summation of cosine series
103
+ y = sqrt(y);
104
+ double a = y;
105
+ if (n == 1) {
106
+ a = 0.0;
107
+ }
108
+
109
+ // loop
110
+ if (n > 1) {
111
+ n -= 2;
112
+ while (n > 1) {
113
+ a = (n - 1) / (b * n) * a + y;
114
+ n -= 2;
115
+ }
116
+ }
117
+ a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
118
+ return start + sign * (z - a) / 2;
119
+ }
120
+
121
+ // tail series expansion for large t-values
122
+ double a = sqrt(b);
123
+ y = a * n;
124
+ int j = 0;
125
+ while (a != z) {
126
+ j += 2;
127
+ z = a;
128
+ y = y * (j - 1) / (b * j);
129
+ a = a + y / (n + j);
130
+ }
131
+ z = 0.0;
132
+ y = 0.0;
133
+ a = -a;
134
+
135
+ // loop (without n + 2 and n - 2)
136
+ while (n > 1) {
137
+ a = (n - 1) / (b * n) * a + y;
138
+ n -= 2;
139
+ }
140
+ a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
141
+ return start + sign * (z - a) / 2;
142
+ }
143
+
144
+ // Hill, G. W. (1970).
145
+ // Algorithm 396: Student's t-quantiles.
146
+ // Communications of the ACM, 13(10), 619-620.
147
+ double students_t_ppf(double p, unsigned int n) {
148
+ assert(p >= 0 && p <= 1);
149
+ assert(n >= 1);
150
+
151
+ // distribution is symmetric
152
+ double sign = p < 0.5 ? -1 : 1;
153
+ p = p < 0.5 ? 1 - p : p;
154
+
155
+ // two-tail to one-tail
156
+ p = 2.0 * (1.0 - p);
157
+
158
+ if (n == 2) {
159
+ return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
160
+ }
161
+
162
+ double half_pi = DIST_PI / 2.0;
163
+
164
+ if (n == 1) {
165
+ p = p * half_pi;
166
+ return sign * cos(p) / sin(p);
167
+ }
168
+
169
+ double a = 1.0 / (n - 0.5);
170
+ double b = 48.0 / (a * a);
171
+ double c = ((20700.0 * a / b - 98.0) * a - 16.0) * a + 96.36;
172
+ double d = ((94.5 / (b + c) - 3.0) / b + 1.0) * sqrt(a * half_pi) * n;
173
+ double x = d * p;
174
+ double y = pow(x, 2.0 / n);
175
+ if (y > 0.05 + a) {
176
+ // asymptotic inverse expansion about normal
177
+ x = normal_ppf(p * 0.5, 0.0, 1.0);
178
+ y = x * x;
179
+ if (n < 5) {
180
+ c += 0.3 * (n - 4.5) * (x + 0.6);
181
+ }
182
+ c = (((0.05 * d * x - 5.0) * x - 7.0) * x - 2.0) * x + b + c;
183
+ y = (((((0.4 * y + 6.3) * y + 36.0) * y + 94.5) / c - y - 3.0) / b + 1.0) * x;
184
+ y = a * y * y;
185
+ y = y > 0.002 ? exp(y) - 1.0 : 0.5 * y * y + y;
186
+ } else {
187
+ y = ((1.0 / (((n + 6.0) / (n * y) - 0.089 * d - 0.822) * (n + 2.0) * 3.0) + 0.5 / (n + 4.0)) * y - 1.0) * (n + 1.0) / (n + 2.0) + 1.0 / y;
188
+ }
189
+ return sign * sqrt(n * y);
190
+ }
@@ -12,7 +12,7 @@ void Init_ext() {
12
12
  rb_mAnomalyDetection
13
13
  .define_singleton_function(
14
14
  "_detect",
15
- [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
15
+ [](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
16
16
  Direction dir;
17
17
  if (direction == "pos") {
18
18
  dir = Direction::Positive;
@@ -24,10 +24,16 @@ void Init_ext() {
24
24
  throw std::invalid_argument("direction must be pos, neg, or both");
25
25
  }
26
26
 
27
- auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
27
+ auto res = anomaly_detection::params()
28
+ .max_anoms(k)
29
+ .alpha(alpha)
30
+ .direction(dir)
31
+ .verbose(verbose)
32
+ .callback(rb_thread_check_ints)
33
+ .fit(series, period);
28
34
 
29
35
  auto a = Rice::Array();
30
- for (auto v : res) {
36
+ for (auto v : res.anomalies) {
31
37
  a.push(v);
32
38
  }
33
39
  return a;
@@ -1,3 +1,3 @@
1
1
  module AnomalyDetection
2
- VERSION = "0.1.1"
2
+ VERSION = "0.1.4"
3
3
  end
@@ -5,18 +5,72 @@ require "anomaly_detection/ext"
5
5
  require "anomaly_detection/version"
6
6
 
7
7
  module AnomalyDetection
8
- def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", verbose: false)
9
- raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
10
-
11
- if series.is_a?(Hash)
12
- sorted = series.sort_by { |k, _| k }
13
- x = sorted.map(&:last)
14
- else
15
- x = series
8
+ class << self
9
+ def detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", plot: false, verbose: false)
10
+ raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
11
+
12
+ if series.is_a?(Hash)
13
+ sorted = series.sort_by { |k, _| k }
14
+ x = sorted.map(&:last)
15
+ else
16
+ x = series
17
+ end
18
+
19
+ res = _detect(x, period, max_anoms, alpha, direction, verbose)
20
+ res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
21
+ res
16
22
  end
17
23
 
18
- res = _detect(x, period, max_anoms, alpha, direction, verbose)
19
- res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
20
- res
24
+ # TODO add tooltips
25
+ def plot(series, anomalies)
26
+ require "vega"
27
+
28
+ data =
29
+ if series.is_a?(Hash)
30
+ series.map { |k, v| {x: iso8601(k), y: v, anomaly: anomalies.include?(k)} }
31
+ else
32
+ series.map.with_index { |v, i| {x: i, y: v, anomaly: anomalies.include?(i)} }
33
+ end
34
+
35
+ if series.is_a?(Hash)
36
+ x = {field: "x", type: "temporal"}
37
+ x["scale"] = {type: "utc"} if series.keys.first.is_a?(Date)
38
+ else
39
+ x = {field: "x", type: "quantitative"}
40
+ end
41
+
42
+ Vega.lite
43
+ .data(data)
44
+ .layer([
45
+ {
46
+ mark: {type: "line"},
47
+ encoding: {
48
+ x: x,
49
+ y: {field: "y", type: "quantitative", scale: {zero: false}},
50
+ color: {value: "#fa9088"}
51
+ }
52
+ },
53
+ {
54
+ transform: [{"filter": "datum.anomaly == true"}],
55
+ mark: {type: "point", size: 200},
56
+ encoding: {
57
+ x: x,
58
+ y: {field: "y", type: "quantitative"},
59
+ color: {value: "#19c7ca"}
60
+ }
61
+ }
62
+ ])
63
+ .config(axis: {title: nil, labelFontSize: 12})
64
+ end
65
+
66
+ private
67
+
68
+ def iso8601(v)
69
+ if v.is_a?(Date)
70
+ v.strftime("%Y-%m-%d")
71
+ else
72
+ v.strftime("%Y-%m-%dT%H:%M:%S.%L%z")
73
+ end
74
+ end
21
75
  end
22
76
  end