anomaly_detection 0.1.3 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cfb709e43863f4221a67e8f675f28b5361f6bf33a0d0f6fa4f52cdc0cad01796
4
- data.tar.gz: 40965f08bb75cdb673d43e42c7fc47403fbfb1b082de10cab86bac569916d5b8
3
+ metadata.gz: da5eb71023f77a4c05e6322c020ef602e8e22b7b5ba516fce99679af702c881d
4
+ data.tar.gz: 26560c8dd893c491bd3094202ff82ae33eefdcdba74fe4386b006f7f522906df
5
5
  SHA512:
6
- metadata.gz: '0496d044ecbe143be64164bd88092c5a5bd660f5fec6425e9f6ea0759f2ab7dbaa41a055c9d03a78c8e5a661b86bb25629c5e7a809614ba415d941f4d63fc9cb'
7
- data.tar.gz: 8cc3f28c981d0be5cdb3dbfd054910d45469d258a662e163d7b01379af93e6714874d043ff1521ae19506f742b63eb93bf631f6f895c11842c4c15028c66a4b8
6
+ metadata.gz: ec2e1459ca2410ee6ab1bce3fe9c528d6419b75e10c6448f1fe5b3030a2e3d8de320a23a9bded17702a01fd23d112007b909c8611e2da6c1ff4f8521352c89ac
7
+ data.tar.gz: ad150705d6e32a111c3bc044ef7f99910beebe572e07799719e72118422b7a9e6439943cac8b86d613537d7d0fb52cba86a668faf78d135abc888ce3737f8104
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
1
+ ## 0.2.0 (2023-01-31)
2
+
3
+ - Added experimental support for auto-detecting period
4
+ - Fixed result when no seasonality (period is less than 2)
5
+ - Dropped support for Ruby < 2.7
6
+
7
+ ## 0.1.4 (2022-03-19)
8
+
9
+ - Fixed initial median calculation
10
+
1
11
  ## 0.1.3 (2022-01-03)
2
12
 
3
13
  - Switched to dist.h
data/NOTICE.txt CHANGED
@@ -1,5 +1,5 @@
1
1
  Copyright (C) 2015 Twitter, Inc and other contributors
2
- Copyright (C) 2021 Andrew Kane
2
+ Copyright (C) 2021-2023 Andrew Kane
3
3
 
4
4
  This program is free software: you can redistribute it and/or modify
5
5
  it under the terms of the GNU General Public License as published by
data/README.md CHANGED
@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
11
11
  Add this line to your application’s Gemfile:
12
12
 
13
13
  ```ruby
14
- gem 'anomaly_detection'
14
+ gem "anomaly_detection"
15
15
  ```
16
16
 
17
17
  ## Getting Started
@@ -63,7 +63,7 @@ AnomalyDetection.detect(
63
63
  Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
64
64
 
65
65
  ```ruby
66
- gem 'vega'
66
+ gem "vega"
67
67
  ```
68
68
 
69
69
  And use:
@@ -1,12 +1,210 @@
1
+ /*!
2
+ * AnomalyDetection.cpp v0.1.3
3
+ * https://github.com/ankane/AnomalyDetection.cpp
4
+ * GPL-3.0-or-later License
5
+ */
6
+
1
7
  #pragma once
2
8
 
3
- #include <string>
9
+ #include <functional>
10
+ #include <iostream>
11
+ #include <iterator>
12
+ #include <numeric>
4
13
  #include <vector>
5
14
 
15
+ #include "dist.h"
16
+ #include "stl.hpp"
17
+
6
18
  namespace anomaly_detection {
7
19
 
8
20
  enum Direction { Positive, Negative, Both };
9
21
 
10
- std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
22
+ float median_sorted(const std::vector<float>& sorted) {
23
+ return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
24
+ }
25
+
26
+ float median(const std::vector<float>& data) {
27
+ std::vector<float> sorted(data);
28
+ std::sort(sorted.begin(), sorted.end());
29
+ return median_sorted(sorted);
30
+ }
31
+
32
+ float mad(const std::vector<float>& data, float med) {
33
+ std::vector<float> res;
34
+ res.reserve(data.size());
35
+ for (auto v : data) {
36
+ res.push_back(fabs(v - med));
37
+ }
38
+ std::sort(res.begin(), res.end());
39
+ return 1.4826 * median_sorted(res);
40
+ }
41
+
42
+ std::vector<size_t> detect_anoms(const std::vector<float>& data, size_t num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
43
+ auto n = data.size();
44
+
45
+ // Check to make sure we have at least two periods worth of data for anomaly context
46
+ if (n < num_obs_per_period * 2) {
47
+ throw std::invalid_argument("series must contain at least 2 periods");
48
+ }
49
+
50
+ // Handle NANs
51
+ auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
52
+ if (nan > 0) {
53
+ throw std::invalid_argument("series contains NANs");
54
+ }
55
+
56
+ std::vector<float> data2;
57
+ data2.reserve(n);
58
+ auto med = median(data);
59
+
60
+ if (num_obs_per_period > 1) {
61
+ // Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
62
+ auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
63
+ auto seasonal = data_decomp.seasonal;
64
+
65
+ for (size_t i = 0; i < n; i++) {
66
+ data2.push_back(data[i] - seasonal[i] - med);
67
+ }
68
+ } else {
69
+ for (size_t i = 0; i < n; i++) {
70
+ data2.push_back(data[i] - med);
71
+ }
72
+ }
73
+
74
+ auto num_anoms = 0;
75
+ auto max_outliers = (size_t) n * k;
76
+ std::vector<size_t> anomalies;
77
+ anomalies.reserve(max_outliers);
78
+
79
+ // Sort data for fast median
80
+ // Use stable sort for indexes for deterministic results
81
+ std::vector<size_t> indexes(n);
82
+ std::iota(indexes.begin(), indexes.end(), 0);
83
+ std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
84
+ std::sort(data2.begin(), data2.end());
85
+
86
+ // Compute test statistic until r=max_outliers values have been removed from the sample
87
+ for (auto i = 1; i <= max_outliers; i++) {
88
+ if (verbose) {
89
+ std::cout << i << " / " << max_outliers << " completed" << std::endl;
90
+ }
91
+
92
+ // TODO Improve performance between loop iterations
93
+ auto ma = median_sorted(data2);
94
+ std::vector<float> ares;
95
+ ares.reserve(data2.size());
96
+ if (one_tail) {
97
+ if (upper_tail) {
98
+ for (auto v : data2) {
99
+ ares.push_back(v - ma);
100
+ }
101
+ } else {
102
+ for (auto v : data2) {
103
+ ares.push_back(ma - v);
104
+ }
105
+ }
106
+ } else {
107
+ for (auto v : data2) {
108
+ ares.push_back(fabs(v - ma));
109
+ }
110
+ }
111
+
112
+ // Protect against constant time series
113
+ auto data_sigma = mad(data2, ma);
114
+ if (data_sigma == 0.0) {
115
+ break;
116
+ }
117
+
118
+ auto iter = std::max_element(ares.begin(), ares.end());
119
+ auto r_idx_i = std::distance(ares.begin(), iter);
120
+
121
+ // Only need to take sigma of r for performance
122
+ auto r = ares[r_idx_i] / data_sigma;
123
+
124
+ anomalies.push_back(indexes[r_idx_i]);
125
+ data2.erase(data2.begin() + r_idx_i);
126
+ indexes.erase(indexes.begin() + r_idx_i);
127
+
128
+ // Compute critical value
129
+ float p;
130
+ if (one_tail) {
131
+ p = 1.0 - alpha / (n - i + 1);
132
+ } else {
133
+ p = 1.0 - alpha / (2.0 * (n - i + 1));
134
+ }
135
+
136
+ auto t = students_t_ppf(p, n - i - 1);
137
+ auto lam = t * (n - i) / sqrt(((n - i - 1) + t * t) * (n - i + 1));
138
+
139
+ if (r > lam) {
140
+ num_anoms = i;
141
+ }
142
+
143
+ if (callback != nullptr) {
144
+ callback();
145
+ }
146
+ }
147
+
148
+ anomalies.resize(num_anoms);
149
+
150
+ // Sort like R version
151
+ std::sort(anomalies.begin(), anomalies.end());
152
+
153
+ return anomalies;
154
+ }
155
+
156
+ class AnomalyDetectionResult {
157
+ public:
158
+ std::vector<size_t> anomalies;
159
+ };
160
+
161
+ class AnomalyDetectionParams {
162
+ float alpha_ = 0.05;
163
+ float max_anoms_ = 0.1;
164
+ Direction direction_ = Direction::Both;
165
+ bool verbose_ = false;
166
+ std::function<void()> callback_ = nullptr;
167
+
168
+ public:
169
+ inline AnomalyDetectionParams alpha(float alpha) {
170
+ this->alpha_ = alpha;
171
+ return *this;
172
+ };
173
+
174
+ inline AnomalyDetectionParams max_anoms(float max_anoms) {
175
+ this->max_anoms_ = max_anoms;
176
+ return *this;
177
+ };
178
+
179
+ inline AnomalyDetectionParams direction(Direction direction) {
180
+ this->direction_ = direction;
181
+ return *this;
182
+ };
183
+
184
+ inline AnomalyDetectionParams verbose(bool verbose) {
185
+ this->verbose_ = verbose;
186
+ return *this;
187
+ };
188
+
189
+ inline AnomalyDetectionParams callback(std::function<void()> callback) {
190
+ this->callback_ = callback;
191
+ return *this;
192
+ };
193
+
194
+ AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
195
+ };
196
+
197
+ AnomalyDetectionParams params() {
198
+ return AnomalyDetectionParams();
199
+ }
200
+
201
+ AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
202
+ bool one_tail = this->direction_ != Direction::Both;
203
+ bool upper_tail = this->direction_ == Direction::Positive;
204
+
205
+ auto res = AnomalyDetectionResult();
206
+ res.anomalies = detect_anoms(series, period, this->max_anoms_, this->alpha_, one_tail, upper_tail, this->verbose_, this->callback_);
207
+ return res;
208
+ }
11
209
 
12
210
  }
@@ -1,72 +1,119 @@
1
1
  /*!
2
- * dist.h v0.1.0
2
+ * dist.h v0.3.0
3
3
  * https://github.com/ankane/dist.h
4
4
  * Unlicense OR MIT License
5
5
  */
6
6
 
7
7
  #pragma once
8
8
 
9
- #define _USE_MATH_DEFINES
10
-
11
- #include <assert.h>
12
9
  #include <math.h>
13
10
 
14
- // Winitzki, S. (2008).
15
- // A handy approximation for the error function and its inverse.
16
- // https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
17
- // from https://sites.google.com/site/winitzki
18
- double erf(double x) {
19
- double sign = x < 0 ? -1.0 : 1.0;
20
- x = x < 0 ? -x : x;
21
-
22
- double a = 0.14;
23
- double x2 = x * x;
24
- return sign * sqrt(1.0 - exp(-x2 * (4.0 / M_PI + a * x2) / (1.0 + a * x2)));
25
- }
11
+ #ifdef M_E
12
+ #define DIST_E M_E
13
+ #else
14
+ #define DIST_E 2.71828182845904523536
15
+ #endif
26
16
 
27
- // Winitzki, S. (2008).
28
- // A handy approximation for the error function and its inverse.
29
- // https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
30
- // from https://sites.google.com/site/winitzki
31
- double inverse_erf(double x) {
32
- double sign = x < 0 ? -1.0 : 1.0;
33
- x = x < 0 ? -x : x;
34
-
35
- double a = 0.147;
36
- double ln = log(1.0 - x * x);
37
- double f1 = 2.0 / (M_PI * a);
38
- double f2 = ln / 2.0;
39
- double f3 = f1 + f2;
40
- double f4 = 1.0 / a * ln;
41
- return sign * sqrt(-f1 - f2 + sqrt(f3 * f3 - f4));
42
- }
17
+ #ifdef M_PI
18
+ #define DIST_PI M_PI
19
+ #else
20
+ #define DIST_PI 3.14159265358979323846
21
+ #endif
22
+
23
+ #ifdef M_SQRT2
24
+ #define DIST_SQRT2 M_SQRT2
25
+ #else
26
+ #define DIST_SQRT2 1.41421356237309504880
27
+ #endif
43
28
 
44
29
  double normal_pdf(double x, double mean, double std_dev) {
45
- double var = std_dev * std_dev;
46
- return (1.0 / (var * sqrt(2.0 * M_PI))) * pow(M_E, -0.5 * pow((x - mean) / var, 2));
30
+ if (std_dev <= 0) {
31
+ return NAN;
32
+ }
33
+
34
+ double n = (x - mean) / std_dev;
35
+ return (1.0 / (std_dev * sqrt(2.0 * DIST_PI))) * pow(DIST_E, -0.5 * n * n);
47
36
  }
48
37
 
49
38
  double normal_cdf(double x, double mean, double std_dev) {
50
- return 0.5 * (1.0 + erf((x - mean) / (std_dev * std_dev * sqrt(2))));
39
+ if (std_dev <= 0) {
40
+ return NAN;
41
+ }
42
+
43
+ return 0.5 * (1.0 + erf((x - mean) / (std_dev * DIST_SQRT2)));
51
44
  }
52
45
 
46
+ // Wichura, M. J. (1988).
47
+ // Algorithm AS 241: The Percentage Points of the Normal Distribution.
48
+ // Journal of the Royal Statistical Society. Series C (Applied Statistics), 37(3), 477-484.
53
49
  double normal_ppf(double p, double mean, double std_dev) {
54
- assert(p >= 0 && p <= 1);
50
+ if (p < 0 || p > 1 || std_dev <= 0 || isnan(mean) || isnan(std_dev)) {
51
+ return NAN;
52
+ }
53
+
54
+ if (p == 0) {
55
+ return -INFINITY;
56
+ }
55
57
 
56
- return mean + (std_dev * std_dev) * sqrt(2) * inverse_erf(2.0 * p - 1.0);
58
+ if (p == 1) {
59
+ return INFINITY;
60
+ }
61
+
62
+ double q = p - 0.5;
63
+ if (fabs(q) < 0.425) {
64
+ double r = 0.180625 - q * q;
65
+ return mean + std_dev * q *
66
+ (((((((2.5090809287301226727e3 * r + 3.3430575583588128105e4) * r + 6.7265770927008700853e4) * r + 4.5921953931549871457e4) * r + 1.3731693765509461125e4) * r + 1.9715909503065514427e3) * r + 1.3314166789178437745e2) * r + 3.3871328727963666080e0) /
67
+ (((((((5.2264952788528545610e3 * r + 2.8729085735721942674e4) * r + 3.9307895800092710610e4) * r + 2.1213794301586595867e4) * r + 5.3941960214247511077e3) * r + 6.8718700749205790830e2) * r + 4.2313330701600911252e1) * r + 1);
68
+ } else {
69
+ double r = q < 0 ? p : 1 - p;
70
+ r = sqrt(-log(r));
71
+ double sign = q < 0 ? -1 : 1;
72
+ if (r < 5) {
73
+ r -= 1.6;
74
+ return mean + std_dev * sign *
75
+ (((((((7.74545014278341407640e-4 * r + 2.27238449892691845833e-2) * r + 2.41780725177450611770e-1) * r + 1.27045825245236838258e0) * r + 3.64784832476320460504e0) * r + 5.76949722146069140550e0) * r + 4.63033784615654529590e0) * r + 1.42343711074968357734e0) /
76
+ (((((((1.05075007164441684324e-9 * r + 5.47593808499534494600e-4) * r + 1.51986665636164571966e-2) * r + 1.48103976427480074590e-1) * r + 6.89767334985100004550e-1) * r + 1.67638483018380384940e0) * r + 2.05319162663775882187e0) * r + 1);
77
+ } else {
78
+ r -= 5;
79
+ return mean + std_dev * sign *
80
+ (((((((2.01033439929228813265e-7 * r + 2.71155556874348757815e-5) * r + 1.24266094738807843860e-3) * r + 2.65321895265761230930e-2) * r + 2.96560571828504891230e-1) * r + 1.78482653991729133580e0) * r + 5.46378491116411436990e0) * r + 6.65790464350110377720e0) /
81
+ (((((((2.04426310338993978564e-15 * r + 1.42151175831644588870e-7) * r + 1.84631831751005468180e-5) * r + 7.86869131145613259100e-4) * r + 1.48753612908506148525e-2) * r + 1.36929880922735805310e-1) * r + 5.99832206555887937690e-1) * r + 1);
82
+ }
83
+ }
57
84
  }
58
85
 
59
- double students_t_pdf(double x, unsigned int n) {
60
- assert(n >= 1);
86
+ double students_t_pdf(double x, double n) {
87
+ if (n <= 0) {
88
+ return NAN;
89
+ }
90
+
91
+ if (n == INFINITY) {
92
+ return normal_pdf(x, 0, 1);
93
+ }
61
94
 
62
- return tgamma((n + 1.0) / 2.0) / (sqrt(n * M_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
95
+ return tgamma((n + 1.0) / 2.0) / (sqrt(n * DIST_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
63
96
  }
64
97
 
65
98
  // Hill, G. W. (1970).
66
99
  // Algorithm 395: Student's t-distribution.
67
100
  // Communications of the ACM, 13(10), 617-619.
68
- double students_t_cdf(double x, unsigned int n) {
69
- assert(n >= 1);
101
+ double students_t_cdf(double x, double n) {
102
+ if (n < 1) {
103
+ return NAN;
104
+ }
105
+
106
+ if (isnan(x)) {
107
+ return NAN;
108
+ }
109
+
110
+ if (!isfinite(x)) {
111
+ return x < 0 ? 0 : 1;
112
+ }
113
+
114
+ if (n == INFINITY) {
115
+ return normal_cdf(x, 0, 1);
116
+ }
70
117
 
71
118
  double start = x < 0 ? 0 : 1;
72
119
  double sign = x < 0 ? 1 : -1;
@@ -76,7 +123,7 @@ double students_t_cdf(double x, unsigned int n) {
76
123
  double y = t / n;
77
124
  double b = 1.0 + y;
78
125
 
79
- if ((n >= 20 && t < n) || n > 200) {
126
+ if (n > floor(n) || (n >= 20 && t < n) || n > 200) {
80
127
  // asymptotic series for large or noninteger n
81
128
  if (y > 10e-6) {
82
129
  y = log(b);
@@ -88,6 +135,10 @@ double students_t_cdf(double x, unsigned int n) {
88
135
  return start + sign * normal_cdf(-y, 0.0, 1.0);
89
136
  }
90
137
 
138
+ // make n int
139
+ // n is int between 1 and 200 if made it here
140
+ n = (int) n;
141
+
91
142
  if (n < 20 && t < 4.0) {
92
143
  // nested summation of cosine series
93
144
  y = sqrt(y);
@@ -104,7 +155,7 @@ double students_t_cdf(double x, unsigned int n) {
104
155
  n -= 2;
105
156
  }
106
157
  }
107
- a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / M_PI);
158
+ a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
108
159
  return start + sign * (z - a) / 2;
109
160
  }
110
161
 
@@ -127,16 +178,21 @@ double students_t_cdf(double x, unsigned int n) {
127
178
  a = (n - 1) / (b * n) * a + y;
128
179
  n -= 2;
129
180
  }
130
- a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / M_PI);
181
+ a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
131
182
  return start + sign * (z - a) / 2;
132
183
  }
133
184
 
134
185
  // Hill, G. W. (1970).
135
186
  // Algorithm 396: Student's t-quantiles.
136
187
  // Communications of the ACM, 13(10), 619-620.
137
- double students_t_ppf(double p, unsigned int n) {
138
- assert(p >= 0 && p <= 1);
139
- assert(n >= 1);
188
+ double students_t_ppf(double p, double n) {
189
+ if (p < 0 || p > 1 || n < 1) {
190
+ return NAN;
191
+ }
192
+
193
+ if (n == INFINITY) {
194
+ return normal_ppf(p, 0, 1);
195
+ }
140
196
 
141
197
  // distribution is symmetric
142
198
  double sign = p < 0.5 ? -1 : 1;
@@ -149,7 +205,7 @@ double students_t_ppf(double p, unsigned int n) {
149
205
  return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
150
206
  }
151
207
 
152
- double half_pi = M_PI / 2.0;
208
+ double half_pi = DIST_PI / 2.0;
153
209
 
154
210
  if (n == 1) {
155
211
  p = p * half_pi;
@@ -12,7 +12,7 @@ void Init_ext() {
12
12
  rb_mAnomalyDetection
13
13
  .define_singleton_function(
14
14
  "_detect",
15
- [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
15
+ [](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
16
16
  Direction dir;
17
17
  if (direction == "pos") {
18
18
  dir = Direction::Positive;
@@ -24,10 +24,16 @@ void Init_ext() {
24
24
  throw std::invalid_argument("direction must be pos, neg, or both");
25
25
  }
26
26
 
27
- auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
27
+ auto res = anomaly_detection::params()
28
+ .max_anoms(k)
29
+ .alpha(alpha)
30
+ .direction(dir)
31
+ .verbose(verbose)
32
+ .callback(rb_thread_check_ints)
33
+ .fit(series, period);
28
34
 
29
35
  auto a = Rice::Array();
30
- for (auto v : res) {
36
+ for (auto v : res.anomalies) {
31
37
  a.push(v);
32
38
  }
33
39
  return a;