anomaly_detection 0.1.3 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: cfb709e43863f4221a67e8f675f28b5361f6bf33a0d0f6fa4f52cdc0cad01796
4
- data.tar.gz: 40965f08bb75cdb673d43e42c7fc47403fbfb1b082de10cab86bac569916d5b8
3
+ metadata.gz: da5eb71023f77a4c05e6322c020ef602e8e22b7b5ba516fce99679af702c881d
4
+ data.tar.gz: 26560c8dd893c491bd3094202ff82ae33eefdcdba74fe4386b006f7f522906df
5
5
  SHA512:
6
- metadata.gz: '0496d044ecbe143be64164bd88092c5a5bd660f5fec6425e9f6ea0759f2ab7dbaa41a055c9d03a78c8e5a661b86bb25629c5e7a809614ba415d941f4d63fc9cb'
7
- data.tar.gz: 8cc3f28c981d0be5cdb3dbfd054910d45469d258a662e163d7b01379af93e6714874d043ff1521ae19506f742b63eb93bf631f6f895c11842c4c15028c66a4b8
6
+ metadata.gz: ec2e1459ca2410ee6ab1bce3fe9c528d6419b75e10c6448f1fe5b3030a2e3d8de320a23a9bded17702a01fd23d112007b909c8611e2da6c1ff4f8521352c89ac
7
+ data.tar.gz: ad150705d6e32a111c3bc044ef7f99910beebe572e07799719e72118422b7a9e6439943cac8b86d613537d7d0fb52cba86a668faf78d135abc888ce3737f8104
data/CHANGELOG.md CHANGED
@@ -1,3 +1,13 @@
1
+ ## 0.2.0 (2023-01-31)
2
+
3
+ - Added experimental support for auto-detecting period
4
+ - Fixed result when no seasonality (period is less than 2)
5
+ - Dropped support for Ruby < 2.7
6
+
7
+ ## 0.1.4 (2022-03-19)
8
+
9
+ - Fixed initial median calculation
10
+
1
11
  ## 0.1.3 (2022-01-03)
2
12
 
3
13
  - Switched to dist.h
data/NOTICE.txt CHANGED
@@ -1,5 +1,5 @@
1
1
  Copyright (C) 2015 Twitter, Inc and other contributors
2
- Copyright (C) 2021 Andrew Kane
2
+ Copyright (C) 2021-2023 Andrew Kane
3
3
 
4
4
  This program is free software: you can redistribute it and/or modify
5
5
  it under the terms of the GNU General Public License as published by
data/README.md CHANGED
@@ -11,7 +11,7 @@ Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introduci
11
11
  Add this line to your application’s Gemfile:
12
12
 
13
13
  ```ruby
14
- gem 'anomaly_detection'
14
+ gem "anomaly_detection"
15
15
  ```
16
16
 
17
17
  ## Getting Started
@@ -63,7 +63,7 @@ AnomalyDetection.detect(
63
63
  Add [Vega](https://github.com/ankane/vega) to your application’s Gemfile:
64
64
 
65
65
  ```ruby
66
- gem 'vega'
66
+ gem "vega"
67
67
  ```
68
68
 
69
69
  And use:
@@ -1,12 +1,210 @@
1
+ /*!
2
+ * AnomalyDetection.cpp v0.1.3
3
+ * https://github.com/ankane/AnomalyDetection.cpp
4
+ * GPL-3.0-or-later License
5
+ */
6
+
1
7
  #pragma once
2
8
 
3
- #include <string>
9
+ #include <functional>
10
+ #include <iostream>
11
+ #include <iterator>
12
+ #include <numeric>
4
13
  #include <vector>
5
14
 
15
+ #include "dist.h"
16
+ #include "stl.hpp"
17
+
6
18
  namespace anomaly_detection {
7
19
 
8
20
  enum Direction { Positive, Negative, Both };
9
21
 
10
- std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
22
+ float median_sorted(const std::vector<float>& sorted) {
23
+ return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
24
+ }
25
+
26
+ float median(const std::vector<float>& data) {
27
+ std::vector<float> sorted(data);
28
+ std::sort(sorted.begin(), sorted.end());
29
+ return median_sorted(sorted);
30
+ }
31
+
32
+ float mad(const std::vector<float>& data, float med) {
33
+ std::vector<float> res;
34
+ res.reserve(data.size());
35
+ for (auto v : data) {
36
+ res.push_back(fabs(v - med));
37
+ }
38
+ std::sort(res.begin(), res.end());
39
+ return 1.4826 * median_sorted(res);
40
+ }
41
+
42
+ std::vector<size_t> detect_anoms(const std::vector<float>& data, size_t num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> callback) {
43
+ auto n = data.size();
44
+
45
+ // Check to make sure we have at least two periods' worth of data for anomaly context
46
+ if (n < num_obs_per_period * 2) {
47
+ throw std::invalid_argument("series must contain at least 2 periods");
48
+ }
49
+
50
+ // Handle NANs
51
+ auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
52
+ if (nan > 0) {
53
+ throw std::invalid_argument("series contains NANs");
54
+ }
55
+
56
+ std::vector<float> data2;
57
+ data2.reserve(n);
58
+ auto med = median(data);
59
+
60
+ if (num_obs_per_period > 1) {
61
+ // Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
62
+ auto data_decomp = stl::params().robust(true).seasonal_length(data.size() * 10 + 1).fit(data, num_obs_per_period);
63
+ auto seasonal = data_decomp.seasonal;
64
+
65
+ for (size_t i = 0; i < n; i++) {
66
+ data2.push_back(data[i] - seasonal[i] - med);
67
+ }
68
+ } else {
69
+ for (size_t i = 0; i < n; i++) {
70
+ data2.push_back(data[i] - med);
71
+ }
72
+ }
73
+
74
+ auto num_anoms = 0;
75
+ auto max_outliers = (size_t) n * k;
76
+ std::vector<size_t> anomalies;
77
+ anomalies.reserve(max_outliers);
78
+
79
+ // Sort data for fast median
80
+ // Use stable sort for indexes for deterministic results
81
+ std::vector<size_t> indexes(n);
82
+ std::iota(indexes.begin(), indexes.end(), 0);
83
+ std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
84
+ std::sort(data2.begin(), data2.end());
85
+
86
+ // Compute test statistic until r=max_outliers values have been removed from the sample
87
+ for (auto i = 1; i <= max_outliers; i++) {
88
+ if (verbose) {
89
+ std::cout << i << " / " << max_outliers << " completed" << std::endl;
90
+ }
91
+
92
+ // TODO Improve performance between loop iterations
93
+ auto ma = median_sorted(data2);
94
+ std::vector<float> ares;
95
+ ares.reserve(data2.size());
96
+ if (one_tail) {
97
+ if (upper_tail) {
98
+ for (auto v : data2) {
99
+ ares.push_back(v - ma);
100
+ }
101
+ } else {
102
+ for (auto v : data2) {
103
+ ares.push_back(ma - v);
104
+ }
105
+ }
106
+ } else {
107
+ for (auto v : data2) {
108
+ ares.push_back(fabs(v - ma));
109
+ }
110
+ }
111
+
112
+ // Protect against constant time series
113
+ auto data_sigma = mad(data2, ma);
114
+ if (data_sigma == 0.0) {
115
+ break;
116
+ }
117
+
118
+ auto iter = std::max_element(ares.begin(), ares.end());
119
+ auto r_idx_i = std::distance(ares.begin(), iter);
120
+
121
+ // Only need to take sigma of r for performance
122
+ auto r = ares[r_idx_i] / data_sigma;
123
+
124
+ anomalies.push_back(indexes[r_idx_i]);
125
+ data2.erase(data2.begin() + r_idx_i);
126
+ indexes.erase(indexes.begin() + r_idx_i);
127
+
128
+ // Compute critical value
129
+ float p;
130
+ if (one_tail) {
131
+ p = 1.0 - alpha / (n - i + 1);
132
+ } else {
133
+ p = 1.0 - alpha / (2.0 * (n - i + 1));
134
+ }
135
+
136
+ auto t = students_t_ppf(p, n - i - 1);
137
+ auto lam = t * (n - i) / sqrt(((n - i - 1) + t * t) * (n - i + 1));
138
+
139
+ if (r > lam) {
140
+ num_anoms = i;
141
+ }
142
+
143
+ if (callback != nullptr) {
144
+ callback();
145
+ }
146
+ }
147
+
148
+ anomalies.resize(num_anoms);
149
+
150
+ // Sort like R version
151
+ std::sort(anomalies.begin(), anomalies.end());
152
+
153
+ return anomalies;
154
+ }
155
+
156
+ class AnomalyDetectionResult {
157
+ public:
158
+ std::vector<size_t> anomalies;
159
+ };
160
+
161
+ class AnomalyDetectionParams {
162
+ float alpha_ = 0.05;
163
+ float max_anoms_ = 0.1;
164
+ Direction direction_ = Direction::Both;
165
+ bool verbose_ = false;
166
+ std::function<void()> callback_ = nullptr;
167
+
168
+ public:
169
+ inline AnomalyDetectionParams alpha(float alpha) {
170
+ this->alpha_ = alpha;
171
+ return *this;
172
+ };
173
+
174
+ inline AnomalyDetectionParams max_anoms(float max_anoms) {
175
+ this->max_anoms_ = max_anoms;
176
+ return *this;
177
+ };
178
+
179
+ inline AnomalyDetectionParams direction(Direction direction) {
180
+ this->direction_ = direction;
181
+ return *this;
182
+ };
183
+
184
+ inline AnomalyDetectionParams verbose(bool verbose) {
185
+ this->verbose_ = verbose;
186
+ return *this;
187
+ };
188
+
189
+ inline AnomalyDetectionParams callback(std::function<void()> callback) {
190
+ this->callback_ = callback;
191
+ return *this;
192
+ };
193
+
194
+ AnomalyDetectionResult fit(const std::vector<float>& series, size_t period);
195
+ };
196
+
197
+ AnomalyDetectionParams params() {
198
+ return AnomalyDetectionParams();
199
+ }
200
+
201
+ AnomalyDetectionResult AnomalyDetectionParams::fit(const std::vector<float>& series, size_t period) {
202
+ bool one_tail = this->direction_ != Direction::Both;
203
+ bool upper_tail = this->direction_ == Direction::Positive;
204
+
205
+ auto res = AnomalyDetectionResult();
206
+ res.anomalies = detect_anoms(series, period, this->max_anoms_, this->alpha_, one_tail, upper_tail, this->verbose_, this->callback_);
207
+ return res;
208
+ }
11
209
 
12
210
  }
@@ -1,72 +1,119 @@
1
1
  /*!
2
- * dist.h v0.1.0
2
+ * dist.h v0.3.0
3
3
  * https://github.com/ankane/dist.h
4
4
  * Unlicense OR MIT License
5
5
  */
6
6
 
7
7
  #pragma once
8
8
 
9
- #define _USE_MATH_DEFINES
10
-
11
- #include <assert.h>
12
9
  #include <math.h>
13
10
 
14
- // Winitzki, S. (2008).
15
- // A handy approximation for the error function and its inverse.
16
- // https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
17
- // from https://sites.google.com/site/winitzki
18
- double erf(double x) {
19
- double sign = x < 0 ? -1.0 : 1.0;
20
- x = x < 0 ? -x : x;
21
-
22
- double a = 0.14;
23
- double x2 = x * x;
24
- return sign * sqrt(1.0 - exp(-x2 * (4.0 / M_PI + a * x2) / (1.0 + a * x2)));
25
- }
11
+ #ifdef M_E
12
+ #define DIST_E M_E
13
+ #else
14
+ #define DIST_E 2.71828182845904523536
15
+ #endif
26
16
 
27
- // Winitzki, S. (2008).
28
- // A handy approximation for the error function and its inverse.
29
- // https://drive.google.com/file/d/0B2Mt7luZYBrwZlctV3A3eF82VGM/view?resourcekey=0-UQpPhwZgzP0sF4LHBDlLtg
30
- // from https://sites.google.com/site/winitzki
31
- double inverse_erf(double x) {
32
- double sign = x < 0 ? -1.0 : 1.0;
33
- x = x < 0 ? -x : x;
34
-
35
- double a = 0.147;
36
- double ln = log(1.0 - x * x);
37
- double f1 = 2.0 / (M_PI * a);
38
- double f2 = ln / 2.0;
39
- double f3 = f1 + f2;
40
- double f4 = 1.0 / a * ln;
41
- return sign * sqrt(-f1 - f2 + sqrt(f3 * f3 - f4));
42
- }
17
+ #ifdef M_PI
18
+ #define DIST_PI M_PI
19
+ #else
20
+ #define DIST_PI 3.14159265358979323846
21
+ #endif
22
+
23
+ #ifdef M_SQRT2
24
+ #define DIST_SQRT2 M_SQRT2
25
+ #else
26
+ #define DIST_SQRT2 1.41421356237309504880
27
+ #endif
43
28
 
44
29
  double normal_pdf(double x, double mean, double std_dev) {
45
- double var = std_dev * std_dev;
46
- return (1.0 / (var * sqrt(2.0 * M_PI))) * pow(M_E, -0.5 * pow((x - mean) / var, 2));
30
+ if (std_dev <= 0) {
31
+ return NAN;
32
+ }
33
+
34
+ double n = (x - mean) / std_dev;
35
+ return (1.0 / (std_dev * sqrt(2.0 * DIST_PI))) * pow(DIST_E, -0.5 * n * n);
47
36
  }
48
37
 
49
38
  double normal_cdf(double x, double mean, double std_dev) {
50
- return 0.5 * (1.0 + erf((x - mean) / (std_dev * std_dev * sqrt(2))));
39
+ if (std_dev <= 0) {
40
+ return NAN;
41
+ }
42
+
43
+ return 0.5 * (1.0 + erf((x - mean) / (std_dev * DIST_SQRT2)));
51
44
  }
52
45
 
46
+ // Wichura, M. J. (1988).
47
+ // Algorithm AS 241: The Percentage Points of the Normal Distribution.
48
+ // Journal of the Royal Statistical Society. Series C (Applied Statistics), 37(3), 477-484.
53
49
  double normal_ppf(double p, double mean, double std_dev) {
54
- assert(p >= 0 && p <= 1);
50
+ if (p < 0 || p > 1 || std_dev <= 0 || isnan(mean) || isnan(std_dev)) {
51
+ return NAN;
52
+ }
53
+
54
+ if (p == 0) {
55
+ return -INFINITY;
56
+ }
55
57
 
56
- return mean + (std_dev * std_dev) * sqrt(2) * inverse_erf(2.0 * p - 1.0);
58
+ if (p == 1) {
59
+ return INFINITY;
60
+ }
61
+
62
+ double q = p - 0.5;
63
+ if (fabs(q) < 0.425) {
64
+ double r = 0.180625 - q * q;
65
+ return mean + std_dev * q *
66
+ (((((((2.5090809287301226727e3 * r + 3.3430575583588128105e4) * r + 6.7265770927008700853e4) * r + 4.5921953931549871457e4) * r + 1.3731693765509461125e4) * r + 1.9715909503065514427e3) * r + 1.3314166789178437745e2) * r + 3.3871328727963666080e0) /
67
+ (((((((5.2264952788528545610e3 * r + 2.8729085735721942674e4) * r + 3.9307895800092710610e4) * r + 2.1213794301586595867e4) * r + 5.3941960214247511077e3) * r + 6.8718700749205790830e2) * r + 4.2313330701600911252e1) * r + 1);
68
+ } else {
69
+ double r = q < 0 ? p : 1 - p;
70
+ r = sqrt(-log(r));
71
+ double sign = q < 0 ? -1 : 1;
72
+ if (r < 5) {
73
+ r -= 1.6;
74
+ return mean + std_dev * sign *
75
+ (((((((7.74545014278341407640e-4 * r + 2.27238449892691845833e-2) * r + 2.41780725177450611770e-1) * r + 1.27045825245236838258e0) * r + 3.64784832476320460504e0) * r + 5.76949722146069140550e0) * r + 4.63033784615654529590e0) * r + 1.42343711074968357734e0) /
76
+ (((((((1.05075007164441684324e-9 * r + 5.47593808499534494600e-4) * r + 1.51986665636164571966e-2) * r + 1.48103976427480074590e-1) * r + 6.89767334985100004550e-1) * r + 1.67638483018380384940e0) * r + 2.05319162663775882187e0) * r + 1);
77
+ } else {
78
+ r -= 5;
79
+ return mean + std_dev * sign *
80
+ (((((((2.01033439929228813265e-7 * r + 2.71155556874348757815e-5) * r + 1.24266094738807843860e-3) * r + 2.65321895265761230930e-2) * r + 2.96560571828504891230e-1) * r + 1.78482653991729133580e0) * r + 5.46378491116411436990e0) * r + 6.65790464350110377720e0) /
81
+ (((((((2.04426310338993978564e-15 * r + 1.42151175831644588870e-7) * r + 1.84631831751005468180e-5) * r + 7.86869131145613259100e-4) * r + 1.48753612908506148525e-2) * r + 1.36929880922735805310e-1) * r + 5.99832206555887937690e-1) * r + 1);
82
+ }
83
+ }
57
84
  }
58
85
 
59
- double students_t_pdf(double x, unsigned int n) {
60
- assert(n >= 1);
86
+ double students_t_pdf(double x, double n) {
87
+ if (n <= 0) {
88
+ return NAN;
89
+ }
90
+
91
+ if (n == INFINITY) {
92
+ return normal_pdf(x, 0, 1);
93
+ }
61
94
 
62
- return tgamma((n + 1.0) / 2.0) / (sqrt(n * M_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
95
+ return tgamma((n + 1.0) / 2.0) / (sqrt(n * DIST_PI) * tgamma(n / 2.0)) * pow(1.0 + x * x / n, -(n + 1.0) / 2.0);
63
96
  }
64
97
 
65
98
  // Hill, G. W. (1970).
66
99
  // Algorithm 395: Student's t-distribution.
67
100
  // Communications of the ACM, 13(10), 617-619.
68
- double students_t_cdf(double x, unsigned int n) {
69
- assert(n >= 1);
101
+ double students_t_cdf(double x, double n) {
102
+ if (n < 1) {
103
+ return NAN;
104
+ }
105
+
106
+ if (isnan(x)) {
107
+ return NAN;
108
+ }
109
+
110
+ if (!isfinite(x)) {
111
+ return x < 0 ? 0 : 1;
112
+ }
113
+
114
+ if (n == INFINITY) {
115
+ return normal_cdf(x, 0, 1);
116
+ }
70
117
 
71
118
  double start = x < 0 ? 0 : 1;
72
119
  double sign = x < 0 ? 1 : -1;
@@ -76,7 +123,7 @@ double students_t_cdf(double x, unsigned int n) {
76
123
  double y = t / n;
77
124
  double b = 1.0 + y;
78
125
 
79
- if ((n >= 20 && t < n) || n > 200) {
126
+ if (n > floor(n) || (n >= 20 && t < n) || n > 200) {
80
127
  // asymptotic series for large or noninteger n
81
128
  if (y > 10e-6) {
82
129
  y = log(b);
@@ -88,6 +135,10 @@ double students_t_cdf(double x, unsigned int n) {
88
135
  return start + sign * normal_cdf(-y, 0.0, 1.0);
89
136
  }
90
137
 
138
+ // convert n to an integer
139
+ // n is a whole number between 1 and 200 if execution reaches this point
140
+ n = (int) n;
141
+
91
142
  if (n < 20 && t < 4.0) {
92
143
  // nested summation of cosine series
93
144
  y = sqrt(y);
@@ -104,7 +155,7 @@ double students_t_cdf(double x, unsigned int n) {
104
155
  n -= 2;
105
156
  }
106
157
  }
107
- a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / M_PI);
158
+ a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
108
159
  return start + sign * (z - a) / 2;
109
160
  }
110
161
 
@@ -127,16 +178,21 @@ double students_t_cdf(double x, unsigned int n) {
127
178
  a = (n - 1) / (b * n) * a + y;
128
179
  n -= 2;
129
180
  }
130
- a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / M_PI);
181
+ a = n == 0 ? a / sqrt(b) : (atan(y) + a / b) * (2.0 / DIST_PI);
131
182
  return start + sign * (z - a) / 2;
132
183
  }
133
184
 
134
185
  // Hill, G. W. (1970).
135
186
  // Algorithm 396: Student's t-quantiles.
136
187
  // Communications of the ACM, 13(10), 619-620.
137
- double students_t_ppf(double p, unsigned int n) {
138
- assert(p >= 0 && p <= 1);
139
- assert(n >= 1);
188
+ double students_t_ppf(double p, double n) {
189
+ if (p < 0 || p > 1 || n < 1) {
190
+ return NAN;
191
+ }
192
+
193
+ if (n == INFINITY) {
194
+ return normal_ppf(p, 0, 1);
195
+ }
140
196
 
141
197
  // distribution is symmetric
142
198
  double sign = p < 0.5 ? -1 : 1;
@@ -149,7 +205,7 @@ double students_t_ppf(double p, unsigned int n) {
149
205
  return sign * sqrt(2.0 / (p * (2.0 - p)) - 2.0);
150
206
  }
151
207
 
152
- double half_pi = M_PI / 2.0;
208
+ double half_pi = DIST_PI / 2.0;
153
209
 
154
210
  if (n == 1) {
155
211
  p = p * half_pi;
@@ -12,7 +12,7 @@ void Init_ext() {
12
12
  rb_mAnomalyDetection
13
13
  .define_singleton_function(
14
14
  "_detect",
15
- [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
15
+ [](std::vector<float> series, int period, float k, float alpha, const std::string& direction, bool verbose) {
16
16
  Direction dir;
17
17
  if (direction == "pos") {
18
18
  dir = Direction::Positive;
@@ -24,10 +24,16 @@ void Init_ext() {
24
24
  throw std::invalid_argument("direction must be pos, neg, or both");
25
25
  }
26
26
 
27
- auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
27
+ auto res = anomaly_detection::params()
28
+ .max_anoms(k)
29
+ .alpha(alpha)
30
+ .direction(dir)
31
+ .verbose(verbose)
32
+ .callback(rb_thread_check_ints)
33
+ .fit(series, period);
28
34
 
29
35
  auto a = Rice::Array();
30
- for (auto v : res) {
36
+ for (auto v : res.anomalies) {
31
37
  a.push(v);
32
38
  }
33
39
  return a;