anomaly_detection 0.1.0 → 0.1.1

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f494384b8aaf48e96317865acbdeef3d125fd142f3e155359da5a3745ea8b016
4
- data.tar.gz: 5b7d171b3b11fd8041ab32aa02580adb59e2bcb7c31e2a513a9d996bef24b6c4
3
+ metadata.gz: e8882d64f5b2fe33406fa583404a3efead153d47a434ac6f5a1ec21d10666389
4
+ data.tar.gz: f0544946e1cb9011e32c7f3f0a8ae4fb90f9d1a2c7acca67c5f9e8f27f8780af
5
5
  SHA512:
6
- metadata.gz: 0ce7dd8d360d57008134c8f47196b90f212bc322371a6f9147feeb46de5eb364b78e00045b0147a8823c7eea0aef84e696784b96fce20d02ff8ebe0f270e0701
7
- data.tar.gz: 552133db65b2b7e5ebfd813d7ad73f54653efa54f991eaa0b977a05b9678492344c4fcf4aedd968f0c0953d8070c638d33ecb54390be9f9c20a301301710b50e
6
+ metadata.gz: d30c25bf7a1a7069b7ba7c430e928677f5636261cdb1bc4ff24745103bbb2769ad065a2f368b32440eb1b5fce6483e206667b462542912f44f60a9b61494f360
7
+ data.tar.gz: 4a3aba5aeed9e5488c77448d38f331cb3241db6cbf518fc5c4f8d00ca480ce13e53819e65b9a66970e8ca175349db00a4ae6b5b6bc8833f82a44219f840419b9
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.1.1 (2021-10-17)
2
+
3
+ - Added `verbose` option
4
+ - Improved interrupt handling
5
+ - Fixed issue with stopping too early
6
+
1
7
  ## 0.1.0 (2021-10-15)
2
8
 
3
9
  - First release
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # AnomalyDetection.rb
2
2
 
3
- :fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
3
+ :fire: Time series [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
4
4
 
5
5
  Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
6
6
 
@@ -53,7 +53,8 @@ AnomalyDetection.detect(
53
53
  period: 7, # number of observations in a single period
54
54
  alpha: 0.05, # level of statistical significance
55
55
  max_anoms: 0.1, # maximum number of anomalies as percent of data
56
- direction: "both" # pos, neg, or both
56
+ direction: "both", # pos, neg, or both
57
+ verbose: false # show progress
57
58
  )
58
59
  ```
59
60
 
@@ -1,24 +1,27 @@
1
- #include <cassert>
1
+ #include <functional>
2
2
  #include <iostream>
3
3
  #include <iterator>
4
+ #include <numeric>
5
+ #include <string>
4
6
  #include <vector>
5
7
 
8
+ #include "anomaly_detection.hpp"
6
9
  #include "cdflib.hpp"
7
10
  #include "stl.hpp"
8
11
 
9
- float median(const std::vector<float>& data) {
10
- std::vector<float> sorted(data);
11
- std::sort(sorted.begin(), sorted.end());
12
+ namespace anomaly_detection {
13
+
14
+ float median(const std::vector<float>& sorted) {
12
15
  return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
13
16
  }
14
17
 
15
- float mad(const std::vector<float>& data) {
16
- auto med = median(data);
18
+ float mad(const std::vector<float>& data, float med) {
17
19
  std::vector<float> res;
18
20
  res.reserve(data.size());
19
21
  for (auto v : data) {
20
22
  res.push_back(fabs(v - med));
21
23
  }
24
+ std::sort(res.begin(), res.end());
22
25
  return 1.4826 * median(res);
23
26
  }
24
27
 
@@ -36,45 +39,51 @@ float qt(double p, double df) {
36
39
  return t;
37
40
  }
38
41
 
39
- std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
40
- auto num_obs = data.size();
42
+ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> interrupt) {
43
+ auto n = data.size();
41
44
 
42
45
  // Check to make sure we have at least two periods worth of data for anomaly context
43
- assert(num_obs >= num_obs_per_period * 2);
46
+ if (n < num_obs_per_period * 2) {
47
+ throw std::invalid_argument("series must contain at least 2 periods");
48
+ }
44
49
 
45
50
  // Handle NANs
46
51
  auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
47
52
  if (nan > 0) {
48
- throw std::invalid_argument("Data contains NANs");
53
+ throw std::invalid_argument("series contains NANs");
49
54
  }
50
55
 
51
56
  // Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
52
- auto seasonal_length = data.size() * 10 + 1;
57
+ auto seasonal_length = n * 10 + 1;
53
58
  auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
54
59
 
55
60
  auto seasonal = data_decomp.seasonal;
56
61
  auto med = median(data);
57
62
  std::vector<float> data2;
58
- data2.reserve(data.size());
59
- for (auto i = 0; i < data.size(); i++) {
63
+ data2.reserve(n);
64
+ for (auto i = 0; i < n; i++) {
60
65
  data2.push_back(data[i] - seasonal[i] - med);
61
66
  }
62
67
 
63
- auto max_outliers = (size_t) num_obs * k;
64
- assert(max_outliers > 0);
65
-
66
- auto n = data2.size();
67
-
68
68
  std::vector<size_t> r_idx;
69
+ auto num_anoms = 0;
70
+ auto max_outliers = (size_t) n * k;
69
71
 
70
- std::vector<size_t> indexes;
71
- indexes.reserve(data2.size());
72
- for (auto i = 0; i < data2.size(); i++) {
73
- indexes.push_back(i);
74
- }
72
+ // Sort data for fast median
73
+ std::vector<size_t> indexes(n);
74
+ std::iota(indexes.begin(), indexes.end(), 0);
75
+ std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
76
+ std::sort(data2.begin(), data2.end());
75
77
 
76
78
  // Compute test statistic until r=max_outliers values have been removed from the sample
77
79
  for (auto i = 1; i <= max_outliers; i++) {
80
+ // Check for interrupts
81
+ interrupt();
82
+
83
+ if (verbose) {
84
+ std::cout << i << " / " << max_outliers << " completed" << std::endl;
85
+ }
86
+
78
87
  // TODO Improve performance between loop iterations
79
88
  auto ma = median(data2);
80
89
  std::vector<float> ares;
@@ -96,19 +105,18 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
96
105
  }
97
106
 
98
107
  // Protect against constant time series
99
- auto data_sigma = mad(data2);
108
+ auto data_sigma = mad(data2, ma);
100
109
  if (data_sigma == 0.0) {
101
110
  break;
102
111
  }
103
112
 
104
113
  auto iter = std::max_element(ares.begin(), ares.end());
105
114
  auto r_idx_i = std::distance(ares.begin(), iter);
106
- auto r_idx_i2 = indexes[r_idx_i];
107
115
 
108
116
  // Only need to take sigma of r for performance
109
117
  auto r = ares[r_idx_i] / data_sigma;
110
118
 
111
- // TODO Swap to last position and delete
119
+ r_idx.push_back(indexes[r_idx_i]);
112
120
  data2.erase(data2.begin() + r_idx_i);
113
121
  indexes.erase(indexes.begin() + r_idx_i);
114
122
 
@@ -124,33 +132,22 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
124
132
  auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
125
133
 
126
134
  if (r > lam) {
127
- r_idx.push_back(r_idx_i2);
128
- } else {
129
- break;
135
+ num_anoms = i;
130
136
  }
131
137
  }
132
138
 
139
+ std::vector<size_t> anomalies(r_idx.begin(), r_idx.begin() + num_anoms);
140
+
133
141
  // Sort like R version
134
- std::sort(r_idx.begin(), r_idx.end());
142
+ std::sort(anomalies.begin(), anomalies.end());
135
143
 
136
- return r_idx;
144
+ return anomalies;
137
145
  }
138
146
 
139
- std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction) {
140
- bool one_tail;
141
- bool upper_tail;
142
- if (direction == "pos") {
143
- one_tail = true;
144
- upper_tail = true;
145
- } else if (direction == "neg") {
146
- one_tail = true;
147
- upper_tail = false;
148
- } else if (direction == "both") {
149
- one_tail = false;
150
- upper_tail = true; // not used
151
- } else {
152
- throw std::invalid_argument("Bad direction");
153
- }
147
+ std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt) {
148
+ bool one_tail = direction != Direction::Both;
149
+ bool upper_tail = direction == Direction::Positive;
150
+ return detect_anoms(x, period, k, alpha, one_tail, upper_tail, verbose, interrupt);
151
+ }
154
152
 
155
- return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
156
153
  }
@@ -0,0 +1,12 @@
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <vector>
5
+
6
+ namespace anomaly_detection {
7
+
8
+ enum Direction { Positive, Negative, Both };
9
+
10
+ std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
11
+
12
+ }
@@ -1,8 +1,9 @@
1
- // rice
2
1
  #include <rice/rice.hpp>
3
2
  #include <rice/stl.hpp>
4
3
 
5
- std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction);
4
+ #include "anomaly_detection.hpp"
5
+
6
+ using anomaly_detection::Direction;
6
7
 
7
8
  extern "C"
8
9
  void Init_ext() {
@@ -11,8 +12,19 @@ void Init_ext() {
11
12
  rb_mAnomalyDetection
12
13
  .define_singleton_function(
13
14
  "_detect",
14
- [](std::vector<float> x, int period, float k, float alpha, const std::string& direction) {
15
- auto res = anomalies(x, period, k, alpha, direction);
15
+ [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
16
+ Direction dir;
17
+ if (direction == "pos") {
18
+ dir = Direction::Positive;
19
+ } else if (direction == "neg") {
20
+ dir = Direction::Negative;
21
+ } else if (direction == "both") {
22
+ dir = Direction::Both;
23
+ } else {
24
+ throw std::invalid_argument("direction must be pos, neg, or both");
25
+ }
26
+
27
+ auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
16
28
 
17
29
  auto a = Rice::Array();
18
30
  for (auto v : res) {
@@ -1,3 +1,3 @@
1
1
  module AnomalyDetection
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -5,7 +5,7 @@ require "anomaly_detection/ext"
5
5
  require "anomaly_detection/version"
6
6
 
7
7
  module AnomalyDetection
8
- def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both")
8
+ def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", verbose: false)
9
9
  raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
10
10
 
11
11
  if series.is_a?(Hash)
@@ -15,7 +15,7 @@ module AnomalyDetection
15
15
  x = series
16
16
  end
17
17
 
18
- res = _detect(x, period, max_anoms, alpha, direction)
18
+ res = _detect(x, period, max_anoms, alpha, direction, verbose)
19
19
  res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
20
20
  res
21
21
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anomaly_detection
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-15 00:00:00.000000000 Z
11
+ date: 2021-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -36,6 +36,7 @@ files:
36
36
  - NOTICE.txt
37
37
  - README.md
38
38
  - ext/anomaly_detection/anomaly_detection.cpp
39
+ - ext/anomaly_detection/anomaly_detection.hpp
39
40
  - ext/anomaly_detection/cdflib.cpp
40
41
  - ext/anomaly_detection/cdflib.hpp
41
42
  - ext/anomaly_detection/ext.cpp
@@ -68,5 +69,5 @@ requirements: []
68
69
  rubygems_version: 3.2.22
69
70
  signing_key:
70
71
  specification_version: 4
71
- summary: Anomaly detection for Ruby
72
+ summary: Time series anomaly detection for Ruby
72
73
  test_files: []