anomaly_detection 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: f494384b8aaf48e96317865acbdeef3d125fd142f3e155359da5a3745ea8b016
4
- data.tar.gz: 5b7d171b3b11fd8041ab32aa02580adb59e2bcb7c31e2a513a9d996bef24b6c4
3
+ metadata.gz: e8882d64f5b2fe33406fa583404a3efead153d47a434ac6f5a1ec21d10666389
4
+ data.tar.gz: f0544946e1cb9011e32c7f3f0a8ae4fb90f9d1a2c7acca67c5f9e8f27f8780af
5
5
  SHA512:
6
- metadata.gz: 0ce7dd8d360d57008134c8f47196b90f212bc322371a6f9147feeb46de5eb364b78e00045b0147a8823c7eea0aef84e696784b96fce20d02ff8ebe0f270e0701
7
- data.tar.gz: 552133db65b2b7e5ebfd813d7ad73f54653efa54f991eaa0b977a05b9678492344c4fcf4aedd968f0c0953d8070c638d33ecb54390be9f9c20a301301710b50e
6
+ metadata.gz: d30c25bf7a1a7069b7ba7c430e928677f5636261cdb1bc4ff24745103bbb2769ad065a2f368b32440eb1b5fce6483e206667b462542912f44f60a9b61494f360
7
+ data.tar.gz: 4a3aba5aeed9e5488c77448d38f331cb3241db6cbf518fc5c4f8d00ca480ce13e53819e65b9a66970e8ca175349db00a4ae6b5b6bc8833f82a44219f840419b9
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.1.1 (2021-10-17)
2
+
3
+ - Added `verbose` option
4
+ - Improved interrupt handling
5
+ - Fixed issue with stopping too early
6
+
1
7
  ## 0.1.0 (2021-10-15)
2
8
 
3
9
  - First release
data/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # AnomalyDetection.rb
2
2
 
3
- :fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
3
+ :fire: Time series [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
4
4
 
5
5
  Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
6
6
 
@@ -53,7 +53,8 @@ AnomalyDetection.detect(
53
53
  period: 7, # number of observations in a single period
54
54
  alpha: 0.05, # level of statistical significance
55
55
  max_anoms: 0.1, # maximum number of anomalies as percent of data
56
- direction: "both" # pos, neg, or both
56
+ direction: "both", # pos, neg, or both
57
+ verbose: false # show progress
57
58
  )
58
59
  ```
59
60
 
@@ -1,24 +1,27 @@
1
- #include <cassert>
1
+ #include <functional>
2
2
  #include <iostream>
3
3
  #include <iterator>
4
+ #include <numeric>
5
+ #include <string>
4
6
  #include <vector>
5
7
 
8
+ #include "anomaly_detection.hpp"
6
9
  #include "cdflib.hpp"
7
10
  #include "stl.hpp"
8
11
 
9
- float median(const std::vector<float>& data) {
10
- std::vector<float> sorted(data);
11
- std::sort(sorted.begin(), sorted.end());
12
+ namespace anomaly_detection {
13
+
14
+ float median(const std::vector<float>& sorted) {
12
15
  return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
13
16
  }
14
17
 
15
- float mad(const std::vector<float>& data) {
16
- auto med = median(data);
18
+ float mad(const std::vector<float>& data, float med) {
17
19
  std::vector<float> res;
18
20
  res.reserve(data.size());
19
21
  for (auto v : data) {
20
22
  res.push_back(fabs(v - med));
21
23
  }
24
+ std::sort(res.begin(), res.end());
22
25
  return 1.4826 * median(res);
23
26
  }
24
27
 
@@ -36,45 +39,51 @@ float qt(double p, double df) {
36
39
  return t;
37
40
  }
38
41
 
39
- std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
40
- auto num_obs = data.size();
42
+ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> interrupt) {
43
+ auto n = data.size();
41
44
 
42
45
  // Check to make sure we have at least two periods worth of data for anomaly context
43
- assert(num_obs >= num_obs_per_period * 2);
46
+ if (n < num_obs_per_period * 2) {
47
+ throw std::invalid_argument("series must contain at least 2 periods");
48
+ }
44
49
 
45
50
  // Handle NANs
46
51
  auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
47
52
  if (nan > 0) {
48
- throw std::invalid_argument("Data contains NANs");
53
+ throw std::invalid_argument("series contains NANs");
49
54
  }
50
55
 
51
56
  // Decompose data. This returns a univariate remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
52
- auto seasonal_length = data.size() * 10 + 1;
57
+ auto seasonal_length = n * 10 + 1;
53
58
  auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
54
59
 
55
60
  auto seasonal = data_decomp.seasonal;
56
61
  auto med = median(data);
57
62
  std::vector<float> data2;
58
- data2.reserve(data.size());
59
- for (auto i = 0; i < data.size(); i++) {
63
+ data2.reserve(n);
64
+ for (auto i = 0; i < n; i++) {
60
65
  data2.push_back(data[i] - seasonal[i] - med);
61
66
  }
62
67
 
63
- auto max_outliers = (size_t) num_obs * k;
64
- assert(max_outliers > 0);
65
-
66
- auto n = data2.size();
67
-
68
68
  std::vector<size_t> r_idx;
69
+ auto num_anoms = 0;
70
+ auto max_outliers = (size_t) n * k;
69
71
 
70
- std::vector<size_t> indexes;
71
- indexes.reserve(data2.size());
72
- for (auto i = 0; i < data2.size(); i++) {
73
- indexes.push_back(i);
74
- }
72
+ // Sort data for fast median
73
+ std::vector<size_t> indexes(n);
74
+ std::iota(indexes.begin(), indexes.end(), 0);
75
+ std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
76
+ std::sort(data2.begin(), data2.end());
75
77
 
76
78
  // Compute test statistic until r=max_outliers values have been removed from the sample
77
79
  for (auto i = 1; i <= max_outliers; i++) {
80
+ // Check for interrupts
81
+ interrupt();
82
+
83
+ if (verbose) {
84
+ std::cout << i << " / " << max_outliers << " completed" << std::endl;
85
+ }
86
+
78
87
  // TODO Improve performance between loop iterations
79
88
  auto ma = median(data2);
80
89
  std::vector<float> ares;
@@ -96,19 +105,18 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
96
105
  }
97
106
 
98
107
  // Protect against constant time series
99
- auto data_sigma = mad(data2);
108
+ auto data_sigma = mad(data2, ma);
100
109
  if (data_sigma == 0.0) {
101
110
  break;
102
111
  }
103
112
 
104
113
  auto iter = std::max_element(ares.begin(), ares.end());
105
114
  auto r_idx_i = std::distance(ares.begin(), iter);
106
- auto r_idx_i2 = indexes[r_idx_i];
107
115
 
108
116
  // Only need to take sigma of r for performance
109
117
  auto r = ares[r_idx_i] / data_sigma;
110
118
 
111
- // TODO Swap to last position and delete
119
+ r_idx.push_back(indexes[r_idx_i]);
112
120
  data2.erase(data2.begin() + r_idx_i);
113
121
  indexes.erase(indexes.begin() + r_idx_i);
114
122
 
@@ -124,33 +132,22 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
124
132
  auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
125
133
 
126
134
  if (r > lam) {
127
- r_idx.push_back(r_idx_i2);
128
- } else {
129
- break;
135
+ num_anoms = i;
130
136
  }
131
137
  }
132
138
 
139
+ std::vector<size_t> anomalies(r_idx.begin(), r_idx.begin() + num_anoms);
140
+
133
141
  // Sort like R version
134
- std::sort(r_idx.begin(), r_idx.end());
142
+ std::sort(anomalies.begin(), anomalies.end());
135
143
 
136
- return r_idx;
144
+ return anomalies;
137
145
  }
138
146
 
139
- std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction) {
140
- bool one_tail;
141
- bool upper_tail;
142
- if (direction == "pos") {
143
- one_tail = true;
144
- upper_tail = true;
145
- } else if (direction == "neg") {
146
- one_tail = true;
147
- upper_tail = false;
148
- } else if (direction == "both") {
149
- one_tail = false;
150
- upper_tail = true; // not used
151
- } else {
152
- throw std::invalid_argument("Bad direction");
153
- }
147
+ std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt) {
148
+ bool one_tail = direction != Direction::Both;
149
+ bool upper_tail = direction == Direction::Positive;
150
+ return detect_anoms(x, period, k, alpha, one_tail, upper_tail, verbose, interrupt);
151
+ }
154
152
 
155
- return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
156
153
  }
@@ -0,0 +1,12 @@
1
+ #pragma once
2
+
3
+ #include <string>
4
+ #include <vector>
5
+
6
+ namespace anomaly_detection {
7
+
8
+ enum Direction { Positive, Negative, Both };
9
+
10
+ std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
11
+
12
+ }
@@ -1,8 +1,9 @@
1
- // rice
2
1
  #include <rice/rice.hpp>
3
2
  #include <rice/stl.hpp>
4
3
 
5
- std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction);
4
+ #include "anomaly_detection.hpp"
5
+
6
+ using anomaly_detection::Direction;
6
7
 
7
8
  extern "C"
8
9
  void Init_ext() {
@@ -11,8 +12,19 @@ void Init_ext() {
11
12
  rb_mAnomalyDetection
12
13
  .define_singleton_function(
13
14
  "_detect",
14
- [](std::vector<float> x, int period, float k, float alpha, const std::string& direction) {
15
- auto res = anomalies(x, period, k, alpha, direction);
15
+ [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
16
+ Direction dir;
17
+ if (direction == "pos") {
18
+ dir = Direction::Positive;
19
+ } else if (direction == "neg") {
20
+ dir = Direction::Negative;
21
+ } else if (direction == "both") {
22
+ dir = Direction::Both;
23
+ } else {
24
+ throw std::invalid_argument("direction must be pos, neg, or both");
25
+ }
26
+
27
+ auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
16
28
 
17
29
  auto a = Rice::Array();
18
30
  for (auto v : res) {
@@ -1,3 +1,3 @@
1
1
  module AnomalyDetection
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.1"
3
3
  end
@@ -5,7 +5,7 @@ require "anomaly_detection/ext"
5
5
  require "anomaly_detection/version"
6
6
 
7
7
  module AnomalyDetection
8
- def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both")
8
+ def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", verbose: false)
9
9
  raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
10
10
 
11
11
  if series.is_a?(Hash)
@@ -15,7 +15,7 @@ module AnomalyDetection
15
15
  x = series
16
16
  end
17
17
 
18
- res = _detect(x, period, max_anoms, alpha, direction)
18
+ res = _detect(x, period, max_anoms, alpha, direction, verbose)
19
19
  res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
20
20
  res
21
21
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: anomaly_detection
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2021-10-15 00:00:00.000000000 Z
11
+ date: 2021-10-18 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -36,6 +36,7 @@ files:
36
36
  - NOTICE.txt
37
37
  - README.md
38
38
  - ext/anomaly_detection/anomaly_detection.cpp
39
+ - ext/anomaly_detection/anomaly_detection.hpp
39
40
  - ext/anomaly_detection/cdflib.cpp
40
41
  - ext/anomaly_detection/cdflib.hpp
41
42
  - ext/anomaly_detection/ext.cpp
@@ -68,5 +69,5 @@ requirements: []
68
69
  rubygems_version: 3.2.22
69
70
  signing_key:
70
71
  specification_version: 4
71
- summary: Anomaly detection for Ruby
72
+ summary: Time series anomaly detection for Ruby
72
73
  test_files: []