RubyGems - anomaly_detection - Versions diffs - 0.1.0 → 0.1.1 - Mend

anomaly_detection 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

checksums.yaml +4 -4
data/CHANGELOG.md +6 -0
data/README.md +3 -2
data/ext/anomaly_detection/anomaly_detection.cpp +44 -47
data/ext/anomaly_detection/anomaly_detection.hpp +12 -0
data/ext/anomaly_detection/ext.cpp +16 -4
data/lib/anomaly_detection/version.rb +1 -1
data/lib/anomaly_detection.rb +2 -2
metadata +4 -3

checksums.yaml CHANGED Viewed

@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz: f494384b8aaf48e96317865acbdeef3d125fd142f3e155359da5a3745ea8b016
-  data.tar.gz: 5b7d171b3b11fd8041ab32aa02580adb59e2bcb7c31e2a513a9d996bef24b6c4
+  metadata.gz: e8882d64f5b2fe33406fa583404a3efead153d47a434ac6f5a1ec21d10666389
+  data.tar.gz: f0544946e1cb9011e32c7f3f0a8ae4fb90f9d1a2c7acca67c5f9e8f27f8780af
 SHA512:
-  metadata.gz: 0ce7dd8d360d57008134c8f47196b90f212bc322371a6f9147feeb46de5eb364b78e00045b0147a8823c7eea0aef84e696784b96fce20d02ff8ebe0f270e0701
-  data.tar.gz: 552133db65b2b7e5ebfd813d7ad73f54653efa54f991eaa0b977a05b9678492344c4fcf4aedd968f0c0953d8070c638d33ecb54390be9f9c20a301301710b50e
+  metadata.gz: d30c25bf7a1a7069b7ba7c430e928677f5636261cdb1bc4ff24745103bbb2769ad065a2f368b32440eb1b5fce6483e206667b462542912f44f60a9b61494f360
+  data.tar.gz: 4a3aba5aeed9e5488c77448d38f331cb3241db6cbf518fc5c4f8d00ca480ce13e53819e65b9a66970e8ca175349db00a4ae6b5b6bc8833f82a44219f840419b9

data/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,9 @@
+## 0.1.1 (2021-10-17)
+- Added `verbose` option
+- Improved interrupt handling
+- Fixed issue with stopping too early
 ## 0.1.0 (2021-10-15)
 - First release

data/README.md CHANGED Viewed

@@ -1,6 +1,6 @@
 # AnomalyDetection.rb
-:fire: [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
+:fire: Time series [AnomalyDetection](https://github.com/twitter/AnomalyDetection) for Ruby
 Learn [how it works](https://blog.twitter.com/engineering/en_us/a/2015/introducing-practical-and-robust-anomaly-detection-in-a-time-series)
@@ -53,7 +53,8 @@ AnomalyDetection.detect(
   period: 7,            # number of observations in a single period
   alpha: 0.05,          # level of statistical significance
   max_anoms: 0.1,       # maximum number of anomalies as percent of data
-  direction: "both"     # pos, neg, or both
+  direction: "both",    # pos, neg, or both
+  verbose: false        # show progress
 )
 ```

data/ext/anomaly_detection/anomaly_detection.cpp CHANGED Viewed

@@ -1,24 +1,27 @@
-#include <cassert>
+#include <functional>
 #include <iostream>
 #include <iterator>
+#include <numeric>
+#include <string>
 #include <vector>
+#include "anomaly_detection.hpp"
 #include "cdflib.hpp"
 #include "stl.hpp"
-float median(const std::vector<float>& data) {
-    std::vector<float> sorted(data);
-    std::sort(sorted.begin(), sorted.end());
+namespace anomaly_detection {
+float median(const std::vector<float>& sorted) {
     return (sorted[(sorted.size() - 1) / 2] + sorted[sorted.size() / 2]) / 2.0;
 }
-float mad(const std::vector<float>& data) {
-    auto med = median(data);
+float mad(const std::vector<float>& data, float med) {
     std::vector<float> res;
     res.reserve(data.size());
     for (auto v : data) {
         res.push_back(fabs(v - med));
     }
+    std::sort(res.begin(), res.end());
     return 1.4826 * median(res);
 }
@@ -36,45 +39,51 @@ float qt(double p, double df) {
     return t;
 }
-std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail) {
-    auto num_obs = data.size();
+std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per_period, float k, float alpha, bool one_tail, bool upper_tail, bool verbose, std::function<void()> interrupt) {
+    auto n = data.size();
     // Check to make sure we have at least two periods worth of data for anomaly context
-    assert(num_obs >= num_obs_per_period * 2);
+    if (n < num_obs_per_period * 2) {
+        throw std::invalid_argument("series must contain at least 2 periods");
+    }
     // Handle NANs
     auto nan = std::count_if(data.begin(), data.end(), [](const auto& value) { return std::isnan(value); });
     if (nan > 0) {
-        throw std::invalid_argument("Data contains NANs");
+        throw std::invalid_argument("series contains NANs");
     }
     // Decompose data. This returns a univarite remainder which will be used for anomaly detection. Optionally, we might NOT decompose.
-    auto seasonal_length = data.size() * 10 + 1;
+    auto seasonal_length = n * 10 + 1;
     auto data_decomp = stl::params().robust(true).seasonal_length(seasonal_length).fit(data, num_obs_per_period);
     auto seasonal = data_decomp.seasonal;
     auto med = median(data);
     std::vector<float> data2;
-    data2.reserve(data.size());
-    for (auto i = 0; i < data.size(); i++) {
+    data2.reserve(n);
+    for (auto i = 0; i < n; i++) {
         data2.push_back(data[i] - seasonal[i] - med);
     }
-    auto max_outliers = (size_t) num_obs * k;
-    assert(max_outliers > 0);
-    auto n = data2.size();
     std::vector<size_t> r_idx;
+    auto num_anoms = 0;
+    auto max_outliers = (size_t) n * k;
-    std::vector<size_t> indexes;
-    indexes.reserve(data2.size());
-    for (auto i = 0; i < data2.size(); i++) {
-        indexes.push_back(i);
-    }
+    // Sort data for fast median
+    std::vector<size_t> indexes(n);
+    std::iota(indexes.begin(), indexes.end(), 0);
+    std::stable_sort(indexes.begin(), indexes.end(), [&data2](size_t a, size_t b) { return data2[a] < data2[b]; });
+    std::sort(data2.begin(), data2.end());
     // Compute test statistic until r=max_outliers values have been removed from the sample
     for (auto i = 1; i <= max_outliers; i++) {
+        // Check for interrupts
+        interrupt();
+        if (verbose) {
+            std::cout << i << " / " << max_outliers << " completed" << std::endl;
+        }
         // TODO Improve performance between loop iterations
         auto ma = median(data2);
         std::vector<float> ares;
@@ -96,19 +105,18 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
         }
         // Protect against constant time series
-        auto data_sigma = mad(data2);
+        auto data_sigma = mad(data2, ma);
         if (data_sigma == 0.0) {
             break;
         }
         auto iter = std::max_element(ares.begin(), ares.end());
         auto r_idx_i = std::distance(ares.begin(), iter);
-        auto r_idx_i2 = indexes[r_idx_i];
         // Only need to take sigma of r for performance
         auto r = ares[r_idx_i] / data_sigma;
-        // TODO Swap to last position and delete
+        r_idx.push_back(indexes[r_idx_i]);
         data2.erase(data2.begin() + r_idx_i);
         indexes.erase(indexes.begin() + r_idx_i);
@@ -124,33 +132,22 @@ std::vector<size_t> detect_anoms(const std::vector<float>& data, int num_obs_per
         auto lam = t * (n - i) / sqrt(((n - i - 1) + powf(t, 2.0)) * (n - i + 1));
         if (r > lam) {
-            r_idx.push_back(r_idx_i2);
-        } else {
-            break;
+            num_anoms = i;
         }
     }
+    std::vector<size_t> anomalies(r_idx.begin(), r_idx.begin() + num_anoms);
     // Sort like R version
-    std::sort(r_idx.begin(), r_idx.end());
+    std::sort(anomalies.begin(), anomalies.end());
-    return r_idx;
+    return anomalies;
 }
-std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction) {
-    bool one_tail;
-    bool upper_tail;
-    if (direction == "pos") {
-        one_tail = true;
-        upper_tail = true;
-    } else if (direction == "neg") {
-        one_tail = true;
-        upper_tail = false;
-    } else if (direction == "both") {
-        one_tail = false;
-        upper_tail = true; // not used
-    } else {
-        throw std::invalid_argument("Bad direction");
-    }
+std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt) {
+    bool one_tail = direction != Direction::Both;
+    bool upper_tail = direction == Direction::Positive;
+    return detect_anoms(x, period, k, alpha, one_tail, upper_tail, verbose, interrupt);
+}
-    return detect_anoms(x, period, k, alpha, one_tail, upper_tail);
 }

data/ext/anomaly_detection/anomaly_detection.hpp ADDED Viewed

@@ -0,0 +1,12 @@
+#pragma once
+#include <string>
+#include <vector>
+namespace anomaly_detection {
+enum Direction { Positive, Negative, Both };
+std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, Direction direction, bool verbose, std::function<void()> interrupt);
+}

data/ext/anomaly_detection/ext.cpp CHANGED Viewed

@@ -1,8 +1,9 @@
-// rice
 #include <rice/rice.hpp>
 #include <rice/stl.hpp>
-std::vector<size_t> anomalies(const std::vector<float>& x, int period, float k, float alpha, const std::string& direction);
+#include "anomaly_detection.hpp"
+using anomaly_detection::Direction;
 extern "C"
 void Init_ext() {
@@ -11,8 +12,19 @@ void Init_ext() {
   rb_mAnomalyDetection
     .define_singleton_function(
       "_detect",
-      [](std::vector<float> x, int period, float k, float alpha, const std::string& direction) {
-        auto res = anomalies(x, period, k, alpha, direction);
+      [](std::vector<float> x, int period, float k, float alpha, const std::string& direction, bool verbose) {
+        Direction dir;
+        if (direction == "pos") {
+          dir = Direction::Positive;
+        } else if (direction == "neg") {
+          dir = Direction::Negative;
+        } else if (direction == "both") {
+          dir = Direction::Both;
+        } else {
+          throw std::invalid_argument("direction must be pos, neg, or both");
+        }
+        auto res = anomaly_detection::anomalies(x, period, k, alpha, dir, verbose, rb_thread_check_ints);
         auto a = Rice::Array();
         for (auto v : res) {

data/lib/anomaly_detection/version.rb CHANGED Viewed

@@ -1,3 +1,3 @@
 module AnomalyDetection
-  VERSION = "0.1.0"
+  VERSION = "0.1.1"
 end

data/lib/anomaly_detection.rb CHANGED Viewed

@@ -5,7 +5,7 @@ require "anomaly_detection/ext"
 require "anomaly_detection/version"
 module AnomalyDetection
-  def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both")
+  def self.detect(series, period:, max_anoms: 0.1, alpha: 0.05, direction: "both", verbose: false)
     raise ArgumentError, "series must contain at least 2 periods" if series.size < period * 2
     if series.is_a?(Hash)
@@ -15,7 +15,7 @@ module AnomalyDetection
       x = series
     end
-    res = _detect(x, period, max_anoms, alpha, direction)
+    res = _detect(x, period, max_anoms, alpha, direction, verbose)
     res.map! { |i| sorted[i][0] } if series.is_a?(Hash)
     res
   end

metadata CHANGED Viewed

@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: anomaly_detection
 version: !ruby/object:Gem::Version
-  version: 0.1.0
+  version: 0.1.1
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2021-10-15 00:00:00.000000000 Z
+date: 2021-10-18 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rice
@@ -36,6 +36,7 @@ files:
 - NOTICE.txt
 - README.md
 - ext/anomaly_detection/anomaly_detection.cpp
+- ext/anomaly_detection/anomaly_detection.hpp
 - ext/anomaly_detection/cdflib.cpp
 - ext/anomaly_detection/cdflib.hpp
 - ext/anomaly_detection/ext.cpp
@@ -68,5 +69,5 @@ requirements: []
 rubygems_version: 3.2.22
 signing_key:
 specification_version: 4
-summary: Anomaly detection for Ruby
+summary: Time series anomaly detection for Ruby
 test_files: []