anomaly 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +26 -14
- data/Rakefile +13 -0
- data/anomaly.gemspec +1 -0
- data/lib/anomaly/detector.rb +18 -13
- data/lib/anomaly/version.rb +1 -1
- data/spec/anomaly/detector_spec.rb +22 -9
- metadata +16 -5
data/README.md
CHANGED
@@ -16,7 +16,7 @@ And then execute:
|
|
16
16
|
bundle install
|
17
17
|
```
|
18
18
|
|
19
|
-
For max performance (
|
19
|
+
For max performance (~ 2x faster), also install the NArray gem:
|
20
20
|
|
21
21
|
```ruby
|
22
22
|
gem "narray"
|
@@ -26,23 +26,32 @@ Anomaly will automatically detect it and use it.
|
|
26
26
|
|
27
27
|
## How to Use
|
28
28
|
|
29
|
-
|
29
|
+
Say we have weather data for sunny days and we're trying to detect days that aren't sunny. The data looks like:
|
30
30
|
|
31
31
|
```ruby
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
[
|
32
|
+
# Each row is a different day.
|
33
|
+
# [temperature (°F), humidity (%), pressure (in)]
|
34
|
+
weather_data = [
|
35
|
+
[85, 68, 10.4],
|
36
|
+
[88, 62, 12.1],
|
37
|
+
[86, 64, 13.6],
|
38
|
+
...
|
36
39
|
]
|
37
|
-
|
40
|
+
```
|
41
|
+
|
42
|
+
Train the detector with **only non-anomalies** (sunny days in our case).
|
43
|
+
|
44
|
+
```ruby
|
45
|
+
ad = Anomaly::Detector.new(weather_data)
|
38
46
|
```
|
39
47
|
|
40
48
|
That's it! Let's test for anomalies.
|
41
49
|
|
42
50
|
```ruby
|
43
|
-
|
51
|
+
# 79°F, 66% humidity, 12.3 in. pressure
|
52
|
+
test_sample = [79, 66, 12.3]
|
44
53
|
ad.probability(test_sample)
|
45
|
-
# =>
|
54
|
+
# => 7.537174740907633e-08
|
46
55
|
```
|
47
56
|
|
48
57
|
**Super-important:** You must select a threshold for anomalies (which we denote with ε - "epsilon")
|
@@ -52,15 +61,13 @@ Probabilities less than ε are considered anomalies. If ε is higher, more thing
|
|
52
61
|
``` ruby
|
53
62
|
ad.anomaly?(test_sample, 1e-10)
|
54
63
|
# => false
|
55
|
-
ad.anomaly?(test_sample,
|
64
|
+
ad.anomaly?(test_sample, 1e-5)
|
56
65
|
# => true
|
57
66
|
```
|
58
67
|
|
59
|
-
|
68
|
+
The wiki has [sample code](https://github.com/ankane/anomaly/wiki/Home) to help you find the best ε for your application.
|
60
69
|
|
61
|
-
|
62
|
-
# TODO
|
63
|
-
```
|
70
|
+
### Persistence
|
64
71
|
|
65
72
|
You can easily persist the detector to a file or database - it's very tiny.
|
66
73
|
|
@@ -76,6 +83,11 @@ File.open("anomaly_detector.dump", "w") {|f| f.write(serialized_ad) }
|
|
76
83
|
ad2 = Marshal.load(File.open("anomaly_detector.dump", "r").read)
|
77
84
|
```
|
78
85
|
|
86
|
+
## TODO
|
87
|
+
|
88
|
+
- Train in chunks (for very large datasets)
|
89
|
+
- Multivariate normal distribution (possibly)
|
90
|
+
|
79
91
|
## Contributing
|
80
92
|
|
81
93
|
1. Fork it
|
data/Rakefile
CHANGED
@@ -2,3 +2,16 @@
|
|
2
2
|
require "bundler/gem_tasks"
|
3
3
|
require "rspec/core/rake_task"
|
4
4
|
RSpec::Core::RakeTask.new("spec")
|
5
|
+
|
6
|
+
require "benchmark"
|
7
|
+
require "anomaly"
|
8
|
+
|
9
|
+
task :benchmark do
|
10
|
+
data = 1_000_000.times.map{ [rand, rand, rand, rand] }
|
11
|
+
|
12
|
+
Benchmark.bm do |x|
|
13
|
+
x.report { Anomaly::Detector.new(data) }
|
14
|
+
require "narray"
|
15
|
+
x.report { Anomaly::Detector.new(data) }
|
16
|
+
end
|
17
|
+
end
|
data/anomaly.gemspec
CHANGED
data/lib/anomaly/detector.rb
CHANGED
@@ -2,39 +2,46 @@ module Anomaly
|
|
2
2
|
class Detector
|
3
3
|
|
4
4
|
def initialize(data = nil)
|
5
|
-
@
|
5
|
+
@m = 0
|
6
6
|
train(data) if data
|
7
7
|
end
|
8
8
|
|
9
9
|
def train(data)
|
10
10
|
if defined?(NMatrix)
|
11
11
|
d = NMatrix.to_na(data)
|
12
|
+
@n, @m = d.sizes
|
12
13
|
# Convert these to an array for Marshal.dump
|
13
14
|
@mean = d.mean(1).to_a
|
14
15
|
@std = d.stddev(1).to_a
|
15
16
|
else
|
16
17
|
# Default to Array, since built-in Matrix does not give us a big performance advantage.
|
17
18
|
d = data.to_a
|
18
|
-
|
19
|
+
@m = d.size
|
20
|
+
@n = d.first ? d.first.size : 0
|
21
|
+
cols = @n.times.map{|i| d.map{|r| r[i]}}
|
19
22
|
@mean = cols.map{|c| mean(c)}
|
20
23
|
@std = cols.each_with_index.map{|c,i| std(c, @mean[i])}
|
21
24
|
end
|
22
|
-
|
23
25
|
@std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std}
|
24
|
-
|
25
|
-
# raise "Standard deviation cannot be zero" if @std.find_index{|i| i == 0 or i.nan?}
|
26
|
-
|
27
|
-
@trained = true
|
28
26
|
end
|
29
27
|
|
30
28
|
def trained?
|
31
|
-
@trained
|
29
|
+
@m > 0
|
30
|
+
end
|
31
|
+
|
32
|
+
def samples
|
33
|
+
@m
|
32
34
|
end
|
33
35
|
|
36
|
+
# Limit the probability of features to [0,1]
|
37
|
+
# to keep probabilities at same scale.
|
34
38
|
def probability(x)
|
35
39
|
raise "Train me first" unless trained?
|
36
|
-
raise ArgumentError, "x must have #{@
|
37
|
-
|
40
|
+
raise ArgumentError, "x must have #{@n} elements" if x.size != @n
|
41
|
+
@n.times.map do |i|
|
42
|
+
p = normal_pdf(x[i], @mean[i], @std[i])
|
43
|
+
(p.nan? or p > 1) ? 1 : p
|
44
|
+
end.reduce(1, :*)
|
38
45
|
end
|
39
46
|
|
40
47
|
def anomaly?(x, epsilon)
|
@@ -45,10 +52,8 @@ module Anomaly
|
|
45
52
|
|
46
53
|
SQRT2PI = Math.sqrt(2*Math::PI)
|
47
54
|
|
48
|
-
# Return 1 (exclude feature) if std ~ 0
|
49
55
|
def normal_pdf(x, mean = 0, std = 1)
|
50
|
-
|
51
|
-
p.nan? ? 1 : p
|
56
|
+
1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2))))
|
52
57
|
end
|
53
58
|
|
54
59
|
# Not used for NArray
|
data/lib/anomaly/version.rb
CHANGED
data/spec/anomaly/detector_spec.rb
CHANGED
@@ -25,15 +25,7 @@ describe Anomaly::Detector do
|
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
|
-
context "when
|
29
|
-
let(:data) { [[0]] }
|
30
|
-
|
31
|
-
it "returns infinity" do
|
32
|
-
ad.probability([0]).should == 1
|
33
|
-
end
|
34
|
-
end
|
35
|
-
|
36
|
-
context "when data is a matrix" do
|
28
|
+
context "when data is an array" do
|
37
29
|
let(:data) { [[-1,-2],[0,0],[1,2]] }
|
38
30
|
let(:sample) { [rand, rand] }
|
39
31
|
|
@@ -43,4 +35,25 @@ describe Anomaly::Detector do
|
|
43
35
|
prob.should == Anomaly::Detector.new(data).probability(sample)
|
44
36
|
end
|
45
37
|
end
|
38
|
+
|
39
|
+
context "when lots of samples" do
|
40
|
+
let(:data) { m.times.map{[0]} }
|
41
|
+
let(:m) { rand(100) + 1 }
|
42
|
+
|
43
|
+
it { ad.samples.should == m }
|
44
|
+
it { ad.trained?.should be_true }
|
45
|
+
end
|
46
|
+
|
47
|
+
context "when no samples" do
|
48
|
+
let(:data) { [] }
|
49
|
+
|
50
|
+
it { ad.samples.should == 0 }
|
51
|
+
it { ad.trained?.should be_false }
|
52
|
+
end
|
53
|
+
|
54
|
+
context "when pdf is greater than 1" do
|
55
|
+
let(:data) { 100.times.map{[0]}.push([1]) }
|
56
|
+
|
57
|
+
it { ad.probability([0]).should == 1 }
|
58
|
+
end
|
46
59
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anomaly
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.2
|
4
|
+
version: 0.0.3
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -13,7 +13,7 @@ date: 2011-12-12 00:00:00.000000000Z
|
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &2160380920 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2160380920
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2160379640 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,7 +32,18 @@ dependencies:
|
|
32
32
|
version: 2.0.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2160379640
|
36
|
+
- !ruby/object:Gem::Dependency
|
37
|
+
name: narray
|
38
|
+
requirement: &2160378180 !ruby/object:Gem::Requirement
|
39
|
+
none: false
|
40
|
+
requirements:
|
41
|
+
- - ! '>='
|
42
|
+
- !ruby/object:Gem::Version
|
43
|
+
version: '0'
|
44
|
+
type: :development
|
45
|
+
prerelease: false
|
46
|
+
version_requirements: *2160378180
|
36
47
|
description: Easy-to-use anomaly detection
|
37
48
|
email:
|
38
49
|
- andrew@getformidable.com
|