anomaly 0.0.3 → 0.1.0
Sign up to get free protection for your applications and to get access to all the features.
- data/README.md +19 -23
- data/Rakefile +11 -3
- data/lib/anomaly/detector.rb +81 -19
- data/lib/anomaly/version.rb +1 -1
- data/spec/anomaly/detector_spec.rb +23 -11
- metadata +15 -9
data/README.md
CHANGED
@@ -16,7 +16,7 @@ And then execute:
|
|
16
16
|
bundle install
|
17
17
|
```
|
18
18
|
|
19
|
-
For max performance (~
|
19
|
+
For max performance (trains ~3x faster for large datasets), also install the NArray gem:
|
20
20
|
|
21
21
|
```ruby
|
22
22
|
gem "narray"
|
@@ -26,47 +26,43 @@ Anomaly will automatically detect it and use it.
|
|
26
26
|
|
27
27
|
## How to Use
|
28
28
|
|
29
|
-
Say we have weather data
|
29
|
+
Say we have weather data and we want to predict if it's sunny. In this example, sunny days are non-anomalies, and days with other types of weather (rain, snow, etc.) are anomalies. The data looks like:
|
30
30
|
|
31
31
|
```ruby
|
32
|
-
#
|
33
|
-
# [temperature (°F), humidity (%), pressure (in)]
|
32
|
+
# [temperature(°F), humidity(%), pressure(in), sunny?(y=0, n=1)]
|
34
33
|
weather_data = [
|
35
|
-
[85, 68, 10.4],
|
36
|
-
[88, 62, 12.1],
|
37
|
-
[86, 64, 13.6],
|
34
|
+
[85, 68, 10.4, 0],
|
35
|
+
[88, 62, 12.1, 0],
|
36
|
+
[86, 64, 13.6, 0],
|
37
|
+
[88, 90, 11.1, 1],
|
38
38
|
...
|
39
39
|
]
|
40
40
|
```
|
41
41
|
|
42
|
-
|
42
|
+
The last column **must** be 0 for non-anomalies, 1 for anomalies. Non-anomalies are used to train the detector, and both anomalies and non-anomalies are used to find the best value of ε.
|
43
|
+
|
44
|
+
To train the detector and test for anomalies, run:
|
43
45
|
|
44
46
|
```ruby
|
45
47
|
ad = Anomaly::Detector.new(weather_data)
|
48
|
+
|
49
|
+
# 85°F, 42% humidity, 12.3 in. pressure
|
50
|
+
ad.anomaly?([85, 42, 12.3])
|
51
|
+
# => true
|
46
52
|
```
|
47
53
|
|
48
|
-
|
54
|
+
Anomaly automatically finds the best value for ε, which you can access with:
|
49
55
|
|
50
56
|
```ruby
|
51
|
-
|
52
|
-
test_sample = [79, 66, 12.3]
|
53
|
-
ad.probability(test_sample)
|
54
|
-
# => 7.537174740907633e-08
|
57
|
+
ad.eps
|
55
58
|
```
|
56
59
|
|
57
|
-
|
58
|
-
|
59
|
-
Probabilities less than ε are considered anomalies. If ε is higher, more things are considered anomalies.
|
60
|
+
If you already know you want ε = 0.01, initialize the detector with:
|
60
61
|
|
61
|
-
```
|
62
|
-
ad.
|
63
|
-
# => false
|
64
|
-
ad.anomaly?(test_sample, 1e-5)
|
65
|
-
# => true
|
62
|
+
```ruby
|
63
|
+
ad = Anomaly::Detector.new(weather_data, {:eps => 0.01})
|
66
64
|
```
|
67
65
|
|
68
|
-
The wiki has [sample code](https://github.com/ankane/anomaly/wiki/Home) to help you find the best ε for your application.
|
69
|
-
|
70
66
|
### Persistence
|
71
67
|
|
72
68
|
You can easily persist the detector to a file or database - it's very tiny.
|
data/Rakefile
CHANGED
@@ -7,11 +7,19 @@ require "benchmark"
|
|
7
7
|
require "anomaly"
|
8
8
|
|
9
9
|
task :benchmark do
|
10
|
-
|
10
|
+
examples = 1_000_000.times.map{ [rand, rand, rand, 0] }
|
11
11
|
|
12
12
|
Benchmark.bm do |x|
|
13
|
-
x.report { Anomaly::Detector.new(
|
13
|
+
x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
|
14
14
|
require "narray"
|
15
|
-
x.report { Anomaly::Detector.new(
|
15
|
+
x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
|
16
16
|
end
|
17
17
|
end
|
18
|
+
|
19
|
+
task :random_examples do
|
20
|
+
examples = 10_000.times.map{ [rand, rand(10), rand(100), 0] } +
|
21
|
+
100.times.map{ [rand + 1, rand(10) + 2, rand(100) + 20, 1] }
|
22
|
+
|
23
|
+
ad = Anomaly::Detector.new(examples)
|
24
|
+
puts ad.eps
|
25
|
+
end
|
data/lib/anomaly/detector.rb
CHANGED
@@ -1,51 +1,80 @@
|
|
1
1
|
module Anomaly
|
2
2
|
class Detector
|
3
|
+
attr_accessor :eps
|
3
4
|
|
4
|
-
def initialize(
|
5
|
+
def initialize(examples = nil, opts = {})
|
5
6
|
@m = 0
|
6
|
-
train(
|
7
|
+
train(examples, opts) if examples
|
7
8
|
end
|
8
9
|
|
9
|
-
def train(
|
10
|
+
def train(examples, opts = {})
|
11
|
+
raise "No examples" if examples.empty?
|
12
|
+
raise "Must have at least two columns" if examples.first.size < 2
|
13
|
+
|
14
|
+
# Divide into groups since we only want to train with non-anomalies.
|
15
|
+
anomalies = []
|
16
|
+
non_anomalies = []
|
17
|
+
examples.each do |example|
|
18
|
+
if example.last == 0
|
19
|
+
non_anomalies << example
|
20
|
+
else
|
21
|
+
anomalies << example
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
raise "Must have at least one non-anomaly" if non_anomalies.empty?
|
26
|
+
|
27
|
+
@eps = (opts[:eps] || 0).to_f
|
28
|
+
if @eps > 0
|
29
|
+
# Use all non-anomalies to train.
|
30
|
+
training_examples = non_anomalies
|
31
|
+
else
|
32
|
+
training_examples, test_examples = partition!(non_anomalies)
|
33
|
+
test_examples.concat(anomalies)
|
34
|
+
end
|
35
|
+
# Remove last column.
|
36
|
+
training_examples = training_examples.map{|e| e[0..-2]}
|
37
|
+
@m = training_examples.size
|
38
|
+
@n = training_examples.first.size
|
39
|
+
|
10
40
|
if defined?(NMatrix)
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
@
|
15
|
-
@std = d.stddev(1).to_a
|
41
|
+
training_examples = NMatrix.to_na(training_examples)
|
42
|
+
# Convert these to an Array for Marshal.dump
|
43
|
+
@mean = training_examples.mean(1).to_a
|
44
|
+
@std = training_examples.stddev(1).to_a
|
16
45
|
else
|
17
46
|
# Default to Array, since built-in Matrix does not give us a big performance advantage.
|
18
|
-
|
19
|
-
@m = d.size
|
20
|
-
@n = d.first ? d.first.size : 0
|
21
|
-
cols = @n.times.map{|i| d.map{|r| r[i]}}
|
47
|
+
cols = @n.times.map{|i| training_examples.map{|r| r[i]}}
|
22
48
|
@mean = cols.map{|c| mean(c)}
|
23
49
|
@std = cols.each_with_index.map{|c,i| std(c, @mean[i])}
|
24
50
|
end
|
25
51
|
@std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std}
|
52
|
+
|
53
|
+
if @eps == 0
|
54
|
+
# Find the best eps.
|
55
|
+
epss = (1..9).map{|i| [1,3,5,7,9].map{|j| (j*10**(-i)).to_f }}.flatten
|
56
|
+
f1_scores = epss.map{|eps| [eps, compute_f1_score(test_examples, eps)] }
|
57
|
+
@eps, best_f1 = f1_scores.max_by{|v| v[1]}
|
58
|
+
end
|
26
59
|
end
|
27
60
|
|
28
61
|
def trained?
|
29
62
|
@m > 0
|
30
63
|
end
|
31
64
|
|
32
|
-
def samples
|
33
|
-
@m
|
34
|
-
end
|
35
|
-
|
36
65
|
# Limit the probability of features to [0,1]
|
37
66
|
# to keep probabilities at same scale.
|
38
67
|
def probability(x)
|
39
68
|
raise "Train me first" unless trained?
|
40
|
-
raise ArgumentError, "
|
69
|
+
raise ArgumentError, "First argument must have #{@n} elements" if x.size != @n
|
41
70
|
@n.times.map do |i|
|
42
71
|
p = normal_pdf(x[i], @mean[i], @std[i])
|
43
72
|
(p.nan? or p > 1) ? 1 : p
|
44
73
|
end.reduce(1, :*)
|
45
74
|
end
|
46
75
|
|
47
|
-
def anomaly?(x,
|
48
|
-
probability(x) <
|
76
|
+
def anomaly?(x, eps = @eps)
|
77
|
+
probability(x) < eps
|
49
78
|
end
|
50
79
|
|
51
80
|
protected
|
@@ -56,6 +85,39 @@ module Anomaly
|
|
56
85
|
1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2))))
|
57
86
|
end
|
58
87
|
|
88
|
+
# Find best eps.
|
89
|
+
|
90
|
+
def partition!(examples, p_last = 0.2)
|
91
|
+
examples.shuffle!
|
92
|
+
n = (examples.size * p_last).floor
|
93
|
+
[examples[n..-1], examples[0...n]]
|
94
|
+
end
|
95
|
+
|
96
|
+
def compute_f1_score(examples, eps)
|
97
|
+
tp = 0
|
98
|
+
fp = 0
|
99
|
+
fn = 0
|
100
|
+
examples.each do |example|
|
101
|
+
act = example.last != 0
|
102
|
+
pred = self.anomaly?(example[0..-2], eps)
|
103
|
+
if act and pred
|
104
|
+
tp += 1
|
105
|
+
elsif pred # and !act
|
106
|
+
fp += 1
|
107
|
+
elsif act # and !pred
|
108
|
+
fn += 1
|
109
|
+
end
|
110
|
+
end
|
111
|
+
f1_score(tp, fp, fn)
|
112
|
+
end
|
113
|
+
|
114
|
+
def f1_score(tp, fp, fn)
|
115
|
+
precision = tp / (tp + fp).to_f
|
116
|
+
recall = tp / (tp + fn).to_f
|
117
|
+
score = 2.0 * precision * recall / (precision + recall)
|
118
|
+
score.nan? ? 0.0 : score
|
119
|
+
end
|
120
|
+
|
59
121
|
# Not used for NArray
|
60
122
|
|
61
123
|
def mean(x)
|
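data/lib/anomaly/version.rb
CHANGED
[The diff body for this file is missing from this extraction; per the title, the version changes from 0.0.3 to 0.1.0.]
data/spec/anomaly/detector_spec.rb
CHANGED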
@@ -1,8 +1,8 @@
|
|
1
1
|
require "spec_helper"
|
2
2
|
|
3
3
|
describe Anomaly::Detector do
|
4
|
-
let(:
|
5
|
-
let(:ad) { Anomaly::Detector.new(
|
4
|
+
let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
|
5
|
+
let(:ad) { Anomaly::Detector.new(examples) }
|
6
6
|
|
7
7
|
# mean = [0, 0], std = [1, 2]
|
8
8
|
it "computes the right probability" do
|
@@ -14,7 +14,7 @@ describe Anomaly::Detector do
|
|
14
14
|
end
|
15
15
|
|
16
16
|
context "when standard deviation is 0" do
|
17
|
-
let(:
|
17
|
+
let(:examples) { [[0,0],[0,0]] }
|
18
18
|
|
19
19
|
it "returns infinity for mean" do
|
20
20
|
ad.probability([0]).should == 1
|
@@ -25,35 +25,47 @@ describe Anomaly::Detector do
|
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
|
-
context "when
|
29
|
-
let(:
|
28
|
+
context "when examples is an array" do
|
29
|
+
let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
|
30
30
|
let(:sample) { [rand, rand] }
|
31
31
|
|
32
32
|
it "returns the same probability as an NMatrix" do
|
33
33
|
prob = ad.probability(sample)
|
34
34
|
Object.send(:remove_const, :NMatrix)
|
35
|
-
prob.should == Anomaly::Detector.new(
|
35
|
+
prob.should == Anomaly::Detector.new(examples).probability(sample)
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
39
|
context "when lots of samples" do
|
40
|
-
let(:
|
40
|
+
let(:examples) { m.times.map{[0,0]} }
|
41
41
|
let(:m) { rand(100) + 1 }
|
42
42
|
|
43
|
-
it { ad.samples.should == m }
|
44
43
|
it { ad.trained?.should be_true }
|
45
44
|
end
|
46
45
|
|
47
46
|
context "when no samples" do
|
48
|
-
let(:
|
47
|
+
let(:examples) { nil }
|
49
48
|
|
50
|
-
it { ad.samples.should == 0 }
|
51
49
|
it { ad.trained?.should be_false }
|
52
50
|
end
|
53
51
|
|
54
52
|
context "when pdf is greater than 1" do
|
55
|
-
let(:
|
53
|
+
let(:examples) { 100.times.map{[0,0]}.push([1,0]) }
|
56
54
|
|
57
55
|
it { ad.probability([0]).should == 1 }
|
58
56
|
end
|
57
|
+
|
58
|
+
context "when only anomalies" do
|
59
|
+
let(:examples) { [[0,1]] }
|
60
|
+
|
61
|
+
it "raises error" do
|
62
|
+
expect{ ad }.to raise_error RuntimeError, "Must have at least one non-anomaly"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
context "when only one non-anomaly" do
|
67
|
+
let(:examples) { [[0,0]] }
|
68
|
+
|
69
|
+
it { ad.eps.should == 1e-1 }
|
70
|
+
end
|
59
71
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anomaly
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0.3
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-12-
|
12
|
+
date: 2011-12-19 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &2155813680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2155813680
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2155813180 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.0.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2155813180
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: narray
|
38
|
-
requirement: &
|
38
|
+
requirement: &2155812760 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2155812760
|
47
47
|
description: Easy-to-use anomaly detection
|
48
48
|
email:
|
49
49
|
- andrew@getformidable.com
|
@@ -75,15 +75,21 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
+
segments:
|
79
|
+
- 0
|
80
|
+
hash: 1886385059125072633
|
78
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
82
|
none: false
|
80
83
|
requirements:
|
81
84
|
- - ! '>='
|
82
85
|
- !ruby/object:Gem::Version
|
83
86
|
version: '0'
|
87
|
+
segments:
|
88
|
+
- 0
|
89
|
+
hash: 1886385059125072633
|
84
90
|
requirements: []
|
85
91
|
rubyforge_project:
|
86
|
-
rubygems_version: 1.8.
|
92
|
+
rubygems_version: 1.8.11
|
87
93
|
signing_key:
|
88
94
|
specification_version: 3
|
89
95
|
summary: Easy-to-use anomaly detection
|