anomaly 0.0.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/README.md +19 -23
- data/Rakefile +11 -3
- data/lib/anomaly/detector.rb +81 -19
- data/lib/anomaly/version.rb +1 -1
- data/spec/anomaly/detector_spec.rb +23 -11
- metadata +15 -9
data/README.md
CHANGED
@@ -16,7 +16,7 @@ And then execute:
|
|
16
16
|
bundle install
|
17
17
|
```
|
18
18
|
|
19
|
-
For max performance (~
|
19
|
+
For max performance (trains ~3x faster for large datasets), also install the NArray gem:
|
20
20
|
|
21
21
|
```ruby
|
22
22
|
gem "narray"
|
@@ -26,47 +26,43 @@ Anomaly will automatically detect it and use it.
|
|
26
26
|
|
27
27
|
## How to Use
|
28
28
|
|
29
|
-
Say we have weather data
|
29
|
+
Say we have weather data and we want to predict if it's sunny. In this example, sunny days are non-anomalies, and days with other types of weather (rain, snow, etc.) are anomalies. The data looks like:
|
30
30
|
|
31
31
|
```ruby
|
32
|
-
#
|
33
|
-
# [temperature (°F), humidity (%), pressure (in)]
|
32
|
+
# [temperature(°F), humidity(%), pressure(in), sunny?(y=0, n=1)]
|
34
33
|
weather_data = [
|
35
|
-
[85, 68, 10.4],
|
36
|
-
[88, 62, 12.1],
|
37
|
-
[86, 64, 13.6],
|
34
|
+
[85, 68, 10.4, 0],
|
35
|
+
[88, 62, 12.1, 0],
|
36
|
+
[86, 64, 13.6, 0],
|
37
|
+
[88, 90, 11.1, 1],
|
38
38
|
...
|
39
39
|
]
|
40
40
|
```
|
41
41
|
|
42
|
-
|
42
|
+
The last column **must** be 0 for non-anomalies and 1 for anomalies. Non-anomalies are used to train the detector, and both anomalies and non-anomalies are used to find the best value of ε.
|
43
|
+
|
44
|
+
To train the detector and test for anomalies, run:
|
43
45
|
|
44
46
|
```ruby
|
45
47
|
ad = Anomaly::Detector.new(weather_data)
|
48
|
+
|
49
|
+
# 85°F, 42% humidity, 12.3 in. pressure
|
50
|
+
ad.anomaly?([85, 42, 12.3])
|
51
|
+
# => true
|
46
52
|
```
|
47
53
|
|
48
|
-
|
54
|
+
Anomaly automatically finds the best value for ε, which you can access with:
|
49
55
|
|
50
56
|
```ruby
|
51
|
-
|
52
|
-
test_sample = [79, 66, 12.3]
|
53
|
-
ad.probability(test_sample)
|
54
|
-
# => 7.537174740907633e-08
|
57
|
+
ad.eps
|
55
58
|
```
|
56
59
|
|
57
|
-
|
58
|
-
|
59
|
-
Probabilities less than ε are considered anomalies. If ε is higher, more things are considered anomalies.
|
60
|
+
If you already know you want ε = 0.01, initialize the detector with:
|
60
61
|
|
61
|
-
```
|
62
|
-
ad.
|
63
|
-
# => false
|
64
|
-
ad.anomaly?(test_sample, 1e-5)
|
65
|
-
# => true
|
62
|
+
```ruby
|
63
|
+
ad = Anomaly::Detector.new(weather_data, {:eps => 0.01})
|
66
64
|
```
|
67
65
|
|
68
|
-
The wiki has [sample code](https://github.com/ankane/anomaly/wiki/Home) to help you find the best ε for your application.
|
69
|
-
|
70
66
|
### Persistence
|
71
67
|
|
72
68
|
You can easily persist the detector to a file or database - it's very tiny.
|
data/Rakefile
CHANGED
@@ -7,11 +7,19 @@ require "benchmark"
|
|
7
7
|
require "anomaly"
|
8
8
|
|
9
9
|
task :benchmark do
|
10
|
-
|
10
|
+
examples = 1_000_000.times.map{ [rand, rand, rand, 0] }
|
11
11
|
|
12
12
|
Benchmark.bm do |x|
|
13
|
-
x.report { Anomaly::Detector.new(
|
13
|
+
x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
|
14
14
|
require "narray"
|
15
|
-
x.report { Anomaly::Detector.new(
|
15
|
+
x.report { Anomaly::Detector.new(examples, {:eps => 0.5}) }
|
16
16
|
end
|
17
17
|
end
|
18
|
+
|
19
|
+
task :random_examples do
|
20
|
+
examples = 10_000.times.map{ [rand, rand(10), rand(100), 0] } +
|
21
|
+
100.times.map{ [rand + 1, rand(10) + 2, rand(100) + 20, 1] }
|
22
|
+
|
23
|
+
ad = Anomaly::Detector.new(examples)
|
24
|
+
puts ad.eps
|
25
|
+
end
|
data/lib/anomaly/detector.rb
CHANGED
@@ -1,51 +1,80 @@
|
|
1
1
|
module Anomaly
|
2
2
|
class Detector
|
3
|
+
attr_accessor :eps
|
3
4
|
|
4
|
-
def initialize(
|
5
|
+
def initialize(examples = nil, opts = {})
|
5
6
|
@m = 0
|
6
|
-
train(
|
7
|
+
train(examples, opts) if examples
|
7
8
|
end
|
8
9
|
|
9
|
-
def train(
|
10
|
+
def train(examples, opts = {})
|
11
|
+
raise "No examples" if examples.empty?
|
12
|
+
raise "Must have at least two columns" if examples.first.size < 2
|
13
|
+
|
14
|
+
# Divide into groups since we only want to train with non-anomalies.
|
15
|
+
anomalies = []
|
16
|
+
non_anomalies = []
|
17
|
+
examples.each do |example|
|
18
|
+
if example.last == 0
|
19
|
+
non_anomalies << example
|
20
|
+
else
|
21
|
+
anomalies << example
|
22
|
+
end
|
23
|
+
end
|
24
|
+
|
25
|
+
raise "Must have at least one non-anomaly" if non_anomalies.empty?
|
26
|
+
|
27
|
+
@eps = (opts[:eps] || 0).to_f
|
28
|
+
if @eps > 0
|
29
|
+
# Use all non-anomalies to train.
|
30
|
+
training_examples = non_anomalies
|
31
|
+
else
|
32
|
+
training_examples, test_examples = partition!(non_anomalies)
|
33
|
+
test_examples.concat(anomalies)
|
34
|
+
end
|
35
|
+
# Remove last column.
|
36
|
+
training_examples = training_examples.map{|e| e[0..-2]}
|
37
|
+
@m = training_examples.size
|
38
|
+
@n = training_examples.first.size
|
39
|
+
|
10
40
|
if defined?(NMatrix)
|
11
|
-
|
12
|
-
|
13
|
-
|
14
|
-
@
|
15
|
-
@std = d.stddev(1).to_a
|
41
|
+
training_examples = NMatrix.to_na(training_examples)
|
42
|
+
# Convert these to an Array for Marshal.dump
|
43
|
+
@mean = training_examples.mean(1).to_a
|
44
|
+
@std = training_examples.stddev(1).to_a
|
16
45
|
else
|
17
46
|
# Default to Array, since built-in Matrix does not give us a big performance advantage.
|
18
|
-
|
19
|
-
@m = d.size
|
20
|
-
@n = d.first ? d.first.size : 0
|
21
|
-
cols = @n.times.map{|i| d.map{|r| r[i]}}
|
47
|
+
cols = @n.times.map{|i| training_examples.map{|r| r[i]}}
|
22
48
|
@mean = cols.map{|c| mean(c)}
|
23
49
|
@std = cols.each_with_index.map{|c,i| std(c, @mean[i])}
|
24
50
|
end
|
25
51
|
@std.map!{|std| (std == 0 or std.nan?) ? Float::MIN : std}
|
52
|
+
|
53
|
+
if @eps == 0
|
54
|
+
# Find the best eps.
|
55
|
+
epss = (1..9).map{|i| [1,3,5,7,9].map{|j| (j*10**(-i)).to_f }}.flatten
|
56
|
+
f1_scores = epss.map{|eps| [eps, compute_f1_score(test_examples, eps)] }
|
57
|
+
@eps, best_f1 = f1_scores.max_by{|v| v[1]}
|
58
|
+
end
|
26
59
|
end
|
27
60
|
|
28
61
|
def trained?
|
29
62
|
@m > 0
|
30
63
|
end
|
31
64
|
|
32
|
-
def samples
|
33
|
-
@m
|
34
|
-
end
|
35
|
-
|
36
65
|
# Limit the probability of features to [0,1]
|
37
66
|
# to keep probabilities at same scale.
|
38
67
|
def probability(x)
|
39
68
|
raise "Train me first" unless trained?
|
40
|
-
raise ArgumentError, "
|
69
|
+
raise ArgumentError, "First argument must have #{@n} elements" if x.size != @n
|
41
70
|
@n.times.map do |i|
|
42
71
|
p = normal_pdf(x[i], @mean[i], @std[i])
|
43
72
|
(p.nan? or p > 1) ? 1 : p
|
44
73
|
end.reduce(1, :*)
|
45
74
|
end
|
46
75
|
|
47
|
-
def anomaly?(x,
|
48
|
-
probability(x) <
|
76
|
+
def anomaly?(x, eps = @eps)
|
77
|
+
probability(x) < eps
|
49
78
|
end
|
50
79
|
|
51
80
|
protected
|
@@ -56,6 +85,39 @@ module Anomaly
|
|
56
85
|
1/(SQRT2PI*std)*Math.exp(-((x - mean)**2/(2.0*(std**2))))
|
57
86
|
end
|
58
87
|
|
88
|
+
# Helpers for finding the best eps.
|
89
|
+
|
90
|
+
def partition!(examples, p_last = 0.2)
|
91
|
+
examples.shuffle!
|
92
|
+
n = (examples.size * p_last).floor
|
93
|
+
[examples[n..-1], examples[0...n]]
|
94
|
+
end
|
95
|
+
|
96
|
+
def compute_f1_score(examples, eps)
|
97
|
+
tp = 0
|
98
|
+
fp = 0
|
99
|
+
fn = 0
|
100
|
+
examples.each do |example|
|
101
|
+
act = example.last != 0
|
102
|
+
pred = self.anomaly?(example[0..-2], eps)
|
103
|
+
if act and pred
|
104
|
+
tp += 1
|
105
|
+
elsif pred # and !act
|
106
|
+
fp += 1
|
107
|
+
elsif act # and !pred
|
108
|
+
fn += 1
|
109
|
+
end
|
110
|
+
end
|
111
|
+
f1_score(tp, fp, fn)
|
112
|
+
end
|
113
|
+
|
114
|
+
def f1_score(tp, fp, fn)
|
115
|
+
precision = tp / (tp + fp).to_f
|
116
|
+
recall = tp / (tp + fn).to_f
|
117
|
+
score = 2.0 * precision * recall / (precision + recall)
|
118
|
+
score.nan? ? 0.0 : score
|
119
|
+
end
|
120
|
+
|
59
121
|
# Not used for NArray
|
60
122
|
|
61
123
|
def mean(x)
|
data/lib/anomaly/version.rb
CHANGED
data/spec/anomaly/detector_spec.rb
CHANGED
@@ -1,8 +1,8 @@
|
|
1
1
|
require "spec_helper"
|
2
2
|
|
3
3
|
describe Anomaly::Detector do
|
4
|
-
let(:
|
5
|
-
let(:ad) { Anomaly::Detector.new(
|
4
|
+
let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
|
5
|
+
let(:ad) { Anomaly::Detector.new(examples) }
|
6
6
|
|
7
7
|
# mean = [0, 0], std = [1, 2]
|
8
8
|
it "computes the right probability" do
|
@@ -14,7 +14,7 @@ describe Anomaly::Detector do
|
|
14
14
|
end
|
15
15
|
|
16
16
|
context "when standard deviation is 0" do
|
17
|
-
let(:
|
17
|
+
let(:examples) { [[0,0],[0,0]] }
|
18
18
|
|
19
19
|
it "returns infinity for mean" do
|
20
20
|
ad.probability([0]).should == 1
|
@@ -25,35 +25,47 @@ describe Anomaly::Detector do
|
|
25
25
|
end
|
26
26
|
end
|
27
27
|
|
28
|
-
context "when
|
29
|
-
let(:
|
28
|
+
context "when examples is an array" do
|
29
|
+
let(:examples) { [[-1,-2,0],[0,0,0],[1,2,0]] }
|
30
30
|
let(:sample) { [rand, rand] }
|
31
31
|
|
32
32
|
it "returns the same probability as an NMatrix" do
|
33
33
|
prob = ad.probability(sample)
|
34
34
|
Object.send(:remove_const, :NMatrix)
|
35
|
-
prob.should == Anomaly::Detector.new(
|
35
|
+
prob.should == Anomaly::Detector.new(examples).probability(sample)
|
36
36
|
end
|
37
37
|
end
|
38
38
|
|
39
39
|
context "when lots of samples" do
|
40
|
-
let(:
|
40
|
+
let(:examples) { m.times.map{[0,0]} }
|
41
41
|
let(:m) { rand(100) + 1 }
|
42
42
|
|
43
|
-
it { ad.samples.should == m }
|
44
43
|
it { ad.trained?.should be_true }
|
45
44
|
end
|
46
45
|
|
47
46
|
context "when no samples" do
|
48
|
-
let(:
|
47
|
+
let(:examples) { nil }
|
49
48
|
|
50
|
-
it { ad.samples.should == 0 }
|
51
49
|
it { ad.trained?.should be_false }
|
52
50
|
end
|
53
51
|
|
54
52
|
context "when pdf is greater than 1" do
|
55
|
-
let(:
|
53
|
+
let(:examples) { 100.times.map{[0,0]}.push([1,0]) }
|
56
54
|
|
57
55
|
it { ad.probability([0]).should == 1 }
|
58
56
|
end
|
57
|
+
|
58
|
+
context "when only anomalies" do
|
59
|
+
let(:examples) { [[0,1]] }
|
60
|
+
|
61
|
+
it "raises error" do
|
62
|
+
expect{ ad }.to raise_error RuntimeError, "Must have at least one non-anomaly"
|
63
|
+
end
|
64
|
+
end
|
65
|
+
|
66
|
+
context "when only one non-anomaly" do
|
67
|
+
let(:examples) { [[0,0]] }
|
68
|
+
|
69
|
+
it { ad.eps.should == 1e-1 }
|
70
|
+
end
|
59
71
|
end
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: anomaly
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.0
|
4
|
+
version: 0.1.0
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2011-12-
|
12
|
+
date: 2011-12-19 00:00:00.000000000Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: rake
|
16
|
-
requirement: &
|
16
|
+
requirement: &2155813680 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ! '>='
|
@@ -21,10 +21,10 @@ dependencies:
|
|
21
21
|
version: '0'
|
22
22
|
type: :development
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *2155813680
|
25
25
|
- !ruby/object:Gem::Dependency
|
26
26
|
name: rspec
|
27
|
-
requirement: &
|
27
|
+
requirement: &2155813180 !ruby/object:Gem::Requirement
|
28
28
|
none: false
|
29
29
|
requirements:
|
30
30
|
- - ! '>='
|
@@ -32,10 +32,10 @@ dependencies:
|
|
32
32
|
version: 2.0.0
|
33
33
|
type: :development
|
34
34
|
prerelease: false
|
35
|
-
version_requirements: *
|
35
|
+
version_requirements: *2155813180
|
36
36
|
- !ruby/object:Gem::Dependency
|
37
37
|
name: narray
|
38
|
-
requirement: &
|
38
|
+
requirement: &2155812760 !ruby/object:Gem::Requirement
|
39
39
|
none: false
|
40
40
|
requirements:
|
41
41
|
- - ! '>='
|
@@ -43,7 +43,7 @@ dependencies:
|
|
43
43
|
version: '0'
|
44
44
|
type: :development
|
45
45
|
prerelease: false
|
46
|
-
version_requirements: *
|
46
|
+
version_requirements: *2155812760
|
47
47
|
description: Easy-to-use anomaly detection
|
48
48
|
email:
|
49
49
|
- andrew@getformidable.com
|
@@ -75,15 +75,21 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
75
75
|
- - ! '>='
|
76
76
|
- !ruby/object:Gem::Version
|
77
77
|
version: '0'
|
78
|
+
segments:
|
79
|
+
- 0
|
80
|
+
hash: 1886385059125072633
|
78
81
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
79
82
|
none: false
|
80
83
|
requirements:
|
81
84
|
- - ! '>='
|
82
85
|
- !ruby/object:Gem::Version
|
83
86
|
version: '0'
|
87
|
+
segments:
|
88
|
+
- 0
|
89
|
+
hash: 1886385059125072633
|
84
90
|
requirements: []
|
85
91
|
rubyforge_project:
|
86
|
-
rubygems_version: 1.8.
|
92
|
+
rubygems_version: 1.8.11
|
87
93
|
signing_key:
|
88
94
|
specification_version: 3
|
89
95
|
summary: Easy-to-use anomaly detection
|