network_resiliency 0.7.10 → 0.7.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/Gemfile.lock +1 -1
- data/README.md +5 -0
- data/lib/network_resiliency/power_stats.rb +132 -0
- data/lib/network_resiliency/stats.rb +15 -7
- data/lib/network_resiliency/version.rb +1 -1
- data/lib/network_resiliency.rb +27 -24
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 93c15344f1a3f02c3ef567baeb1650745b27286a92db1975368a2066e29df423
|
4
|
+
data.tar.gz: '092ee5a69a9f6b3e2e299946404c83353bb142e7d6d596d7fe4d556640500c5f'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 9a84c796661456dd272fa052647ec99991956f5d69435a8dd046a93634de0bbfdc5100cd7cb9b8dfb54d3641d18c71cb5b0728c70448ed6de40fa2bd181eb02e
|
7
|
+
data.tar.gz: d9487add703cd5c2ef71abbed8d913619338e240faceaaeac8621c58dbfe2af6203029328f37875f75315d94f91498a6283e252f3c3703fefd12b546e46745c1
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,13 @@
|
|
1
|
+
### v0.7.12 (2024-02-23)
|
2
|
+
- improve metric sampling
|
3
|
+
- lower resiliency threshold
|
4
|
+
- power buckets
|
5
|
+
|
6
|
+
### v0.7.11 (2024-02-20)
|
7
|
+
- lower dynamic timeout
|
8
|
+
- fix metric specs
|
9
|
+
- extend cache ttl
|
10
|
+
|
1
11
|
### v0.7.10 (2024-02-15)
|
2
12
|
- ddog sampling
|
3
13
|
- reconnect redis
|
data/Gemfile.lock
CHANGED
data/README.md
CHANGED
@@ -39,3 +39,8 @@ Yes please :)
|
|
39
39
|
https://github.com/lostisland/faraday-retry/blob/main/lib/faraday/retry/middleware.rb
|
40
40
|
|
41
41
|
https://github.com/ankane/the-ultimate-guide-to-ruby-timeouts
|
42
|
+
|
43
|
+
|
44
|
+
https://reprep.io/writings/20220326_timeouts_deadline_propagation.html
|
45
|
+
|
46
|
+
https://grpc.io/blog/deadlines
|
data/lib/network_resiliency/power_stats.rb
ADDED
@@ -0,0 +1,132 @@
|
|
1
|
+
require "network_resiliency/refinements"
|
2
|
+
|
3
|
+
using NetworkResiliency::Refinements
|
4
|
+
|
5
|
+
module NetworkResiliency
  # Approximate histogram of numeric samples using power-of-ten buckets.
  #
  # Every recorded value is clamped to at least MIN_VALUE and counted in
  # bucket +ceil(log10(value))+: bucket 0 holds values <= 1 and bucket i
  # (i >= 1) holds values in the range (10**(i-1), 10**i]. Percentiles are
  # consequently reported as powers of ten.
  #
  # Methods declared with the +synchronize+ macro run while holding the
  # instance's own mutex.
  class PowerStats
    MIN_VALUE = 1
    LOCK = Thread::Mutex.new # guards STATS
    STATS = {} # process-wide registry of named instances, see PowerStats.[]

    # @return [Integer] number of samples currently counted
    attr_reader :n

    class << self
      # Fetches the shared instance registered under +key+, creating it on
      # first use.
      #
      # @param key [Object] registry key
      # @return [PowerStats]
      def [](key)
        LOCK.synchronize { STATS[key] ||= new }
      end

      # Drops every instance from the shared registry.
      def reset
        LOCK.synchronize { STATS.clear }
      end

      private

      # Method decorator: redefines the instance method +fn_name+ so that its
      # original body executes while holding the instance's @lock.
      #
      # @param fn_name [Symbol] name of an already defined instance method
      def synchronize(fn_name)
        fn = instance_method(fn_name)

        define_method(fn_name) do |*args|
          @lock.synchronize { fn.bind(self).call(*args) }
        end
      end
    end

    # @param values [Array<Numeric>] initial samples to record
    def initialize(values = [])
      @lock = Thread::Mutex.new
      reset

      values.each { |x| add(x) }
    end

    # Invoked by #dup / #clone. The default shallow copy would share both the
    # bucket array and the mutex with the source, so mutating the copy (as
    # #merge does) would silently alter the source's counts. Give the copy
    # its own bucket array and its own lock instead.
    #
    # @param source [PowerStats] the object being copied
    def initialize_copy(source)
      super

      # @lock still refers to the source's mutex here; hold it while taking
      # the snapshot so that @buckets and @n are copied consistently.
      @lock.synchronize do
        @buckets = @buckets.dup
        @n = source.n
      end

      @lock = Thread::Mutex.new
    end

    # Records a single value, an Array of values, or merges in another
    # PowerStats.
    #
    # @param value [Numeric, Array<Numeric>, PowerStats]
    # @return [self]
    # @raise [ArgumentError] if a recorded value is not Numeric
    def <<(value)
      case value
      when Array
        value.each { |x| add(x) }
      when self.class
        merge!(value)
      else
        add(value)
      end

      self
    end

    # Records one sample.
    #
    # @param value [Numeric] sample; values below MIN_VALUE count as MIN_VALUE
    # @return [Integer] the updated sample count
    # @raise [ArgumentError] if +value+ is not Numeric
    synchronize def add(value)
      raise ArgumentError, "Numeric expected, found #{value.class}" unless value.is_a?(Numeric)

      # clamp so that log10 is never negative and can be used as an array index
      value = [ value, MIN_VALUE ].max
      i = Math.log10(value).ceil

      @buckets[i] ||= 0
      @buckets[i] += 1
      @n += 1
    end

    # Estimates the +p+-th percentile as the upper bound (a power of ten) of
    # the bucket it falls into.
    #
    # @param p [Numeric] percentile, between 0 and 100
    # @return [Numeric] 0 when no samples have been recorded, otherwise
    #   10 ** bucket index
    # @raise [ArgumentError] if +p+ is outside 0..100
    synchronize def percentile(p)
      raise ArgumentError, "Percentile must be between 0 and 100" unless p.between?(0, 100)

      return 0 if @n == 0

      # number of samples permitted to lie above the requested percentile
      threshold = ((100 - p) / 100.0 * @n).floor
      index = @buckets.size - 1

      # walk down from the largest bucket, consuming the allowance until a
      # bucket holds at least as many samples as remain in it
      while index >= 0
        if @buckets[index]
          break if @buckets[index] >= threshold

          threshold -= @buckets[index]
        end

        index -= 1
      end

      10 ** index
    end
    alias_method :p, :percentile

    # @return [Numeric] shorthand for percentile(99)
    def p99
      percentile(99)
    end

    # Non-destructive merge: returns a new PowerStats combining this one and
    # +other+, leaving both operands untouched.
    #
    # @param other [PowerStats]
    # @return [PowerStats]
    # @raise [ArgumentError] if +other+ is not a PowerStats
    def merge(other)
      dup.merge!(other)
    end
    alias_method :+, :merge

    # Adds the bucket counts of +other+ into this instance.
    #
    # @param other [PowerStats]
    # @return [self]
    # @raise [ArgumentError] if +other+ is not a PowerStats
    synchronize def merge!(other)
      raise ArgumentError unless other.is_a?(self.class)

      # NOTE: other's state is read without taking other's lock
      other_buckets = other.instance_variable_get(:@buckets)

      if @n == 0
        @n = other.n
        @buckets = other_buckets.dup
      elsif other.n > 0
        @n += other.n

        other_buckets.each_with_index do |count, i|
          next unless count

          @buckets[i] ||= 0
          @buckets[i] += count
        end
      end

      self
    end

    # Scales every bucket count to +percentage+ percent of its current value
    # (rounded) and recomputes the sample count from the scaled buckets.
    #
    # @param percentage [Numeric] between 0 and 100
    # @return [Integer] the new sample count
    # @raise [ArgumentError] if +percentage+ is not Numeric or outside 0..100
    synchronize def scale!(percentage)
      raise ArgumentError, "Numeric expected, found #{percentage.class}" unless percentage.is_a?(Numeric)
      raise ArgumentError, "argument must be between 0 and 100" unless percentage.between?(0, 100)

      factor = percentage / 100.0

      @buckets.map! { |x| (x * factor).round if x }
      @n = @buckets.compact.sum
    end

    # Discards all recorded samples.
    synchronize def reset
      @n = 0
      @buckets = []
    end
  end
end
|
data/lib/network_resiliency/stats.rb
CHANGED
@@ -80,6 +80,16 @@ module NetworkResiliency
|
|
80
80
|
self
|
81
81
|
end
|
82
82
|
|
83
|
+
# Scales these stats down to +percentage+ percent: the accumulated squared
# distance is multiplied by percentage / 100 and the sample count is scaled
# by the same ratio, then rounded.
#
# @param percentage [Numeric] between 0 and 100
# @return [Integer] the new sample count
# @raise [ArgumentError] if +percentage+ is not Numeric or outside 0..100
synchronize def scale!(percentage)
  unless percentage.is_a?(Numeric)
    raise ArgumentError, "Numeric expected, found #{percentage.class}"
  end

  unless percentage.between?(0, 100)
    raise ArgumentError, "argument must be between 0 and 100"
  end

  ratio = percentage / 100.0

  @sq_dist *= ratio
  @n = (@n * ratio).round
end
|
92
|
+
|
83
93
|
def ==(other)
|
84
94
|
return false unless other.is_a?(self.class)
|
85
95
|
|
@@ -94,10 +104,10 @@ module NetworkResiliency
|
|
94
104
|
@sq_dist = 0.0 # sum of squared distance from mean
|
95
105
|
end
|
96
106
|
|
97
|
-
MIN_SAMPLE_SIZE =
|
107
|
+
MIN_SAMPLE_SIZE = 300
|
98
108
|
MAX_WINDOW_LENGTH = 1000
|
99
109
|
STATS_TTL = 24 * 60 * 60 # 1 day
|
100
|
-
CACHE_TTL =
|
110
|
+
CACHE_TTL = 120 # seconds
|
101
111
|
|
102
112
|
LUA_SCRIPT = <<~LUA
|
103
113
|
local results = {}
|
@@ -189,11 +199,9 @@ module NetworkResiliency
|
|
189
199
|
end
|
190
200
|
|
191
201
|
res = redis.eval(LUA_SCRIPT, keys, args)
|
192
|
-
data.keys.zip(res.each_slice(3)).
|
193
|
-
n, avg, sq_dist
|
194
|
-
|
195
|
-
[ key, Stats.from(n: n, avg: avg, sq_dist: sq_dist) ]
|
196
|
-
end.to_h
|
202
|
+
data.keys.zip(res.each_slice(3)).to_h.transform_values! do |n, avg, sq_dist|
|
203
|
+
Stats.from(n: n, avg: avg, sq_dist: sq_dist)
|
204
|
+
end
|
197
205
|
end
|
198
206
|
|
199
207
|
def self.fetch(redis, keys)
|
data/lib/network_resiliency.rb
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
require "network_resiliency/power_stats"
|
1
2
|
require "network_resiliency/refinements"
|
2
3
|
require "network_resiliency/stats"
|
3
4
|
require "network_resiliency/stats_engine"
|
@@ -19,7 +20,7 @@ module NetworkResiliency
|
|
19
20
|
ACTIONS = [ :connect, :request ].freeze
|
20
21
|
ADAPTERS = [ :http, :faraday, :redis, :mysql, :postgres, :rails ].freeze
|
21
22
|
MODE = [ :observe, :resilient ].freeze
|
22
|
-
RESILIENCY_SIZE_THRESHOLD =
|
23
|
+
RESILIENCY_SIZE_THRESHOLD = 300
|
23
24
|
SAMPLE_RATE = {
|
24
25
|
timeout: 0.1,
|
25
26
|
stats: 0.1,
|
@@ -266,26 +267,28 @@ module NetworkResiliency
|
|
266
267
|
# ensure Syncer is running
|
267
268
|
Syncer.start
|
268
269
|
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
274
|
-
|
270
|
+
if rand < SAMPLE_RATE[:stats]
|
271
|
+
NetworkResiliency.statsd&.distribution(
|
272
|
+
"network_resiliency.#{action}.stats.n",
|
273
|
+
stats.n,
|
274
|
+
tags: tags,
|
275
|
+
sample_rate: SAMPLE_RATE[:stats],
|
276
|
+
)
|
275
277
|
|
276
|
-
|
277
|
-
|
278
|
-
|
279
|
-
|
280
|
-
|
281
|
-
|
278
|
+
NetworkResiliency.statsd&.distribution(
|
279
|
+
"network_resiliency.#{action}.stats.avg",
|
280
|
+
stats.avg,
|
281
|
+
tags: tags,
|
282
|
+
sample_rate: SAMPLE_RATE[:stats],
|
283
|
+
)
|
282
284
|
|
283
|
-
|
284
|
-
|
285
|
-
|
286
|
-
|
287
|
-
|
288
|
-
|
285
|
+
NetworkResiliency.statsd&.distribution(
|
286
|
+
"network_resiliency.#{action}.stats.stdev",
|
287
|
+
stats.stdev,
|
288
|
+
tags: tags,
|
289
|
+
sample_rate: SAMPLE_RATE[:stats],
|
290
|
+
)
|
291
|
+
end
|
289
292
|
end
|
290
293
|
|
291
294
|
nil
|
@@ -316,7 +319,7 @@ module NetworkResiliency
|
|
316
319
|
destination: destination,
|
317
320
|
}
|
318
321
|
|
319
|
-
p99 = (stats.avg + stats.stdev *
|
322
|
+
p99 = (stats.avg + stats.stdev * 2).order_of_magnitude(ceil: true)
|
320
323
|
|
321
324
|
timeouts = []
|
322
325
|
|
@@ -337,7 +340,7 @@ module NetworkResiliency
|
|
337
340
|
"network_resiliency.timeout.raised",
|
338
341
|
tags: tags,
|
339
342
|
sample_rate: SAMPLE_RATE[:timeout],
|
340
|
-
)
|
343
|
+
) if rand < SAMPLE_RATE[:timeout]
|
341
344
|
end
|
342
345
|
else
|
343
346
|
# the specified timeout is less than our expected p99...awkward
|
@@ -347,7 +350,7 @@ module NetworkResiliency
|
|
347
350
|
"network_resiliency.timeout.too_low",
|
348
351
|
tags: tags,
|
349
352
|
sample_rate: SAMPLE_RATE[:timeout],
|
350
|
-
)
|
353
|
+
) if rand < SAMPLE_RATE[:timeout]
|
351
354
|
end
|
352
355
|
else
|
353
356
|
timeouts << p99
|
@@ -361,7 +364,7 @@ module NetworkResiliency
|
|
361
364
|
"network_resiliency.timeout.missing",
|
362
365
|
tags: tags,
|
363
366
|
sample_rate: SAMPLE_RATE[:timeout],
|
364
|
-
)
|
367
|
+
) if rand < SAMPLE_RATE[:timeout]
|
365
368
|
end
|
366
369
|
|
367
370
|
NetworkResiliency.statsd&.distribution(
|
@@ -372,7 +375,7 @@ module NetworkResiliency
|
|
372
375
|
destination: destination,
|
373
376
|
},
|
374
377
|
sample_rate: SAMPLE_RATE[:timeout],
|
375
|
-
)
|
378
|
+
) if rand < SAMPLE_RATE[:timeout]
|
376
379
|
|
377
380
|
case units
|
378
381
|
when nil, :ms, :milliseconds
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: network_resiliency
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.7.10
|
4
|
+
version: 0.7.12
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Daniel Pepper
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-02-
|
11
|
+
date: 2024-02-23 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|
@@ -238,6 +238,7 @@ files:
|
|
238
238
|
- lib/network_resiliency/adapter/postgres.rb
|
239
239
|
- lib/network_resiliency/adapter/rails.rb
|
240
240
|
- lib/network_resiliency/adapter/redis.rb
|
241
|
+
- lib/network_resiliency/power_stats.rb
|
241
242
|
- lib/network_resiliency/refinements.rb
|
242
243
|
- lib/network_resiliency/stats.rb
|
243
244
|
- lib/network_resiliency/stats_engine.rb
|