network_resiliency 0.7.11 → 0.7.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 1eec5786e7daaea4b6f3c285dffc1e4bf1c70d7096ce69d44f4cc4ea80111ea5
4
- data.tar.gz: bc3ca5e592f1a809bfb28453e43d49a64405a424861aef2c6c6745ef2f27ebb4
3
+ metadata.gz: c006caf6d6fefeca3185a0a6a1d75be6bcf1cbc16ed9e1b00f2b147114b8442f
4
+ data.tar.gz: cad33936cc88b7d586b9b1ca92ab36308e37af11768509e67fb6f20e2af354ba
5
5
  SHA512:
6
- metadata.gz: fa1af69b1f175fb708247ddd0e9501ae2b945909197b26c343347e113bb78e3e645c5217c5f74d63fda780a027b730746b8641b9f77cb5c31626bc2f985c60ff
7
- data.tar.gz: e9733ddd7978b321f8c49218de503f42b300aa269757559cac442dcefb43e0b136e1507360629e9aad1976cedd31c7c0ccd4e9a256c41962a990e50f046d3979
6
+ metadata.gz: e4eaf6bf8b4a4b176a8bd2ae3b79a80fc1a9b01c0d0a6dc486e870d656eb3793c1626dd923c2afd1c2b06774012067d77815a1710e5de44f06d79f098dd8db2b
7
+ data.tar.gz: 36d9406a01c76ca09d0859194401bbd49a177f644c1ca38d5df3c1fb8ad27907304d8d17ad2c165ced7dcfc67b467ee4e2061ce63e80a884fc8a5a372e994f2b
data/CHANGELOG.md CHANGED
@@ -1,3 +1,12 @@
1
+ ### v0.7.13 (2024-03-04)
2
+ - downsampling
3
+ - redis resiliency
4
+
5
+ ### v0.7.12 (2024-02-23)
6
+ - improve metric sampling
7
+ - lower resiliency threshold
8
+ - power buckets
9
+
1
10
  ### v0.7.11 (2024-02-20)
2
11
  - lower dynamic timeout
3
12
  - fix metric specs
data/Gemfile.lock CHANGED
@@ -1,7 +1,7 @@
1
1
  PATH
2
2
  remote: .
3
3
  specs:
4
- network_resiliency (0.7.11)
4
+ network_resiliency (0.7.13)
5
5
 
6
6
  GEM
7
7
  remote: https://rubygems.org/
data/README.md CHANGED
@@ -39,3 +39,8 @@ Yes please :)
39
39
  https://github.com/lostisland/faraday-retry/blob/main/lib/faraday/retry/middleware.rb
40
40
 
41
41
  https://github.com/ankane/the-ultimate-guide-to-ruby-timeouts
42
+
43
+
44
+ https://reprep.io/writings/20220326_timeouts_deadline_propagation.html
45
+
46
+ https://grpc.io/blog/deadlines
@@ -0,0 +1,132 @@
1
+ require "network_resiliency/refinements"
2
+
3
+ using NetworkResiliency::Refinements
4
+
5
module NetworkResiliency
  # Approximate statistics stored as a histogram of power-of-ten buckets.
  #
  # A sample is counted in bucket +i+, where +i = ceil(log10(value))+, after
  # the value has been clamped to at least MIN_VALUE. Percentiles are reported
  # as +10 ** i+, i.e. the upper bound of the bucket they fall into.
  #
  # Instance methods declared with the +synchronize+ macro run while holding
  # the instance's own mutex. Ruby mutexes are not reentrant, so synchronized
  # methods must never call one another directly.
  class PowerStats
    MIN_VALUE = 1
    LOCK = Thread::Mutex.new # guards STATS
    STATS = {} # registry of shared instances, keyed by caller-supplied key

    # @return [Integer] number of samples currently represented
    attr_reader :n

    class << self
      # Fetches the shared instance registered under +key+, creating it on
      # first access.
      def [](key)
        LOCK.synchronize { STATS[key] ||= new }
      end

      # Drops every shared instance from the registry.
      def reset
        LOCK.synchronize { STATS.clear }
      end

      private

      # Macro: redefines the instance method +fn_name+ so that its body runs
      # inside +@lock.synchronize+. Intended to be used as
      # +synchronize def foo ... end+.
      def synchronize(fn_name)
        fn = instance_method(fn_name)

        define_method(fn_name) do |*args|
          @lock.synchronize { fn.bind(self).call(*args) }
        end
      end
    end

    # @param values [Array<Numeric>] initial samples
    def initialize(values = [])
      @lock = Thread::Mutex.new
      reset

      values.each { |x| add(x) }
    end

    # Hook invoked by +dup+ / +clone+. A plain shallow copy would share the
    # bucket array (and the mutex) with its source, so that +merge+ -- which
    # is implemented as +dup.merge!+ -- would silently mutate the receiver's
    # buckets. Give every copy its own state instead.
    def initialize_copy(source)
      super

      @lock = Thread::Mutex.new
      @buckets = @buckets.dup
    end

    # Adds a single sample, an Array of samples, or merges another PowerStats.
    #
    # @return [self]
    def <<(value)
      case value
      when Array
        value.each { |x| add(x) }
      when self.class
        merge!(value)
      else
        add(value)
      end

      self
    end

    # Records one sample.
    #
    # @param value [Numeric]
    # @raise [ArgumentError] if +value+ is not Numeric
    # @return [Integer] the updated sample count
    synchronize def add(value)
      raise ArgumentError, "Numeric expected, found #{value.class}" unless value.is_a?(Numeric)

      # clamp so that log10 never sees zero or a negative number
      value = [ value, MIN_VALUE ].max
      i = Math.log10(value).ceil

      @buckets[i] ||= 0
      @buckets[i] += 1
      @n += 1
    end

    # Estimates the +p+-th percentile as the upper bound of the bucket that
    # contains it.
    #
    # @param p [Numeric] percentile, between 0 and 100
    # @raise [ArgumentError] if +p+ is out of range
    # @return [Integer] 0 when there are no samples, otherwise a power of ten
    synchronize def percentile(p)
      raise ArgumentError, "Percentile must be between 0 and 100" unless p.between?(0, 100)

      return 0 if @n == 0

      # number of samples allowed to sit above the reported bucket
      remaining = ((100 - p) / 100.0 * @n).floor

      (@buckets.size - 1).downto(0) do |i|
        count = @buckets[i]

        # buckets may be unset, or rounded down to zero by scale!; neither
        # holds a sample, so neither can contain the percentile
        next unless count&.positive?

        return 10 ** i if count >= remaining

        remaining -= count
      end

      # not reached while @n equals the total of all bucket counts
      MIN_VALUE
    end
    alias_method :p, :percentile

    # @return [Integer] shorthand for +percentile(99)+
    def p99
      percentile(99)
    end

    # Non-destructive merge: returns a new instance combining both operands.
    #
    # @param other [PowerStats]
    # @return [PowerStats]
    def merge(other)
      dup.merge!(other)
    end
    alias_method :+, :merge

    # Adds the bucket counts of +other+ into this instance.
    #
    # @param other [PowerStats]
    # @raise [ArgumentError] if +other+ is not a PowerStats
    # @return [self]
    synchronize def merge!(other)
      raise ArgumentError unless other.is_a?(self.class)

      other_buckets = other.instance_variable_get(:@buckets)

      if @n == 0
        @n = other.n
        @buckets = other_buckets.dup
      elsif other.n > 0
        @n += other.n

        other_buckets.each_with_index do |count, i|
          next unless count

          @buckets[i] ||= 0
          @buckets[i] += count
        end
      end

      self
    end

    # Downsamples in place: every bucket count is multiplied by
    # +percentage / 100+ and rounded, then the sample count is recomputed
    # from the buckets.
    #
    # @param percentage [Numeric] between 0 and 100
    # @raise [ArgumentError] if +percentage+ is not Numeric or out of range
    # @return [Integer] the new sample count
    synchronize def scale!(percentage)
      raise ArgumentError, "Numeric expected, found #{percentage.class}" unless percentage.is_a?(Numeric)
      raise ArgumentError, "argument must be between 0 and 100" unless percentage.between?(0, 100)

      factor = percentage / 100.0

      @buckets.map! { |x| (x * factor).round if x }
      @n = @buckets.compact.sum
    end

    # Discards all samples.
    synchronize def reset
      @n = 0
      @buckets = []
    end
  end
end
@@ -80,6 +80,16 @@ module NetworkResiliency
80
80
  self
81
81
  end
82
82
 
83
# Downsamples in place by +percentage+ percent: the accumulated squared
# distance and the sample count are both multiplied by +percentage / 100+
# (the count is rounded to an integer).
#
# Raises ArgumentError when +percentage+ is not Numeric or lies outside
# 0..100. Returns the new sample count.
synchronize def scale!(percentage)
  unless percentage.is_a?(Numeric)
    raise ArgumentError, "Numeric expected, found #{percentage.class}"
  end

  unless percentage.between?(0, 100)
    raise ArgumentError, "argument must be between 0 and 100"
  end

  ratio = percentage / 100.0

  @sq_dist = @sq_dist * ratio
  @n = (@n * ratio).round
end
92
+
83
93
  def ==(other)
84
94
  return false unless other.is_a?(self.class)
85
95
 
@@ -94,7 +104,7 @@ module NetworkResiliency
94
104
  @sq_dist = 0.0 # sum of squared distance from mean
95
105
  end
96
106
 
97
- MIN_SAMPLE_SIZE = 1000
107
+ MIN_SAMPLE_SIZE = 300
98
108
  MAX_WINDOW_LENGTH = 1000
99
109
  STATS_TTL = 24 * 60 * 60 # 1 day
100
110
  CACHE_TTL = 120 # seconds
@@ -50,7 +50,9 @@ module NetworkResiliency
50
50
  NetworkResiliency.redis.disconnect! if NetworkResiliency.redis.connected?
51
51
 
52
52
  until @shutdown
53
- StatsEngine.sync(NetworkResiliency.redis)
53
+ NetworkResiliency.redis.with_reconnect do
54
+ StatsEngine.sync(NetworkResiliency.redis)
55
+ end
54
56
 
55
57
  sleep(SLEEP_DURATION)
56
58
  end
@@ -1,3 +1,3 @@
1
1
  module NetworkResiliency
2
- VERSION = "0.7.11"
2
+ VERSION = "0.7.13"
3
3
  end
@@ -1,3 +1,4 @@
1
+ require "network_resiliency/power_stats"
1
2
  require "network_resiliency/refinements"
2
3
  require "network_resiliency/stats"
3
4
  require "network_resiliency/stats_engine"
@@ -19,7 +20,7 @@ module NetworkResiliency
19
20
  ACTIONS = [ :connect, :request ].freeze
20
21
  ADAPTERS = [ :http, :faraday, :redis, :mysql, :postgres, :rails ].freeze
21
22
  MODE = [ :observe, :resilient ].freeze
22
- RESILIENCY_SIZE_THRESHOLD = 1_000
23
+ RESILIENCY_SIZE_THRESHOLD = 300
23
24
  SAMPLE_RATE = {
24
25
  timeout: 0.1,
25
26
  stats: 0.1,
@@ -256,6 +257,12 @@ module NetworkResiliency
256
257
  # record stats
257
258
  key = [ adapter, action, destination ].join(":")
258
259
  stats = StatsEngine.add(key, duration)
260
+
261
+ if stats.n > RESILIENCY_SIZE_THRESHOLD * 4
262
+ # downsample to age out old stats
263
+ stats.scale!(50)
264
+ end
265
+
259
266
  tags = {
260
267
  adapter: adapter,
261
268
  destination: destination,
@@ -266,26 +273,28 @@ module NetworkResiliency
266
273
  # ensure Syncer is running
267
274
  Syncer.start
268
275
 
269
- NetworkResiliency.statsd&.distribution(
270
- "network_resiliency.#{action}.stats.n",
271
- stats.n,
272
- tags: tags,
273
- sample_rate: SAMPLE_RATE[:stats],
274
- )
276
+ if rand < SAMPLE_RATE[:stats]
277
+ NetworkResiliency.statsd&.distribution(
278
+ "network_resiliency.#{action}.stats.n",
279
+ stats.n,
280
+ tags: tags,
281
+ sample_rate: SAMPLE_RATE[:stats],
282
+ )
275
283
 
276
- NetworkResiliency.statsd&.distribution(
277
- "network_resiliency.#{action}.stats.avg",
278
- stats.avg,
279
- tags: tags,
280
- sample_rate: SAMPLE_RATE[:stats],
281
- )
284
+ NetworkResiliency.statsd&.distribution(
285
+ "network_resiliency.#{action}.stats.avg",
286
+ stats.avg,
287
+ tags: tags,
288
+ sample_rate: SAMPLE_RATE[:stats],
289
+ )
282
290
 
283
- NetworkResiliency.statsd&.distribution(
284
- "network_resiliency.#{action}.stats.stdev",
285
- stats.stdev,
286
- tags: tags,
287
- sample_rate: SAMPLE_RATE[:stats],
288
- )
291
+ NetworkResiliency.statsd&.distribution(
292
+ "network_resiliency.#{action}.stats.stdev",
293
+ stats.stdev,
294
+ tags: tags,
295
+ sample_rate: SAMPLE_RATE[:stats],
296
+ )
297
+ end
289
298
  end
290
299
 
291
300
  nil
@@ -337,7 +346,7 @@ module NetworkResiliency
337
346
  "network_resiliency.timeout.raised",
338
347
  tags: tags,
339
348
  sample_rate: SAMPLE_RATE[:timeout],
340
- )
349
+ ) if rand < SAMPLE_RATE[:timeout]
341
350
  end
342
351
  else
343
352
  # the specified timeout is less than our expected p99...awkward
@@ -347,7 +356,7 @@ module NetworkResiliency
347
356
  "network_resiliency.timeout.too_low",
348
357
  tags: tags,
349
358
  sample_rate: SAMPLE_RATE[:timeout],
350
- )
359
+ ) if rand < SAMPLE_RATE[:timeout]
351
360
  end
352
361
  else
353
362
  timeouts << p99
@@ -361,7 +370,7 @@ module NetworkResiliency
361
370
  "network_resiliency.timeout.missing",
362
371
  tags: tags,
363
372
  sample_rate: SAMPLE_RATE[:timeout],
364
- )
373
+ ) if rand < SAMPLE_RATE[:timeout]
365
374
  end
366
375
 
367
376
  NetworkResiliency.statsd&.distribution(
@@ -372,7 +381,7 @@ module NetworkResiliency
372
381
  destination: destination,
373
382
  },
374
383
  sample_rate: SAMPLE_RATE[:timeout],
375
- )
384
+ ) if rand < SAMPLE_RATE[:timeout]
376
385
 
377
386
  case units
378
387
  when nil, :ms, :milliseconds
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: network_resiliency
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.7.11
4
+ version: 0.7.13
5
5
  platform: ruby
6
6
  authors:
7
7
  - Daniel Pepper
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-02-20 00:00:00.000000000 Z
11
+ date: 2024-03-04 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: byebug
@@ -238,6 +238,7 @@ files:
238
238
  - lib/network_resiliency/adapter/postgres.rb
239
239
  - lib/network_resiliency/adapter/rails.rb
240
240
  - lib/network_resiliency/adapter/redis.rb
241
+ - lib/network_resiliency/power_stats.rb
241
242
  - lib/network_resiliency/refinements.rb
242
243
  - lib/network_resiliency/stats.rb
243
244
  - lib/network_resiliency/stats_engine.rb