network_resiliency 0.3.1 → 0.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Gemfile.lock +1 -1
- data/lib/network_resiliency/adapter/redis.rb +26 -12
- data/lib/network_resiliency/refinements.rb +14 -2
- data/lib/network_resiliency/version.rb +1 -1
- data/lib/network_resiliency.rb +163 -11
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c372873056610bf21197e55a265bdfd03e200b064c727a4713ea8725ae3d696d
|
4
|
+
data.tar.gz: 5da4933543bf9a57d46ed4a3b4d94a692524e641f6c1293d8bcb99aa74c0c9d9
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b8e9b66cb83ff5bfda080c43935e8834228cfbd16afd0c1229be2b7edbea8469702d917ae7b6bb7383b774ebf68802e9a5572ee89357723d479d1dd8b3a91f2c
|
7
|
+
data.tar.gz: 68f241f0bd9b7ef5b0b291e2e1be11aa3d2c377f6b4b70552ba32638122bff300bc1252b279a13c41e96af3ffa8843bc571b74bd9aac7f1300f27472e18001ff
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
### v0.4.0 (2023-11-13)
|
2
|
+
- redis resiliency
|
3
|
+
- timeouts_for
|
4
|
+
- mode
|
5
|
+
|
6
|
+
### v0.3.2 (2023-11-03)
|
7
|
+
- stats observability
|
8
|
+
- surface errors in specs
|
9
|
+
- destination filter
|
10
|
+
- error observability
|
11
|
+
- improve order of magnitude
|
12
|
+
|
1
13
|
### v0.3.1 (2023-11-02)
|
2
14
|
- sync thread safety
|
3
15
|
- order of magnitude stats
|
data/Gemfile.lock
CHANGED
data/lib/network_resiliency/adapter/redis.rb
CHANGED
@@ -36,35 +36,49 @@ module NetworkResiliency
|
|
36
36
|
end
|
37
37
|
|
38
38
|
module Instrumentation
|
39
|
-
# def initialize(...)
|
40
|
-
# super
|
41
|
-
|
42
|
-
# @network_resiliency_attempts = options[:reconnect_attempts]
|
43
|
-
# options[:reconnect_attempts] = 0
|
44
|
-
# end
|
45
|
-
|
46
39
|
def establish_connection
|
47
40
|
return super unless NetworkResiliency.enabled?(:redis)
|
48
41
|
|
42
|
+
original_timeout = @options[:connect_timeout]
|
43
|
+
|
44
|
+
timeouts = NetworkResiliency.timeouts_for(
|
45
|
+
adapter: "redis",
|
46
|
+
action: "connect",
|
47
|
+
destination: host,
|
48
|
+
max: original_timeout,
|
49
|
+
)
|
50
|
+
|
51
|
+
attempts = 0
|
52
|
+
ts = -NetworkResiliency.timestamp
|
53
|
+
|
49
54
|
begin
|
50
|
-
|
55
|
+
attempts += 1
|
56
|
+
error = nil
|
57
|
+
|
58
|
+
@options[:connect_timeout] = timeouts.shift
|
51
59
|
|
52
60
|
super
|
53
61
|
rescue ::Redis::CannotConnectError => e
|
54
62
|
# capture error
|
63
|
+
|
64
|
+
# grab underlying exception within Redis wrapper
|
65
|
+
error = e.cause.class
|
66
|
+
|
67
|
+
retry if timeouts.size > 0
|
68
|
+
|
55
69
|
raise
|
56
70
|
ensure
|
57
71
|
ts += NetworkResiliency.timestamp
|
58
|
-
|
59
|
-
# grab underlying exception within Redis wrapper
|
60
|
-
error = e ? e.cause.class : nil
|
72
|
+
@options[:connect_timeout] = original_timeout
|
61
73
|
|
62
74
|
NetworkResiliency.record(
|
63
75
|
adapter: "redis",
|
64
76
|
action: "connect",
|
65
77
|
destination: host,
|
66
|
-
error: error,
|
67
78
|
duration: ts,
|
79
|
+
error: error,
|
80
|
+
timeout: @options[:connect_timeout],
|
81
|
+
attempts: attempts,
|
68
82
|
)
|
69
83
|
end
|
70
84
|
end
|
data/lib/network_resiliency/refinements.rb
CHANGED
@@ -1,8 +1,20 @@
|
|
1
1
|
module NetworkResiliency
|
2
2
|
module Refinements
|
3
3
|
refine Numeric do
|
4
|
-
def order_of_magnitude
|
5
|
-
|
4
|
+
def order_of_magnitude(ceil: false)
|
5
|
+
return 0 if self <= 0
|
6
|
+
return 1 if self <= 1
|
7
|
+
|
8
|
+
log10 = Math.log10(self.round)
|
9
|
+
10 ** (ceil ? log10.ceil : log10.floor)
|
10
|
+
end
|
11
|
+
|
12
|
+
def power_ceil
|
13
|
+
return 0 if self <= 0
|
14
|
+
return 1 if self <= 1
|
15
|
+
|
16
|
+
digits = Math.log10(self).floor
|
17
|
+
10 ** digits * (self.to_f / 10 ** digits).ceil
|
6
18
|
end
|
7
19
|
end
|
8
20
|
end
|
data/lib/network_resiliency.rb
CHANGED
@@ -14,6 +14,9 @@ module NetworkResiliency
|
|
14
14
|
autoload :Postgres, "network_resiliency/adapter/postgres"
|
15
15
|
end
|
16
16
|
|
17
|
+
MODE = [ :observe, :resilient ].freeze
|
18
|
+
RESILIENCY_SIZE_THRESHOLD = 1_000
|
19
|
+
|
17
20
|
extend self
|
18
21
|
|
19
22
|
attr_accessor :statsd, :redis
|
@@ -87,13 +90,22 @@ module NetworkResiliency
|
|
87
90
|
Process.clock_gettime(Process::CLOCK_MONOTONIC) * 1_000
|
88
91
|
end
|
89
92
|
|
90
|
-
|
93
|
+
def mode
|
94
|
+
@mode || :observe
|
95
|
+
end
|
91
96
|
|
92
|
-
|
97
|
+
def mode=(mode)
|
98
|
+
unless MODE.include?(mode)
|
99
|
+
raise ArgumentError, "invalid NetworkResiliency mode: #{mode}"
|
100
|
+
end
|
93
101
|
|
94
|
-
|
95
|
-
|
96
|
-
|
102
|
+
@mode = mode
|
103
|
+
end
|
104
|
+
|
105
|
+
# private
|
106
|
+
|
107
|
+
def record(adapter:, action:, destination:, duration:, error:, timeout: nil, attempts: 1)
|
108
|
+
return if ignore_destination?(adapter, action, destination)
|
97
109
|
|
98
110
|
NetworkResiliency.statsd&.distribution(
|
99
111
|
"network_resiliency.#{action}",
|
@@ -102,12 +114,13 @@ module NetworkResiliency
|
|
102
114
|
adapter: adapter,
|
103
115
|
destination: destination,
|
104
116
|
error: error,
|
117
|
+
attempts: (attempts if attempts > 1),
|
105
118
|
}.compact,
|
106
119
|
)
|
107
120
|
|
108
121
|
NetworkResiliency.statsd&.distribution(
|
109
122
|
"network_resiliency.#{action}.magnitude",
|
110
|
-
duration.order_of_magnitude,
|
123
|
+
duration.order_of_magnitude(ceil: true),
|
111
124
|
tags: {
|
112
125
|
adapter: adapter,
|
113
126
|
destination: destination,
|
@@ -115,24 +128,165 @@ module NetworkResiliency
|
|
115
128
|
}.compact,
|
116
129
|
)
|
117
130
|
|
131
|
+
NetworkResiliency.statsd&.gauge(
|
132
|
+
"network_resiliency.#{action}.timeout",
|
133
|
+
timeout,
|
134
|
+
tags: {
|
135
|
+
adapter: adapter,
|
136
|
+
destination: destination,
|
137
|
+
},
|
138
|
+
)
|
139
|
+
|
140
|
+
if error
|
141
|
+
NetworkResiliency.statsd&.distribution(
|
142
|
+
"network_resiliency.#{action}.time_saved",
|
143
|
+
timeout - duration,
|
144
|
+
tags: {
|
145
|
+
adapter: adapter,
|
146
|
+
destination: destination,
|
147
|
+
},
|
148
|
+
) if timeout
|
149
|
+
else
|
150
|
+
# track successful retries
|
151
|
+
NetworkResiliency.statsd&.increment(
|
152
|
+
"network_resiliency.#{action}.resilient",
|
153
|
+
tags: {
|
154
|
+
adapter: adapter,
|
155
|
+
destination: destination,
|
156
|
+
},
|
157
|
+
) if attempts > 1
|
158
|
+
|
159
|
+
# record stats
|
160
|
+
key = [ adapter, action, destination ].join(":")
|
161
|
+
stats = StatsEngine.add(key, duration)
|
162
|
+
tags = {
|
163
|
+
adapter: adapter,
|
164
|
+
destination: destination,
|
165
|
+
n: stats.n.order_of_magnitude,
|
166
|
+
}
|
167
|
+
|
168
|
+
NetworkResiliency.statsd&.distribution(
|
169
|
+
"network_resiliency.#{action}.stats.n",
|
170
|
+
stats.n,
|
171
|
+
tags: tags,
|
172
|
+
)
|
173
|
+
|
174
|
+
NetworkResiliency.statsd&.distribution(
|
175
|
+
"network_resiliency.#{action}.stats.avg",
|
176
|
+
stats.avg,
|
177
|
+
tags: tags,
|
178
|
+
)
|
179
|
+
|
180
|
+
NetworkResiliency.statsd&.distribution(
|
181
|
+
"network_resiliency.#{action}.stats.stdev",
|
182
|
+
stats.stdev,
|
183
|
+
tags: tags,
|
184
|
+
)
|
185
|
+
end
|
186
|
+
|
187
|
+
nil
|
188
|
+
rescue => e
|
189
|
+
NetworkResiliency.statsd&.increment(
|
190
|
+
"network_resiliency.error",
|
191
|
+
tags: {
|
192
|
+
method: __method__,
|
193
|
+
type: e.class,
|
194
|
+
},
|
195
|
+
)
|
196
|
+
|
197
|
+
warn "[ERROR] NetworkResiliency: #{e.class}: #{e.message}"
|
198
|
+
end
|
199
|
+
|
200
|
+
IP_ADDRESS_REGEX = Regexp.new(/\d{1,3}(\.\d{1,3}){3}/)
|
201
|
+
|
202
|
+
def ignore_destination?(adapter, action, destination)
|
203
|
+
# filter raw IP addresses
|
204
|
+
IP_ADDRESS_REGEX.match?(destination)
|
205
|
+
end
|
206
|
+
|
207
|
+
def timeouts_for(adapter:, action:, destination:, max: nil)
|
208
|
+
default = [ max ]
|
209
|
+
|
210
|
+
return default if NetworkResiliency.mode == :observe
|
211
|
+
|
118
212
|
key = [ adapter, action, destination ].join(":")
|
119
|
-
StatsEngine.
|
213
|
+
stats = StatsEngine.get(key)
|
214
|
+
|
215
|
+
return default unless stats.n >= RESILIENCY_SIZE_THRESHOLD
|
216
|
+
|
217
|
+
tags = {
|
218
|
+
adapter: adapter,
|
219
|
+
action: action,
|
220
|
+
destination: destination,
|
221
|
+
}
|
222
|
+
|
223
|
+
p99 = (stats.avg + stats.stdev * 3).power_ceil
|
224
|
+
timeouts = []
|
225
|
+
|
226
|
+
if max
|
227
|
+
if p99 < max
|
228
|
+
timeouts << p99
|
229
|
+
|
230
|
+
# fallback attempt
|
231
|
+
if max - p99 > p99
|
232
|
+
# use remaining time for second attempt
|
233
|
+
timeouts << max - p99
|
234
|
+
else
|
235
|
+
timeouts << max
|
236
|
+
|
237
|
+
NetworkResiliency.statsd&.increment(
|
238
|
+
"network_resiliency.timeout.raised",
|
239
|
+
tags: tags,
|
240
|
+
)
|
241
|
+
end
|
242
|
+
else
|
243
|
+
# the specified timeout is less than our expected p99...awkward
|
244
|
+
timeouts << max
|
245
|
+
|
246
|
+
NetworkResiliency.statsd&.increment(
|
247
|
+
"network_resiliency.timeout.too_low",
|
248
|
+
tags: tags,
|
249
|
+
)
|
250
|
+
end
|
251
|
+
else
|
252
|
+
timeouts << p99
|
253
|
+
|
254
|
+
# timeouts << p99 * 10 if NetworkResiliency.mode == :resolute
|
255
|
+
|
256
|
+
# unbounded second attempt
|
257
|
+
timeouts << nil
|
258
|
+
|
259
|
+
NetworkResiliency.statsd&.increment(
|
260
|
+
"network_resiliency.timeout.missing",
|
261
|
+
tags: tags,
|
262
|
+
)
|
263
|
+
end
|
264
|
+
|
265
|
+
timeouts
|
120
266
|
rescue => e
|
121
267
|
NetworkResiliency.statsd&.increment(
|
122
268
|
"network_resiliency.error",
|
123
269
|
tags: {
|
270
|
+
method: __method__,
|
124
271
|
type: e.class,
|
125
272
|
},
|
126
273
|
)
|
127
274
|
|
128
275
|
warn "[ERROR] NetworkResiliency: #{e.class}: #{e.message}"
|
276
|
+
|
277
|
+
default
|
129
278
|
end
|
130
279
|
|
131
280
|
def reset
|
132
281
|
@enabled = nil
|
282
|
+
@mode = nil
|
133
283
|
Thread.current["network_resiliency"] = nil
|
134
284
|
StatsEngine.reset
|
135
|
-
|
285
|
+
|
286
|
+
if @sync_worker
|
287
|
+
@sync_worker.kill
|
288
|
+
@sync_worker = nil
|
289
|
+
end
|
136
290
|
end
|
137
291
|
|
138
292
|
private
|
@@ -147,13 +301,11 @@ module NetworkResiliency
|
|
147
301
|
raise "Redis not configured" unless redis
|
148
302
|
|
149
303
|
@sync_worker = Thread.new do
|
150
|
-
|
304
|
+
loop do
|
151
305
|
StatsEngine.sync(redis)
|
152
306
|
|
153
307
|
sleep(3)
|
154
308
|
end
|
155
|
-
rescue Interrupt
|
156
|
-
# goodbye
|
157
309
|
end
|
158
310
|
end
|
159
311
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: network_resiliency
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.1
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Daniel Pepper
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-11-02 00:00:00.000000000 Z
|
11
|
+
date: 2023-11-13 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: byebug
|