semian 0.6.0 → 0.6.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,67 @@
1
+ /*
2
+ System, 3rd party, and project includes
3
+
4
+ Implements Init_semian, which is used as C/Ruby entrypoint.
5
+ */
6
+
7
+ #ifndef SEMIAN_H
8
+ #define SEMIAN_H
9
+
10
+ // System includes
11
+ #include <errno.h>
12
+ #include <string.h>
13
+ #include <stdio.h>
14
+
15
+ // 3rd party includes
16
+ #include <openssl/sha.h>
17
+ #include <ruby.h>
18
+ #include <ruby/util.h>
19
+ #include <ruby/io.h>
20
+
21
+ //semian includes
22
+ #include "types.h"
23
+ #include "resource.h"
24
+
25
+ // FIXME: This is needed here temporarily
26
+ // Defines for ruby threading primitives
27
+ #if defined(HAVE_RB_THREAD_CALL_WITHOUT_GVL) && defined(HAVE_RUBY_THREAD_H)
28
+ // 2.0
29
+ #include <ruby/thread.h>
30
+ #define WITHOUT_GVL(fn,a,ubf,b) rb_thread_call_without_gvl((fn),(a),(ubf),(b))
31
+ #elif defined(HAVE_RB_THREAD_BLOCKING_REGION)
32
+ // 1.9
33
+ typedef VALUE (*my_blocking_fn_t)(void*);
34
+ #define WITHOUT_GVL(fn,a,ubf,b) rb_thread_blocking_region((my_blocking_fn_t)(fn),(a),(ubf),(b))
35
+ #endif
36
+
37
+ VALUE eSyscall, eTimeout, eInternal;
38
+
39
+ void Init_semian();
40
+
41
+ // FIXME: These are needed here temporarily while we move functions around
42
+ // Will be removed once there are new header files that they should belong to.
43
+ void
44
+ configure_tickets(int sem_id, int tickets, int should_initialize);
45
+
46
+ key_t
47
+ generate_key(const char *name);
48
+
49
+ void
50
+ set_semaphore_permissions(int sem_id, long permissions);
51
+
52
+ int
53
+ create_semaphore(int key, long permissions, int *created);
54
+
55
+ int
56
+ get_semaphore(int key);
57
+
58
+ void
59
+ raise_semian_syscall_error(const char *syscall, int error_num);
60
+
61
+ int
62
+ perform_semop(int sem_id, short index, short op, short flags, struct timespec *ts);
63
+
64
+ void *
65
+ acquire_semaphore_without_gvl(void *p);
66
+
67
+ #endif //SEMIAN_H
@@ -0,0 +1,45 @@
1
+ /*
2
+ For custom type definitions specific to semian
3
+ */
4
+ #ifndef SEMIAN_TYPES_H
5
+ #define SEMIAN_TYPES_H
6
+
7
+ #include <sys/types.h>
8
+ #include <sys/ipc.h>
9
+ #include <sys/sem.h>
10
+ #include <sys/time.h>
11
+
12
+ // For sysV semop syscalls
13
+ // see man semop
14
+ union semun {
15
+ int val; /* Value for SETVAL */
16
+ struct semid_ds *buf; /* Buffer for IPC_STAT, IPC_SET */
17
+ unsigned short *array; /* Array for GETALL, SETALL */
18
+ struct seminfo *__buf; /* Buffer for IPC_INFO
19
+ (Linux-specific) */
20
+ };
21
+
22
+ // To update the ticket count
23
+ typedef struct {
24
+ int sem_id;
25
+ int tickets;
26
+ } update_ticket_count_t;
27
+
28
+ // Internal semaphore structure
29
+ typedef struct {
30
+ int sem_id;
31
+ struct timespec timeout;
32
+ int error;
33
+ char *name;
34
+ } semian_resource_t;
35
+
36
+ // FIXME: move this to more appropriate location once the file exists
37
+ typedef enum
38
+ {
39
+ SI_SEM_TICKETS, // semaphore for the tickets currently issued
40
+ SI_SEM_CONFIGURED_TICKETS, // semaphore to track the desired number of tickets available for issue
41
+ SI_SEM_LOCK, // metadata lock to act as a mutex, ensuring thread-safety for updating other semaphores
42
+ SI_NUM_SEMAPHORES // always leave this as last entry for count to be accurate
43
+ } semaphore_indices;
44
+
45
+ #endif // SEMIAN_TYPES_H
@@ -1,3 +1,3 @@
1
1
  module Semian
2
- VERSION = '0.6.0'
2
+ VERSION = '0.6.1'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: semian
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Scott Francis
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-10-17 00:00:00.000000000 Z
12
+ date: 2017-02-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake-compiler
@@ -132,17 +132,12 @@ extensions:
132
132
  - ext/semian/extconf.rb
133
133
  extra_rdoc_files: []
134
134
  files:
135
- - ".gitignore"
136
- - ".rubocop.yml"
137
- - ".ruby-version"
138
- - ".travis.yml"
139
- - CHANGELOG.md
140
- - Gemfile
141
- - LICENSE.md
142
- - README.md
143
- - Rakefile
144
135
  - ext/semian/extconf.rb
136
+ - ext/semian/resource.c
137
+ - ext/semian/resource.h
145
138
  - ext/semian/semian.c
139
+ - ext/semian/semian.h
140
+ - ext/semian/types.h
146
141
  - lib/semian.rb
147
142
  - lib/semian/adapter.rb
148
143
  - lib/semian/circuit_breaker.rb
@@ -159,23 +154,6 @@ files:
159
154
  - lib/semian/simple_state.rb
160
155
  - lib/semian/unprotected_resource.rb
161
156
  - lib/semian/version.rb
162
- - repodb.yml
163
- - scripts/install_toxiproxy.sh
164
- - semian.gemspec
165
- - test/circuit_breaker_test.rb
166
- - test/fixtures/binary.sql
167
- - test/helpers/background_helper.rb
168
- - test/instrumentation_test.rb
169
- - test/mysql2_test.rb
170
- - test/net_http_test.rb
171
- - test/redis_test.rb
172
- - test/resource_test.rb
173
- - test/semian_test.rb
174
- - test/simple_integer_test.rb
175
- - test/simple_sliding_window_test.rb
176
- - test/simple_state_test.rb
177
- - test/test_helper.rb
178
- - test/unprotected_resource_test.rb
179
157
  homepage: https://github.com/shopify/semian
180
158
  licenses:
181
159
  - MIT
@@ -196,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
196
174
  version: '0'
197
175
  requirements: []
198
176
  rubyforge_project:
199
- rubygems_version: 2.5.1
177
+ rubygems_version: 2.5.2
200
178
  signing_key:
201
179
  specification_version: 4
202
180
  summary: Bulkheading for Ruby with SysV semaphores
data/.gitignore DELETED
@@ -1,8 +0,0 @@
1
- /.bundle/
2
- /lib/**/*.so
3
- /lib/**/*.bundle
4
- /tmp/*
5
- *.gem
6
- /html/
7
- Gemfile.lock
8
- vendor/
@@ -1,113 +0,0 @@
1
- AllCops:
2
- Exclude:
3
- - Gemfile
4
- - lib/snippets/**/*
5
- - vendor/**/*
6
- - data/**/*
7
- - db/schema.rb
8
- - db/migrate/*
9
- - test/dummy/**/*
10
- - bin/rails
11
- - lib/shipit-engine.rb
12
- - tmp/**/*
13
-
14
- Style/GuardClause:
15
- Enabled: false
16
-
17
- Lint/AssignmentInCondition:
18
- Enabled: false
19
-
20
- Lint/HandleExceptions:
21
- Enabled: false
22
-
23
- Lint/EndAlignment:
24
- Enabled: false
25
-
26
- Style/NumericLiterals:
27
- Exclude:
28
- - db/schema.rb
29
-
30
- Style/SingleSpaceBeforeFirstArg:
31
- Exclude:
32
- - db/schema.rb
33
-
34
- Style/DoubleNegation:
35
- Enabled: false
36
-
37
- Metrics/LineLength:
38
- Max: 135
39
-
40
- Metrics/MethodLength:
41
- Max: 40
42
-
43
- Metrics/ClassLength:
44
- Max: 500
45
-
46
- Metrics/AbcSize:
47
- Max: 50
48
-
49
- Metrics/CyclomaticComplexity:
50
- Max: 10
51
-
52
- Style/Documentation:
53
- Enabled: false
54
-
55
- Style/SingleLineBlockParams:
56
- Enabled: false
57
-
58
- Style/SignalException:
59
- Enabled: false
60
-
61
- Style/RaiseArgs:
62
- Enabled: false
63
-
64
- Style/ModuleFunction:
65
- Enabled: false
66
-
67
- Style/RedundantReturn:
68
- AllowMultipleReturnValues: true
69
-
70
- Style/IndentHash:
71
- Enabled: false
72
-
73
- Style/TrailingComma:
74
- EnforcedStyleForMultiline: comma
75
-
76
- Style/ClassAndModuleChildren:
77
- Enabled: false
78
-
79
- Style/PredicateName:
80
- Exclude:
81
- - app/serializers/**/*
82
-
83
- Style/SpaceInsideHashLiteralBraces:
84
- EnforcedStyle: no_space
85
-
86
- Style/StringLiterals:
87
- Enabled: false
88
-
89
- Style/PerlBackrefs:
90
- Enabled: false
91
-
92
- Style/TrivialAccessors:
93
- AllowPredicates: true
94
-
95
- Style/ExtraSpacing:
96
- AllowForAlignment: false
97
-
98
- Style/GlobalVars:
99
- Exclude:
100
- - 'ext/semian/extconf.rb'
101
-
102
- Lint/Eval:
103
- Exclude:
104
- - 'Rakefile'
105
-
106
- Metrics/ParameterLists:
107
- Enabled: false
108
-
109
- Style/IfUnlessModifier:
110
- Enabled: false
111
-
112
- Style/CaseIndentation:
113
- IndentWhenRelativeTo: end
@@ -1 +0,0 @@
1
- 2.1.6
@@ -1,15 +0,0 @@
1
- language: ruby
2
-
3
- sudo: true
4
-
5
- before_install:
6
- - gem install bundler
7
- - scripts/install_toxiproxy.sh
8
-
9
- rvm:
10
- - '2.1'
11
- - '2.2'
12
- - '2.3.1'
13
-
14
- services:
15
- - redis-server
@@ -1,11 +0,0 @@
1
- # v0.4.1
2
- * resource: cast float ticket count to fixnum #75
3
-
4
- # v0.4.0
5
-
6
- * net/http: add adapter for net/http #58
7
- * circuit_breaker: split circuit breaker into three data structures to allow for
8
- alternative implementations in the future #62
9
- * mysql: don't prevent rollbacks on transactions #60
10
- * core: fix initialization bug when the resource is accessed before the options
11
- are set #65
data/Gemfile DELETED
@@ -1,10 +0,0 @@
1
- source 'https://rubygems.org'
2
- gemspec
3
-
4
- group :debug do
5
- gem 'byebug'
6
- end
7
-
8
- group :development, :test do
9
- gem 'rubocop', '~> 0.34.2'
10
- end
data/LICENSE.md DELETED
@@ -1,21 +0,0 @@
1
- The MIT License (MIT)
2
-
3
- Copyright (c) 2014 Shopify
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
data/README.md DELETED
@@ -1,576 +0,0 @@
1
- ## Semian [![Build Status](https://travis-ci.org/Shopify/semian.svg?branch=master)](https://travis-ci.org/Shopify/semian)
2
-
3
- ![](http://i.imgur.com/7Vn2ibF.png)
4
-
5
- Semian is a library for controlling access to slow or unresponsive external
6
- services to avoid cascading failures.
7
-
8
- When services are down they typically fail fast with errors like `ECONNREFUSED`
9
- and `ECONNRESET` which can be rescued in code. However, slow resources fail
10
- slowly. The thread serving the request blocks until it hits the timeout for the
11
- slow resource. During that time, the thread is doing nothing useful and thus the
12
- slow resource has caused a cascading failure by occupying workers and therefore
13
- losing capacity. **Semian is a library for failing fast in these situations,
14
- allowing you to handle errors gracefully.** Semian does this by intercepting
15
- resource access through heuristic patterns inspired by [Hystrix][hystrix] and
16
- [Release It][release-it]:
17
-
18
- * [**Circuit breaker**](#circuit-breaker). A pattern for limiting the
19
- amount of requests to a dependency that is having issues.
20
- * [**Bulkheading**](#bulkheading). Controlling the concurrent access to
21
- a single resource, access is coordinated server-wide with [SysV
22
- semaphores][sysv].
23
-
24
- Resource drivers are monkey-patched to be aware of Semian, these are called
25
- [Semian Adapters](#adapters). Thus, every time resource access is requested
26
- Semian is queried for status on the resource first. If Semian, through the
27
- patterns above, deems the resource to be unavailable it will raise an exception.
28
- **The ultimate outcome of Semian is always an exception that can then be rescued
29
- for a graceful fallback**. Instead of waiting for the timeout, Semian raises
30
- straight away.
31
-
32
- If you are already rescuing exceptions for failing resources and timeouts,
33
- Semian is mostly a drop-in library with a little configuration that will make
34
- your code more resilient to slow resource access. But, [do you even need
35
- Semian?](#do-i-need-semian)
36
-
37
- For an overview of building resilient Ruby applications, start by reading [the
38
- Shopify blog post on Toxiproxy and Semian][resiliency-blog-post]. For more in
39
- depth information on Semian, see [Understanding Semian](#understanding-semian).
40
- Semian is an extraction from [Shopify][shopify] where it's been running
41
- successfully in production since October, 2014.
42
-
43
- The other component to your Ruby resiliency kit is [Toxiproxy][toxiproxy] to
44
- write automated resiliency tests.
45
-
46
- # Usage
47
-
48
- Install by adding the gem to your `Gemfile` and require the [adapters](#adapters) you need:
49
-
50
- ```ruby
51
- gem 'semian', require: %w(semian semian/mysql2 semian/redis)
52
- ```
53
-
54
- We recommend this pattern of requiring adapters directly from the `Gemfile`.
55
- This ensures Semian adapters are loaded as early as possible, to also
56
- protect your application during boot. Please see the [adapter configuration
57
- section](#configuration) on how to configure adapters.
58
-
59
- ## Adapters
60
-
61
- Semian works by intercepting resource access. Every time access is requested,
62
- Semian is queried, and it will raise an exception if the resource is unavailable
63
- according to the circuit breaker or bulkheads. This is done by monkey-patching
64
- the resource driver. **The exception raised by the driver always inherits from
65
- the Base exception class of the driver**, meaning you can always simply rescue
66
- the base class and catch both Semian and driver errors in the same rescue for
67
- fallbacks.
68
-
69
- The following adapters are in Semian and tested heavily in production, the
70
- version is the version of the public gem with the same name:
71
-
72
- * [`semian/mysql2`][mysql-semian-adapter] (~> 0.3.16)
73
- * [`semian/redis`][redis-semian-adapter] (~> 3.2.1)
74
- * [`semian/net_http`][nethttp-semian-adapter]
75
-
76
- ### Creating Adapters
77
-
78
- To create a Semian adapter you must implement the following methods:
79
-
80
- 1. [`include Semian::Adapter`][semian-adapter]. Use the helpers to wrap the
81
- resource. This takes care of situations such as monitoring, nested resources,
82
- unsupported platforms, creating the Semian resource if it doesn't already
83
- exist and so on.
84
- 2. `#semian_identifier`. This is responsible for returning a symbol that
85
- represents every unique resource, for example `redis_master` or
86
- `mysql_shard_1`. This is usually assembled from a `name` attribute on the
87
- Semian configuration hash, but could also be `<host>:<port>`.
88
- 3. `connect`. The name of this method varies. You must override the driver's
89
- connect method with one that wraps the connect call with
90
- `Semian::Resource#acquire`. You should do this at the lowest possible level.
91
- 4. `query`. Same as `connect` but for queries on the resource.
92
- 5. Define exceptions `ResourceBusyError` and `CircuitOpenError`. These are
93
- raised when the request was rejected early because the resource is out of
94
- tickets or because the circuit breaker is open (see [Understanding
95
- Semian](#understanding-semian)). They should inherit from the base exception
96
- class from the raw driver. For example `Mysql2::Error` or
97
- `Redis::BaseConnectionError` for the MySQL and Redis drivers. This makes it
98
- easy to `rescue` and handle them gracefully in application code, by
99
- `rescue`ing the base class.
100
-
101
- The best resource is looking at the [already implemented adapters](#adapters).
102
-
103
- ### Configuration
104
-
105
- When instantiating a resource it now needs to be configured for Semian. This is
106
- done by passing `semian` as an argument when initializing the client. Examples
107
- for the built-in adapters:
108
-
109
- ```ruby
110
- # MySQL2 client
111
- # In Rails this means having a Semian key in database.yml for each db.
112
- client = Mysql2::Client.new(host: "localhost", username: "root", semian: {
113
- name: "master",
114
- tickets: 8, # See the Understanding Semian section on picking these values
115
- success_threshold: 2,
116
- error_threshold: 3,
117
- error_timeout: 10
118
- })
119
-
120
- # Redis client
121
- client = Redis.new(semian: {
122
- name: "inventory",
123
- tickets: 4,
124
- success_threshold: 2,
125
- error_threshold: 4,
126
- error_timeout: 20
127
- })
128
- ```
129
-
130
- #### Net::HTTP
131
- For the `Net::HTTP` specific Semian adapter, since many external libraries may create
132
- HTTP connections on the user's behalf, the parameters are instead provided
133
- by associating callback functions with `Semian::NetHTTP`, perhaps in an initialization file.
134
-
135
- ##### Naming and Options
136
- To give Semian parameters, assign a `proc` to `Semian::NetHTTP.semian_configuration`
137
- that takes two parameters, `host` and `port` like `127.0.0.1`,`443` or `github_com`,`80`,
138
- and returns a `Hash` with configuration parameters as follows. The `proc` is used as a
139
- callback to initialize the configuration options, similar to other adapters.
140
-
141
- ```ruby
142
- SEMIAN_PARAMETERS = { tickets: 1,
143
- success_threshold: 1,
144
- error_threshold: 3,
145
- error_timeout: 10 }
146
- Semian::NetHTTP.semian_configuration = proc do |host, port|
147
- # Let's make it only active for github.com
148
- if host == "github.com" && port == "80"
149
- SEMIAN_PARAMETERS.merge(name: "github.com_80")
150
- else
151
- nil
152
- end
153
- end
154
-
155
- # Called from within API:
156
- # semian_options = Semian::NetHTTP.semian_configuration("github.com", 80)
157
- # semian_identifier = "nethttp_#{semian_options[:name]}"
158
- ```
159
-
160
- The `name` should be carefully chosen since it identifies the resource being protected.
161
- The `semian_options` passed apply to that resource. Semian creates the `semian_identifier`
162
- from the `name` to look up and store changes in the circuit breaker and bulkhead states
163
- and associate successes, failures, errors with the protected resource.
164
-
165
- We only require that:
166
- * the `semian_configuration` be **set only once** over the lifetime of the library
167
- * the output of the `proc` be the same over time, that is, the configuration produced by
168
- each pair of `host`, `port` is **the same each time** the callback is invoked.
169
-
170
- For most purposes, `"#{host}_#{port}"` is a good default `name`. Custom `name` formats
171
- can be useful for grouping related subdomains as one resource, so that they all
172
- contribute to the same circuit breaker and bulkhead state and fail together.
173
-
174
- A return value of `nil` for `semian_configuration` means Semian is disabled for that
175
- HTTP endpoint. This works well since the result of a failed Hash lookup is `nil` also.
176
- This behavior lets the adapter default to whitelisting, although the
177
- behavior can be changed to blacklisting or even be completely disabled by varying
178
- the use of returning `nil` in the assigned closure.
179
-
180
- ##### Additional Exceptions
181
- Since we envision this particular adapter can be used in combination with many
182
- external libraries, that can raise additional exceptions, we added functionality to
183
- expand the Exceptions that can be tracked as part of Semian's circuit breaker.
184
- This may be necessary for libraries that introduce new exceptions or re-raise them.
185
- Add exceptions and reset to the [`default`][nethttp-default-errors] list using the following:
186
-
187
- ```ruby
188
- # assert_equal(Semian::NetHTTP.exceptions, Semian::NetHTTP::DEFAULT_ERRORS)
189
- Semian::NetHTTP.exceptions += [::OpenSSL::SSL::SSLError]
190
-
191
- Semian::NetHTTP.reset_exceptions
192
- # assert_equal(Semian::NetHTTP.exceptions, Semian::NetHTTP::DEFAULT_ERRORS)
193
- ```
194
-
195
- # Understanding Semian
196
-
197
- Semian is a library with heuristics for failing fast. This section will explain
198
- in depth how Semian works and which situations it's applicable for. First we
199
- explain the category of problems Semian is meant to solve. Then we dive into how
200
- Semian works to solve these problems.
201
-
202
- ## Do I need Semian?
203
-
204
- Semian is not a trivial library to understand, introduces complexity and thus
205
- should be introduced with care. Remember, all Semian does is raise exceptions
206
- based on heuristics. It is paramount that you understand Semian before
207
- including it in production as you may otherwise be surprised by its behaviour.
208
-
209
- Applications that benefit from Semian are those working on eliminating SPOFs
210
- (Single Points of Failure), and specifically are running into a wall regarding
211
- slow resources. But it is by no means a magic wand that solves all your latency
212
- problems by being added to your `Gemfile`. This section describes the types of
213
- problems Semian solves.
214
-
215
- If your application is multithreaded or evented (e.g. not Resque and Unicorn)
216
- these problems are not as pressing. You can still get use out of Semian however.
217
-
218
- ### Real World Example
219
-
220
- This is better illustrated with a real world example from Shopify. When you are
221
- browsing a store while signed in, Shopify stores your session in Redis.
222
- If Redis becomes unavailable, the driver will start throwing exceptions.
223
- We rescue these exceptions and simply disable all customer sign in functionality
224
- on the store until Redis is back online.
225
-
226
- This is great if querying the resource fails instantly, because it means we fail
227
- in just a single roundtrip of ~1ms. But if the resource is unresponsive or slow,
228
- this can take as long as our timeout which is easily 200ms. This means every
229
- request, even if it does rescue the exception, now takes an extra 200ms.
230
- Because every resource takes that long, our capacity is also significantly
231
- degraded. These problems are explained in depth in the next two sections.
232
-
233
- With Semian, the slow resource would fail instantly (after a small amount of
234
- convergence time) preventing your response time from spiking and not decreasing
235
- capacity of the cluster.
236
-
237
- If this sounds familiar to you, Semian is what you need to be resilient to
238
- latency. You may not need the graceful fallback depending on your application,
239
- in which case it will just result in an error (e.g. a `HTTP 500`) faster.
240
-
241
- We will now examine the two problems in detail.
242
-
243
- #### In-depth analysis of real world example
244
-
245
- If a single resource is slow, every single request is going to suffer. We saw
246
- this in the example before. Let's illustrate this more clearly in the following
247
- Rails example where the user session is stored in Redis:
248
-
249
- ```ruby
250
- def index
251
- @user = fetch_user
252
- @posts = Post.all
253
- end
254
-
255
- private
256
- def fetch_user
257
- user = User.find(session[:user_id])
258
- rescue Redis::CannotConnectError
259
- nil
260
- end
261
- ```
262
-
263
- Our code is resilient to a failure of the session layer, it doesn't `HTTP 500`
264
- if the session store is unavailable (this can be tested with
265
- [Toxiproxy][toxiproxy]). If the `User` and `Post` data store is unavailable, the
266
- server will send back `HTTP 500`. We accept that, because it's our primary data
267
- store. This could be prevented with a caching tier or something else out of
268
- scope.
269
-
270
- This code has two flaws however:
271
-
272
- 1. **What happens if the session storage is consistently slow?** I.e. the majority
273
- of requests take, say, more than half the timeout time (but it should only
274
- take ~1ms)?
275
- 2. **What happens if the session storage is unavailable and is not responding at
276
- all?** I.e. we hit timeouts on every request.
277
-
278
- These two problems in turn have two related problems associated with them:
279
- response time and capacity.
280
-
281
- #### Response time
282
-
283
- Requests that attempt to access a down session storage are all gracefully handled, the
284
- `@user` will simply be `nil`, which the code handles. There is still a
285
- major impact on users however, as every request to the storage has to time
286
- out. This causes the average response time to all pages that access it to go up by
287
- however long your timeout is. Your timeout is proportional to your worst case timeout,
288
- as well as the number of attempts to hit it on each page. This is the problem Semian
289
- solves by using heuristics to fail these requests early which causes a much better
290
- user experience during downtime.
291
-
292
- #### Capacity loss
293
-
294
- When your single-threaded worker is waiting for a resource to return, it's
295
- effectively doing nothing when it could be serving fast requests. To use the
296
- example from before, perhaps some actions do not access the session storage at
297
- all. These requests will pile up behind the now slow requests that are trying to
298
- access that layer, because they're failing slowly. Essentially, your capacity
299
- degrades significantly because your average response time goes up (as explained
300
- in the previous section). Capacity loss simply follows from an increase in
301
- response time. The higher your timeout and the slower your resource, the more
302
- capacity you lose.
303
-
304
- #### Timeouts aren't enough
305
-
306
- It should be clear by now that timeouts aren't enough. Consistent timeouts will
307
- increase the average response time, which causes a bad user experience, and
308
- ultimately compromise the performance of the entire system. Even if the timeout
309
- is as low as ~250ms (just enough to allow a single TCP retransmit) there's a
310
- large loss of capacity and for many applications a 100-300% increase in average
311
- response time. This is the problem Semian solves by failing fast.
312
-
313
- ## How does Semian work?
314
-
315
- Semian consists of two parts: circuit breaker and bulkheading. To understand
316
- Semian, and especially how to configure it, we must understand these patterns
317
- and their implementation.
318
-
319
- ### Circuit Breaker
320
-
321
- The circuit breaker pattern is based on a simple observation - if we hit a
322
- timeout or any other error for a given service one or more times, we’re likely
323
- to hit it again for some amount of time. Instead of hitting the timeout
324
- repeatedly, we can mark the resource as dead for some amount of time during
325
- which we raise an exception instantly on any call to it. This is called the
326
- [circuit breaker pattern][cbp].
327
-
328
- ![](http://cdn.shopify.com/s/files/1/0070/7032/files/image01_grande.png)
329
-
330
- When we perform a Remote Procedure Call (RPC), it will first check the circuit.
331
- If the circuit is rejecting requests because of too many failures reported by
332
- the driver, it will throw an exception immediately. Otherwise the circuit will
333
- call the driver. If the driver fails to get data back from the data store, it
334
- will notify the circuit. The circuit will count the error so that if too many
335
- errors have happened recently, it will start rejecting requests immediately
336
- instead of waiting for the driver to time out. The exception will then be raised
337
- back to the original caller. If the driver’s request was successful, it will
338
- return the data back to the calling method and notify the circuit that it made a
339
- successful call.
340
-
341
- The state of the circuit breaker is local to the worker and is not shared across
342
- all workers on a server.
343
-
344
- #### Circuit Breaker Configuration
345
-
346
- There are three configuration parameters for circuit breakers in Semian:
347
-
348
- * **error_threshold**. The amount of errors to encounter for the worker before
349
- opening the circuit, that is to start rejecting requests instantly.
350
- * **error_timeout**. The amount of time until trying to query the resource
351
- again.
352
- * **success_threshold**. The amount of successes on the circuit until closing it
353
- again, that is to start accepting all requests to the circuit.
354
-
355
- ### Bulkheading
356
-
357
- For many applications, circuit breakers are not enough however. This is best
358
- illustrated with an extreme. Imagine if the timeout for our data store isn't as
359
- low as 200ms, but actually 10 seconds. For example, you might have a relational data
360
- store where for some customers, 10s queries are (unfortunately) legitimate.
361
- Reducing the time of worst case queries requires a lot of effort. Dropping the
362
- query immediately could potentially leave some customers unable to access certain
363
- functionality. High timeouts are especially critical in a non-threaded
364
- environment where blocking IO means a worker is effectively doing nothing.
365
-
366
- In this case, circuit breakers aren't sufficient. Assuming the circuit is shared
367
- across all processes on a server, it will still take at least 10s before the
368
- circuit is open—in that time every worker is blocked. Meaning we are in a
369
- reduced capacity state for at least 20s, with the last 10s timeouts
370
- occurring just before the circuit opens at the 10s mark when a couple of
371
- workers have hit a timeout and the circuit opens. We thought of a number of
372
- potential solutions to this problem - stricter timeouts, grouping timeouts by
373
- section of our application, timeouts per statement—but they all still revolved
374
- around timeouts, and those are extremely hard to get right.
375
-
376
- Instead of thinking about timeouts, we took inspiration from Hystrix by Netflix
377
- and the book Release It (the resiliency bible), and look at our services as
378
- connection pools. On a server with `W` workers, only a certain number of them
379
- are expected to be talking to a single data store at once. Let's say we've
380
- determined from our monitoring that there’s a 10% chance they’re talking to
381
- `mysql_shard_0` at any given point in time under normal traffic. The probability
382
- that five workers are talking to it at the same time is 0.001%. If we only allow
383
- five workers to talk to a resource at any given point in time, and accept the
384
- 0.001% false positive rate—we can fail the sixth worker attempting to check out
385
- a connection instantly. This means that while the five workers are waiting for a
386
- timeout, all the other `W-5` workers on the node will instantly be failing on
387
- checking out the connection and opening their circuits. Our capacity is only
388
- degraded by a relatively small amount.
389
-
390
- We call this limitation primitive "tickets". In this case, the resource access
391
- is limited to 5 tickets (see Configuration). The timeout value specifies the
392
- maximum amount of time to block if no ticket is available.
393
-
394
- How do we limit the access to a resource for all workers on a server when the
395
- workers do not directly share memory? This is implemented with [SysV
396
- semaphores][sysv] to provide server-wide access control.
397
-
398
- #### Bulkhead Configuration
399
-
400
- There are two configuration values. It's not easy to choose good values and we're
401
- still experimenting with ways to figure out optimal ticket numbers. Generally
402
- something below half the number of workers on the server for endpoints that are
403
- queried frequently has worked well for us.
404
-
405
- * **tickets**. Number of workers that can concurrently access a resource.
406
- * **timeout**. Time to wait to acquire a ticket if there are no tickets left.
407
- We recommend this to be `0` unless you have very few workers running (i.e.
408
- less than ~5).
409
-
410
- ## Defense line
411
-
412
- The finished defense line for resource access with circuit breakers and
413
- bulkheads then looks like this:
414
-
415
- ![](http://cdn.shopify.com/s/files/1/0070/7032/files/image02_grande.png)
416
-
417
- The RPC first checks the circuit; if the circuit is open it will raise the
418
- exception straight away which will trigger the fallback (the default fallback is
419
- a 500 response). Otherwise, it will try Semian which fails instantly if too many
420
- workers are already querying the resource. Finally the driver will query the
421
- data store. If the data store succeeds, the driver will return the data back to
422
- the RPC. If the data store is slow or fails, this is our last line of defense
423
- against a misbehaving resource. The driver will raise an exception after trying
424
- to connect with a timeout or after an immediate failure. These driver actions
425
- will affect the circuit and Semian, which can make future calls fail faster.
426
-
427
- ## Failing gracefully
428
-
429
- Ok, great, we've got a way to fail fast with slow resources, how does that make
430
- my application more resilient?
431
-
432
- Failing fast is only half the battle. It's up to you what you do with these
433
- errors; in the [session example](#real-world-example) we handle it gracefully by
434
- signing people out and disabling all session related functionality till the data
435
- store is back online. However, not rescuing the exception and simply sending
436
- `HTTP 500` back to the client faster will help with [capacity
437
- loss](#capacity-loss).
438
-
439
- ### Exceptions inherit from base class
440
-
441
- It's important to understand that the exceptions raised by [Semian
442
- Adapters](#adapters) inherit from the base class of the driver itself, meaning
443
- that if you do something like:
444
-
445
- ```ruby
446
- def posts
447
- Post.all
448
- rescue Mysql2::Error
449
- []
450
- end
451
- ```
452
-
453
- Exceptions raised by Semian's `MySQL2` adapter will also get caught.
454
-
455
- ### Patterns
456
-
457
- We do not recommend mindlessly sprinkling `rescue`s all over the place. What you
458
- should do instead is write decorators around secondary data stores (e.g. sessions)
459
- that provide resiliency for free. For example, if we stored the tags associated
460
- with products in a secondary data store it could look something like this:
461
-
462
- ```ruby
463
- # Resilient decorator for storing a Set in Redis.
464
- class RedisSet
465
- def initialize(key)
466
- @key = key
467
- end
468
-
469
- def get
470
- redis.smembers(@key)
471
- rescue Redis::BaseConnectionError
472
- []
473
- end
474
-
475
- private
476
-
477
- def redis
478
- @redis ||= Redis.new
479
- end
480
- end
481
-
482
- class Product
483
- # This will simply return an empty array in the case of a Redis outage.
484
- def tags
485
- tags_set.get
486
- end
487
-
488
- private
489
-
490
- def tags_set
491
- @tags_set ||= RedisSet.new("product:tags:#{self.id}")
492
- end
493
- end
494
- ```
495
-
496
- These decorators can be resiliency tested with [Toxiproxy][toxiproxy]. You can
497
- provide fallbacks around your primary data store as well. In our case, we simply return
498
- `HTTP 500` in those cases unless it's cached because these pages aren't worth
499
- much without data from their primary data store.
500
-
501
- ## Monitoring
502
-
503
- With [`Semian::Instrumentable`][semian-instrumentable] clients can monitor
504
- Semian internals. For example, to instrument just events with
505
- [`statsd-instrument`][statsd-instrument]:
506
-
507
- ```ruby
508
- # `event` is `success`, `busy`, `circuit_open`.
509
- # `resource` is the `Semian::Resource` object
510
- # `scope` is `connection` or `query` (others can be instrumented too from the adapter)
511
- # `adapter` is the name of the adapter (mysql2, redis, ..)
512
- Semian.subscribe do |event, resource, scope, adapter|
513
- StatsD.increment("Shopify.#{adapter}.semian.#{event}", 1, tags: [
514
- "resource:#{resource.name}",
515
- "total_tickets:#{resource.tickets}",
516
- "type:#{scope}",
517
- ])
518
- end
519
- ```
520
-
521
- # FAQ
522
-
523
- **How does Semian work with containers?** Semian uses [SysV semaphores][sysv] to
524
- coordinate access to a resource. The semaphore is only shared within the
525
- [IPC namespace][namespaces]. Unless you are running many workers inside every container,
526
- this leaves the bulkheading pattern effectively useless. We recommend sharing
527
- the IPC namespace between all containers on your host for the best ticket
528
- economy. If you are using Docker, this can be done with the [--ipc
529
- flag](https://docs.docker.com/reference/run/#ipc-settings).
530
-
531
- **Why isn't resource access shared across the entire cluster?** This implies a
532
- coordination data store. Semian would have to be resilient to failures of this
533
- data store as well, and fall back to other primitives. While it's nice to have
534
- all workers have the same view of the world, this greatly increases the
535
- complexity of the implementation which is not favourable for resiliency code.
536
-
537
- **Why isn't the circuit breaker implemented as a host-wide mechanism?** No good
538
- reason. Patches welcome!
539
-
540
- **Why is there no fallback mechanism in Semian?** Read the [Failing
541
- Gracefully](#failing-gracefully) section. In short, exceptions are exactly this.
542
- We did not want to put an extra level of abstraction on top of this. In the
543
- first internal implementation this was the case, but we later moved away from
544
- it.
545
-
546
- **Why does it not use normal Ruby semaphores?** To work properly the access
547
- control needs to be performed across many workers. With MRI that means having
548
- multiple processes, not threads. Thus we need a primitive outside of the
549
- interpreter. For other Ruby implementations a driver that uses Ruby semaphores
550
- could be used (and would be accepted as a PR).
551
-
552
- **Why are there three semaphores in the semaphore sets for each resource?** This
553
- has to do with being able to resize the number of tickets for a resource online.
554
-
555
- **Can I change the number of tickets freely?** Yes, the logic for this isn't
556
- trivial but it works well.
557
-
558
- **What is the performance overhead of Semian?** Extremely minimal in comparison
559
- to going to the network. Don't worry about it unless you're instrumenting
560
- non-IO.
561
-
562
- [hystrix]: https://github.com/Netflix/Hystrix
563
- [release-it]: https://pragprog.com/book/mnee/release-it
564
- [shopify]: http://www.shopify.com/
565
- [mysql-semian-adapter]: lib/semian/mysql2.rb
566
- [redis-semian-adapter]: lib/semian/redis.rb
567
- [semian-adapter]: lib/semian/adapter.rb
568
- [nethttp-semian-adapter]: lib/semian/net_http.rb
569
- [nethttp-default-errors]: lib/semian/net_http.rb#L35-L45
570
- [semian-instrumentable]: lib/semian/instrumentable.rb
571
- [statsd-instrument]: http://github.com/shopify/statsd-instrument
572
- [resiliency-blog-post]: http://www.shopify.com/technology/16906928-building-and-testing-resilient-ruby-on-rails-applications
573
- [toxiproxy]: https://github.com/Shopify/toxiproxy
574
- [sysv]: http://man7.org/linux/man-pages/man7/svipc.7.html
575
- [cbp]: https://en.wikipedia.org/wiki/Circuit_breaker_design_pattern
576
- [namespaces]: http://man7.org/linux/man-pages/man7/namespaces.7.html