semian 0.6.0 → 0.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,67 @@
1
+ /*
2
+ System, 3rd party, and project includes
3
+
4
+ Implements Init_semian, which is used as C/Ruby entrypoint.
5
+ */
6
+
7
+ #ifndef SEMIAN_H
8
+ #define SEMIAN_H
9
+
10
+ // System includes
11
+ #include <errno.h>
12
+ #include <string.h>
13
+ #include <stdio.h>
14
+
15
+ // 3rd party includes
16
+ #include <openssl/sha.h>
17
+ #include <ruby.h>
18
+ #include <ruby/util.h>
19
+ #include <ruby/io.h>
20
+
21
+ //semian includes
22
+ #include "types.h"
23
+ #include "resource.h"
24
+
25
+ // FIXME: This is needed here temporarily
26
+ // Defines for ruby threading primitives
27
+ #if defined(HAVE_RB_THREAD_CALL_WITHOUT_GVL) && defined(HAVE_RUBY_THREAD_H)
28
+ // 2.0
29
+ #include <ruby/thread.h>
30
+ #define WITHOUT_GVL(fn,a,ubf,b) rb_thread_call_without_gvl((fn),(a),(ubf),(b))
31
+ #elif defined(HAVE_RB_THREAD_BLOCKING_REGION)
32
+ // 1.9
33
+ typedef VALUE (*my_blocking_fn_t)(void*);
34
+ #define WITHOUT_GVL(fn,a,ubf,b) rb_thread_blocking_region((my_blocking_fn_t)(fn),(a),(ubf),(b))
35
+ #endif
36
+
37
+ VALUE eSyscall, eTimeout, eInternal;
38
+
39
+ void Init_semian();
40
+
41
+ // FIXME: These are needed here temporarily while we move functions around
42
+ // Will be removed once there are new header files that they should belong to.
43
+ void
44
+ configure_tickets(int sem_id, int tickets, int should_initialize);
45
+
46
+ key_t
47
+ generate_key(const char *name);
48
+
49
+ void
50
+ set_semaphore_permissions(int sem_id, long permissions);
51
+
52
+ int
53
+ create_semaphore(int key, long permissions, int *created);
54
+
55
+ int
56
+ get_semaphore(int key);
57
+
58
+ void
59
+ raise_semian_syscall_error(const char *syscall, int error_num);
60
+
61
+ int
62
+ perform_semop(int sem_id, short index, short op, short flags, struct timespec *ts);
63
+
64
+ void *
65
+ acquire_semaphore_without_gvl(void *p);
66
+
67
+ #endif //SEMIAN_H
@@ -0,0 +1,45 @@
1
+ /*
2
+ For custom type definitions specific to semian
3
+ */
4
+ #ifndef SEMIAN_TYPES_H
5
+ #define SEMIAN_TYPES_H
6
+
7
+ #include <sys/types.h>
8
+ #include <sys/ipc.h>
9
+ #include <sys/sem.h>
10
+ #include <sys/time.h>
11
+
12
+ // For sysV semop syscalls
13
+ // see man semop
14
+ union semun {
15
+ int val; /* Value for SETVAL */
16
+ struct semid_ds *buf; /* Buffer for IPC_STAT, IPC_SET */
17
+ unsigned short *array; /* Array for GETALL, SETALL */
18
+ struct seminfo *__buf; /* Buffer for IPC_INFO
19
+ (Linux-specific) */
20
+ };
21
+
22
+ // To update the ticket count
23
+ typedef struct {
24
+ int sem_id;
25
+ int tickets;
26
+ } update_ticket_count_t;
27
+
28
+ // Internal semaphore structure
29
+ typedef struct {
30
+ int sem_id;
31
+ struct timespec timeout;
32
+ int error;
33
+ char *name;
34
+ } semian_resource_t;
35
+
36
+ // FIXME: move this to more appropriate location once the file exists
37
+ typedef enum
38
+ {
39
+ SI_SEM_TICKETS, // semaphore for the tickets currently issued
40
+ SI_SEM_CONFIGURED_TICKETS, // semaphore to track the desired number of tickets available for issue
41
+ SI_SEM_LOCK, // metadata lock to act as a mutex, ensuring thread-safety for updating other semaphores
42
+ SI_NUM_SEMAPHORES // always leave this as last entry for count to be accurate
43
+ } semaphore_indices;
44
+
45
+ #endif // SEMIAN_TYPES_H
@@ -1,3 +1,3 @@
1
1
  module Semian
2
- VERSION = '0.6.0'
2
+ VERSION = '0.6.1'
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: semian
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.0
4
+ version: 0.6.1
5
5
  platform: ruby
6
6
  authors:
7
7
  - Scott Francis
@@ -9,7 +9,7 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2016-10-17 00:00:00.000000000 Z
12
+ date: 2017-02-15 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: rake-compiler
@@ -132,17 +132,12 @@ extensions:
132
132
  - ext/semian/extconf.rb
133
133
  extra_rdoc_files: []
134
134
  files:
135
- - ".gitignore"
136
- - ".rubocop.yml"
137
- - ".ruby-version"
138
- - ".travis.yml"
139
- - CHANGELOG.md
140
- - Gemfile
141
- - LICENSE.md
142
- - README.md
143
- - Rakefile
144
135
  - ext/semian/extconf.rb
136
+ - ext/semian/resource.c
137
+ - ext/semian/resource.h
145
138
  - ext/semian/semian.c
139
+ - ext/semian/semian.h
140
+ - ext/semian/types.h
146
141
  - lib/semian.rb
147
142
  - lib/semian/adapter.rb
148
143
  - lib/semian/circuit_breaker.rb
@@ -159,23 +154,6 @@ files:
159
154
  - lib/semian/simple_state.rb
160
155
  - lib/semian/unprotected_resource.rb
161
156
  - lib/semian/version.rb
162
- - repodb.yml
163
- - scripts/install_toxiproxy.sh
164
- - semian.gemspec
165
- - test/circuit_breaker_test.rb
166
- - test/fixtures/binary.sql
167
- - test/helpers/background_helper.rb
168
- - test/instrumentation_test.rb
169
- - test/mysql2_test.rb
170
- - test/net_http_test.rb
171
- - test/redis_test.rb
172
- - test/resource_test.rb
173
- - test/semian_test.rb
174
- - test/simple_integer_test.rb
175
- - test/simple_sliding_window_test.rb
176
- - test/simple_state_test.rb
177
- - test/test_helper.rb
178
- - test/unprotected_resource_test.rb
179
157
  homepage: https://github.com/shopify/semian
180
158
  licenses:
181
159
  - MIT
@@ -196,7 +174,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
196
174
  version: '0'
197
175
  requirements: []
198
176
  rubyforge_project:
199
- rubygems_version: 2.5.1
177
+ rubygems_version: 2.5.2
200
178
  signing_key:
201
179
  specification_version: 4
202
180
  summary: Bulkheading for Ruby with SysV semaphores
data/.gitignore DELETED
@@ -1,8 +0,0 @@
1
- /.bundle/
2
- /lib/**/*.so
3
- /lib/**/*.bundle
4
- /tmp/*
5
- *.gem
6
- /html/
7
- Gemfile.lock
8
- vendor/
@@ -1,113 +0,0 @@
1
- AllCops:
2
- Exclude:
3
- - Gemfile
4
- - lib/snippets/**/*
5
- - vendor/**/*
6
- - data/**/*
7
- - db/schema.rb
8
- - db/migrate/*
9
- - test/dummy/**/*
10
- - bin/rails
11
- - lib/shipit-engine.rb
12
- - tmp/**/*
13
-
14
- Style/GuardClause:
15
- Enabled: false
16
-
17
- Lint/AssignmentInCondition:
18
- Enabled: false
19
-
20
- Lint/HandleExceptions:
21
- Enabled: false
22
-
23
- Lint/EndAlignment:
24
- Enabled: false
25
-
26
- Style/NumericLiterals:
27
- Exclude:
28
- - db/schema.rb
29
-
30
- Style/SingleSpaceBeforeFirstArg:
31
- Exclude:
32
- - db/schema.rb
33
-
34
- Style/DoubleNegation:
35
- Enabled: false
36
-
37
- Metrics/LineLength:
38
- Max: 135
39
-
40
- Metrics/MethodLength:
41
- Max: 40
42
-
43
- Metrics/ClassLength:
44
- Max: 500
45
-
46
- Metrics/AbcSize:
47
- Max: 50
48
-
49
- Metrics/CyclomaticComplexity:
50
- Max: 10
51
-
52
- Style/Documentation:
53
- Enabled: false
54
-
55
- Style/SingleLineBlockParams:
56
- Enabled: false
57
-
58
- Style/SignalException:
59
- Enabled: false
60
-
61
- Style/RaiseArgs:
62
- Enabled: false
63
-
64
- Style/ModuleFunction:
65
- Enabled: false
66
-
67
- Style/RedundantReturn:
68
- AllowMultipleReturnValues: true
69
-
70
- Style/IndentHash:
71
- Enabled: false
72
-
73
- Style/TrailingComma:
74
- EnforcedStyleForMultiline: comma
75
-
76
- Style/ClassAndModuleChildren:
77
- Enabled: false
78
-
79
- Style/PredicateName:
80
- Exclude:
81
- - app/serializers/**/*
82
-
83
- Style/SpaceInsideHashLiteralBraces:
84
- EnforcedStyle: no_space
85
-
86
- Style/StringLiterals:
87
- Enabled: false
88
-
89
- Style/PerlBackrefs:
90
- Enabled: false
91
-
92
- Style/TrivialAccessors:
93
- AllowPredicates: true
94
-
95
- Style/ExtraSpacing:
96
- AllowForAlignment: false
97
-
98
- Style/GlobalVars:
99
- Exclude:
100
- - 'ext/semian/extconf.rb'
101
-
102
- Lint/Eval:
103
- Exclude:
104
- - 'Rakefile'
105
-
106
- Metrics/ParameterLists:
107
- Enabled: false
108
-
109
- Style/IfUnlessModifier:
110
- Enabled: false
111
-
112
- Style/CaseIndentation:
113
- IndentWhenRelativeTo: end
@@ -1 +0,0 @@
1
- 2.1.6
@@ -1,15 +0,0 @@
1
- language: ruby
2
-
3
- sudo: true
4
-
5
- before_install:
6
- - gem install bundler
7
- - scripts/install_toxiproxy.sh
8
-
9
- rvm:
10
- - '2.1'
11
- - '2.2'
12
- - '2.3.1'
13
-
14
- services:
15
- - redis-server
@@ -1,11 +0,0 @@
1
- # v0.4.1
2
- * resource: cast float ticket count to fixnum #75
3
-
4
- # v0.4.0
5
-
6
- * net/http: add adapter for net/http #58
7
- * circuit_breaker: split circuit breaker into three data structures to allow for
8
- alternative implementations in the future #62
9
- * mysql: don't prevent rollbacks on transactions #60
10
- * core: fix initialization bug when the resource is accessed before the options
11
- are set #65
data/Gemfile DELETED
@@ -1,10 +0,0 @@
1
- source 'https://rubygems.org'
2
- gemspec
3
-
4
- group :debug do
5
- gem 'byebug'
6
- end
7
-
8
- group :development, :test do
9
- gem 'rubocop', '~> 0.34.2'
10
- end
data/LICENSE.md DELETED
@@ -1,21 +0,0 @@
1
- The MIT License (MIT)
2
-
3
- Copyright (c) 2014 Shopify
4
-
5
- Permission is hereby granted, free of charge, to any person obtaining a copy
6
- of this software and associated documentation files (the "Software"), to deal
7
- in the Software without restriction, including without limitation the rights
8
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
- copies of the Software, and to permit persons to whom the Software is
10
- furnished to do so, subject to the following conditions:
11
-
12
- The above copyright notice and this permission notice shall be included in all
13
- copies or substantial portions of the Software.
14
-
15
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
- SOFTWARE.
data/README.md DELETED
@@ -1,576 +0,0 @@
1
- ## Semian [![Build Status](https://travis-ci.org/Shopify/semian.svg?branch=master)](https://travis-ci.org/Shopify/semian)
2
-
3
- ![](http://i.imgur.com/7Vn2ibF.png)
4
-
5
- Semian is a library for controlling access to slow or unresponsive external
6
- services to avoid cascading failures.
7
-
8
- When services are down they typically fail fast with errors like `ECONNREFUSED`
9
- and `ECONNRESET` which can be rescued in code. However, slow resources fail
10
- slowly. The thread serving the request blocks until it hits the timeout for the
11
- slow resource. During that time, the thread is doing nothing useful and thus the
12
- slow resource has caused a cascading failure by occupying workers and therefore
13
- losing capacity. **Semian is a library for failing fast in these situations,
14
- allowing you to handle errors gracefully.** Semian does this by intercepting
15
- resource access through heuristic patterns inspired by [Hystrix][hystrix] and
16
- [Release It][release-it]:
17
-
18
- * [**Circuit breaker**](#circuit-breaker). A pattern for limiting the
19
- amount of requests to a dependency that is having issues.
20
- * [**Bulkheading**](#bulkheading). Controlling the concurrent access to
21
- a single resource, access is coordinated server-wide with [SysV
22
- semaphores][sysv].
23
-
24
- Resource drivers are monkey-patched to be aware of Semian, these are called
25
- [Semian Adapters](#adapters). Thus, every time resource access is requested
26
- Semian is queried for status on the resource first. If Semian, through the
27
- patterns above, deems the resource to be unavailable it will raise an exception.
28
- **The ultimate outcome of Semian is always an exception that can then be rescued
29
- for a graceful fallback**. Instead of waiting for the timeout, Semian raises
30
- straight away.
31
-
32
- If you are already rescuing exceptions for failing resources and timeouts,
33
- Semian is mostly a drop-in library with a little configuration that will make
34
- your code more resilient to slow resource access. But, [do you even need
35
- Semian?](#do-i-need-semian)
36
-
37
- For an overview of building resilient Ruby applications, start by reading [the
38
- Shopify blog post on Toxiproxy and Semian][resiliency-blog-post]. For more in
39
- depth information on Semian, see [Understanding Semian](#understanding-semian).
40
- Semian is an extraction from [Shopify][shopify] where it's been running
41
- successfully in production since October, 2014.
42
-
43
- The other component to your Ruby resiliency kit is [Toxiproxy][toxiproxy] to
44
- write automated resiliency tests.
45
-
46
- # Usage
47
-
48
- Install by adding the gem to your `Gemfile` and require the [adapters](#adapters) you need:
49
-
50
- ```ruby
51
- gem 'semian', require: %w(semian semian/mysql2 semian/redis)
52
- ```
53
-
54
- We recommend this pattern of requiring adapters directly from the `Gemfile`.
55
- This ensures Semian adapters are loaded as early as possible, to also
56
- protect your application during boot. Please see the [adapter configuration
57
- section](#configuration) on how to configure adapters.
58
-
59
- ## Adapters
60
-
61
- Semian works by intercepting resource access. Every time access is requested,
62
- Semian is queried, and it will raise an exception if the resource is unavailable
63
- according to the circuit breaker or bulkheads. This is done by monkey-patching
64
- the resource driver. **The exception raised by the driver always inherits from
65
- the Base exception class of the driver**, meaning you can always simply rescue
66
- the base class and catch both Semian and driver errors in the same rescue for
67
- fallbacks.
68
-
69
- The following adapters are in Semian and tested heavily in production, the
70
- version is the version of the public gem with the same name:
71
-
72
- * [`semian/mysql2`][mysql-semian-adapter] (~> 0.3.16)
73
- * [`semian/redis`][redis-semian-adapter] (~> 3.2.1)
74
- * [`semian/net_http`][nethttp-semian-adapter]
75
-
76
- ### Creating Adapters
77
-
78
- To create a Semian adapter you must implement the following methods:
79
-
80
- 1. [`include Semian::Adapter`][semian-adapter]. Use the helpers to wrap the
81
- resource. This takes care of situations such as monitoring, nested resources,
82
- unsupported platforms, creating the Semian resource if it doesn't already
83
- exist and so on.
84
- 2. `#semian_identifier`. This is responsible for returning a symbol that
85
- represents every unique resource, for example `redis_master` or
86
- `mysql_shard_1`. This is usually assembled from a `name` attribute on the
87
- Semian configuration hash, but could also be `<host>:<port>`.
88
- 3. `connect`. The name of this method varies. You must override the driver's
89
- connect method with one that wraps the connect call with
90
- `Semian::Resource#acquire`. You should do this at the lowest possible level.
91
- 4. `query`. Same as `connect` but for queries on the resource.
92
- 5. Define exceptions `ResourceBusyError` and `CircuitOpenError`. These are
93
- raised when the request was rejected early because the resource is out of
94
- tickets or because the circuit breaker is open (see [Understanding
95
- Semian](#understanding-semian)). They should inherit from the base exception
96
- class from the raw driver. For example `Mysql2::Error` or
97
- `Redis::BaseConnectionError` for the MySQL and Redis drivers. This makes it
98
- easy to `rescue` and handle them gracefully in application code, by
99
- `rescue`ing the base class.
100
-
101
- The best resource is looking at the [already implemented adapters](#adapters).
102
-
103
- ### Configuration
104
-
105
- When instantiating a resource it now needs to be configured for Semian. This is
106
- done by passing `semian` as an argument when initializing the client. Examples
107
- for the built-in adapters:
108
-
109
- ```ruby
110
- # MySQL2 client
111
- # In Rails this means having a Semian key in database.yml for each db.
112
- client = Mysql2::Client.new(host: "localhost", username: "root", semian: {
113
- name: "master",
114
- tickets: 8, # See the Understanding Semian section on picking these values
115
- success_threshold: 2,
116
- error_threshold: 3,
117
- error_timeout: 10
118
- })
119
-
120
- # Redis client
121
- client = Redis.new(semian: {
122
- name: "inventory",
123
- tickets: 4,
124
- success_threshold: 2,
125
- error_threshold: 4,
126
- error_timeout: 20
127
- })
128
- ```
129
-
130
- #### Net::HTTP
131
- For the `Net::HTTP` specific Semian adapter, since many external libraries may create
132
- HTTP connections on the user's behalf, the parameters are instead provided
133
- by associating callback functions with `Semian::NetHTTP`, perhaps in an initialization file.
134
-
135
- ##### Naming and Options
136
- To give Semian parameters, assign a `proc` to `Semian::NetHTTP.semian_configuration`
137
- that takes two parameters, `host` and `port` like `127.0.0.1`,`443` or `github_com`,`80`,
138
- and returns a `Hash` with configuration parameters as follows. The `proc` is used as a
139
- callback to initialize the configuration options, similar to other adapters.
140
-
141
- ```ruby
142
- SEMIAN_PARAMETERS = { tickets: 1,
143
- success_threshold: 1,
144
- error_threshold: 3,
145
- error_timeout: 10 }
146
- Semian::NetHTTP.semian_configuration = proc do |host, port|
147
- # Let's make it only active for github.com
148
- if host == "github.com" && port == "80"
149
- SEMIAN_PARAMETERS.merge(name: "github.com_80")
150
- else
151
- nil
152
- end
153
- end
154
-
155
- # Called from within API:
156
- # semian_options = Semian::NetHTTP.semian_configuration("github.com", 80)
157
- # semian_identifier = "nethttp_#{semian_options[:name]}"
158
- ```
159
-
160
- The `name` should be carefully chosen since it identifies the resource being protected.
161
- The `semian_options` passed apply to that resource. Semian creates the `semian_identifier`
162
- from the `name` to look up and store changes in the circuit breaker and bulkhead states
163
- and associate successes, failures, errors with the protected resource.
164
-
165
- We only require that:
166
- * the `semian_configuration` be **set only once** over the lifetime of the library
167
- * the output of the `proc` be the same over time, that is, the configuration produced by
168
- each pair of `host`, `port` is **the same each time** the callback is invoked.
169
-
170
- For most purposes, `"#{host}_#{port}"` is a good default `name`. Custom `name` formats
171
- can be useful for grouping related subdomains as one resource, so that they all
172
- contribute to the same circuit breaker and bulkhead state and fail together.
173
-
174
- A return value of `nil` for `semian_configuration` means Semian is disabled for that
175
- HTTP endpoint. This works well since the result of a failed Hash lookup is `nil` also.
176
- This behavior lets the adapter default to whitelisting, although the
177
- behavior can be changed to blacklisting or even be completely disabled by varying
178
- the use of returning `nil` in the assigned closure.
179
-
180
- ##### Additional Exceptions
181
- Since we envision this particular adapter can be used in combination with many
182
- external libraries, that can raise additional exceptions, we added functionality to
183
- expand the Exceptions that can be tracked as part of Semian's circuit breaker.
184
- This may be necessary for libraries that introduce new exceptions or re-raise them.
185
- Add exceptions and reset to the [`default`][nethttp-default-errors] list using the following:
186
-
187
- ```ruby
188
- # assert_equal(Semian::NetHTTP.exceptions, Semian::NetHTTP::DEFAULT_ERRORS)
189
- Semian::NetHTTP.exceptions += [::OpenSSL::SSL::SSLError]
190
-
191
- Semian::NetHTTP.reset_exceptions
192
- # assert_equal(Semian::NetHTTP.exceptions, Semian::NetHTTP::DEFAULT_ERRORS)
193
- ```
194
-
195
- # Understanding Semian
196
-
197
- Semian is a library with heuristics for failing fast. This section will explain
198
- in depth how Semian works and which situations it's applicable for. First we
199
- explain the category of problems Semian is meant to solve. Then we dive into how
200
- Semian works to solve these problems.
201
-
202
- ## Do I need Semian?
203
-
204
- Semian is not a trivial library to understand, introduces complexity and thus
205
- should be introduced with care. Remember, all Semian does is raise exceptions
206
- based on heuristics. It is paramount that you understand Semian before
207
- including it in production as you may otherwise be surprised by its behaviour.
208
-
209
- Applications that benefit from Semian are those working on eliminating SPOFs
210
- (Single Points of Failure), and specifically are running into a wall regarding
211
- slow resources. But it is by no means a magic wand that solves all your latency
212
- problems by being added to your `Gemfile`. This section describes the types of
213
- problems Semian solves.
214
-
215
- If your application is multithreaded or evented (e.g. not Resque and Unicorn)
216
- these problems are not as pressing. You can still get use out of Semian however.
217
-
218
- ### Real World Example
219
-
220
- This is better illustrated with a real world example from Shopify. When you are
221
- browsing a store while signed in, Shopify stores your session in Redis.
222
- If Redis becomes unavailable, the driver will start throwing exceptions.
223
- We rescue these exceptions and simply disable all customer sign in functionality
224
- on the store until Redis is back online.
225
-
226
- This is great if querying the resource fails instantly, because it means we fail
227
- in just a single roundtrip of ~1ms. But if the resource is unresponsive or slow,
228
- this can take as long as our timeout which is easily 200ms. This means every
229
- request, even if it does rescue the exception, now takes an extra 200ms.
230
- Because every resource takes that long, our capacity is also significantly
231
- degraded. These problems are explained in depth in the next two sections.
232
-
233
- With Semian, the slow resource would fail instantly (after a small amount of
234
- convergence time) preventing your response time from spiking and not decreasing
235
- capacity of the cluster.
236
-
237
- If this sounds familiar to you, Semian is what you need to be resilient to
238
- latency. You may not need the graceful fallback depending on your application,
239
- in which case it will just result in an error (e.g. a `HTTP 500`) faster.
240
-
241
- We will now examine the two problems in detail.
242
-
243
- #### In-depth analysis of real world example
244
-
245
- If a single resource is slow, every single request is going to suffer. We saw
246
- this in the example before. Let's illustrate this more clearly in the following
247
- Rails example where the user session is stored in Redis:
248
-
249
- ```ruby
250
- def index
251
- @user = fetch_user
252
- @posts = Post.all
253
- end
254
-
255
- private
256
- def fetch_user
257
- user = User.find(session[:user_id])
258
- rescue Redis::CannotConnectError
259
- nil
260
- end
261
- ```
262
-
263
- Our code is resilient to a failure of the session layer, it doesn't `HTTP 500`
264
- if the session store is unavailable (this can be tested with
265
- [Toxiproxy][toxiproxy]). If the `User` and `Post` data store is unavailable, the
266
- server will send back `HTTP 500`. We accept that, because it's our primary data
267
- store. This could be prevented with a caching tier or something else out of
268
- scope.
269
-
270
- This code has two flaws however:
271
-
272
- 1. **What happens if the session storage is consistently slow?** I.e. the majority
273
- of requests take, say, more than half the timeout time (but it should only
274
- take ~1ms)?
275
- 2. **What happens if the session storage is unavailable and is not responding at
276
- all?** I.e. we hit timeouts on every request.
277
-
278
- These two problems in turn have two related problems associated with them:
279
- response time and capacity.
280
-
281
- #### Response time
282
-
283
- Requests that attempt to access a down session storage are all gracefully handled, the
284
- `@user` will simply be `nil`, which the code handles. There is still a
285
- major impact on users however, as every request to the storage has to time
286
- out. This causes the average response time to all pages that access it to go up by
287
- however long your timeout is. Your timeout is proportional to your worst case timeout,
288
- as well as the number of attempts to hit it on each page. This is the problem Semian
289
- solves by using heuristics to fail these requests early which causes a much better
290
- user experience during downtime.
291
-
292
- #### Capacity loss
293
-
294
- When your single-threaded worker is waiting for a resource to return, it's
295
- effectively doing nothing when it could be serving fast requests. To use the
296
- example from before, perhaps some actions do not access the session storage at
297
- all. These requests will pile up behind the now slow requests that are trying to
298
- access that layer, because they're failing slowly. Essentially, your capacity
299
- degrades significantly because your average response time goes up (as explained
300
- in the previous section). Capacity loss simply follows from an increase in
301
- response time. The higher your timeout and the slower your resource, the more
302
- capacity you lose.
303
-
304
- #### Timeouts aren't enough
305
-
306
- It should be clear by now that timeouts aren't enough. Consistent timeouts will
307
- increase the average response time, which causes a bad user experience, and
308
- ultimately compromise the performance of the entire system. Even if the timeout
309
- is as low as ~250ms (just enough to allow a single TCP retransmit) there's a
310
- large loss of capacity and for many applications a 100-300% increase in average
311
- response time. This is the problem Semian solves by failing fast.
312
-
313
- ## How does Semian work?
314
-
315
- Semian consists of two parts: circuit breaker and bulkheading. To understand
316
- Semian, and especially how to configure it, we must understand these patterns
317
- and their implementation.
318
-
319
- ### Circuit Breaker
320
-
321
- The circuit breaker pattern is based on a simple observation - if we hit a
322
- timeout or any other error for a given service one or more times, we’re likely
323
- to hit it again for some amount of time. Instead of hitting the timeout
324
- repeatedly, we can mark the resource as dead for some amount of time during
325
- which we raise an exception instantly on any call to it. This is called the
326
- [circuit breaker pattern][cbp].
327
-
328
- ![](http://cdn.shopify.com/s/files/1/0070/7032/files/image01_grande.png)
329
-
330
- When we perform a Remote Procedure Call (RPC), it will first check the circuit.
331
- If the circuit is rejecting requests because of too many failures reported by
332
- the driver, it will throw an exception immediately. Otherwise the circuit will
333
- call the driver. If the driver fails to get data back from the data store, it
334
- will notify the circuit. The circuit will count the error so that if too many
335
- errors have happened recently, it will start rejecting requests immediately
336
- instead of waiting for the driver to time out. The exception will then be raised
337
- back to the original caller. If the driver’s request was successful, it will
338
- return the data back to the calling method and notify the circuit that it made a
339
- successful call.
340
-
341
- The state of the circuit breaker is local to the worker and is not shared across
342
- all workers on a server.
343
-
344
- #### Circuit Breaker Configuration
345
-
346
- There are three configuration parameters for circuit breakers in Semian:
347
-
348
- * **error_threshold**. The amount of errors to encounter for the worker before
349
- opening the circuit, that is to start rejecting requests instantly.
350
- * **error_timeout**. The amount of time until trying to query the resource
351
- again.
352
- * **success_threshold**. The amount of successes on the circuit until closing it
353
- again, that is to start accepting all requests to the circuit.
354
-
355
- ### Bulkheading
356
-
357
- For many applications, circuit breakers are not enough however. This is best
358
- illustrated with an extreme. Imagine if the timeout for our data store isn't as
359
- low as 200ms, but actually 10 seconds. For example, you might have a relational data
360
- store where for some customers, 10s queries are (unfortunately) legitimate.
361
- Reducing the time of worst case queries requires a lot of effort. Dropping the
362
- query immediately could potentially leave some customers unable to access certain
363
- functionality. High timeouts are especially critical in a non-threaded
364
- environment where blocking IO means a worker is effectively doing nothing.
365
-
366
- In this case, circuit breakers aren't sufficient. Assuming the circuit is shared
367
- across all processes on a server, it will still take at least 10s before the
368
- circuit is open—in that time every worker is blocked. Meaning we are in a
369
- reduced capacity state for at least 20s, with the last 10s timeouts
370
- occurring just before the circuit opens at the 10s mark when a couple of
371
- workers have hit a timeout and the circuit opens. We thought of a number of
372
- potential solutions to this problem - stricter timeouts, grouping timeouts by
373
- section of our application, timeouts per statement—but they all still revolved
374
- around timeouts, and those are extremely hard to get right.
375
-
376
- Instead of thinking about timeouts, we took inspiration from Hystrix by Netflix
377
- and the book Release It (the resiliency bible), and look at our services as
378
- connection pools. On a server with `W` workers, only a certain number of them
379
- are expected to be talking to a single data store at once. Let's say we've
380
- determined from our monitoring that there’s a 10% chance they’re talking to
381
- `mysql_shard_0` at any given point in time under normal traffic. The probability
382
- that five workers are talking to it at the same time is 0.001%. If we only allow
383
- five workers to talk to a resource at any given point in time, and accept the
384
- 0.001% false positive rate—we can fail the sixth worker attempting to check out
385
- a connection instantly. This means that while the five workers are waiting for a
386
- timeout, all the other `W-5` workers on the node will instantly be failing on
387
- checking out the connection and opening their circuits. Our capacity is only
388
- degraded by a relatively small amount.
389
-
390
- We call this limitation primitive "tickets". In this case, the resource access
391
- is limited to 5 tickets (see Configuration). The timeout value specifies the
392
- maximum amount of time to block if no ticket is available.
393
-
394
- How do we limit the access to a resource for all workers on a server when the
395
- workers do not directly share memory? This is implemented with [SysV
396
- semaphores][sysv] to provide server-wide access control.
397
-
398
- #### Bulkhead Configuration
399
-
400
- There are two configuration values. It's not easy to choose good values and we're
401
- still experimenting with ways to figure out optimal ticket numbers. Generally
402
- something below half the number of workers on the server for endpoints that are
403
- queried frequently has worked well for us.
404
-
405
- * **tickets**. Number of workers that can concurrently access a resource.
406
- * **timeout**. Time to wait to acquire a ticket if there are no tickets left.
407
- We recommend this to be `0` unless you have very few workers running (i.e.
408
- less than ~5).
409
-
410
- ## Defense line
411
-
412
- The finished defense line for resource access with circuit breakers and
413
- bulkheads then looks like this:
414
-
415
- ![](http://cdn.shopify.com/s/files/1/0070/7032/files/image02_grande.png)
416
-
417
- The RPC first checks the circuit; if the circuit is open it will raise the
418
- exception straight away which will trigger the fallback (the default fallback is
419
- a 500 response). Otherwise, it will try Semian which fails instantly if too many
420
- workers are already querying the resource. Finally the driver will query the
421
- data store. If the data store succeeds, the driver will return the data back to
422
- the RPC. If the data store is slow or fails, this is our last line of defense
423
- against a misbehaving resource. The driver will raise an exception after trying
424
- to connect with a timeout or after an immediate failure. These driver actions
425
- will affect the circuit and Semian, which can make future calls fail faster.
426
-
427
- ## Failing gracefully
428
-
429
- OK, great, we've got a way to fail fast with slow resources. How does that make
430
- my application more resilient?
431
-
432
- Failing fast is only half the battle. It's up to you what you do with these
433
- errors; in the [session example](#real-world-example) we handle it gracefully by
434
- signing people out and disabling all session related functionality till the data
435
- store is back online. However, not rescuing the exception and simply sending
436
- `HTTP 500` back to the client faster will help with [capacity
437
- loss](#capacity-loss).
438
-
439
- ### Exceptions inherit from base class
440
-
441
- It's important to understand that the exceptions raised by [Semian
442
- Adapters](#adapters) inherit from the base class of the driver itself, meaning
443
- that if you do something like:
444
-
445
- ```ruby
446
- def posts
447
- Post.all
448
- rescue Mysql2::Error
449
- []
450
- end
451
- ```
452
-
453
- Exceptions raised by Semian's `MySQL2` adapter will also get caught.
454
-
455
- ### Patterns
456
-
457
- We do not recommend mindlessly sprinkling `rescue`s all over the place. What you
458
- should do instead is write decorators around secondary data stores (e.g. sessions)
459
- that provide resiliency for free. For example, if we stored the tags associated
460
- with products in a secondary data store it could look something like this:
461
-
462
- ```ruby
463
- # Resilient decorator for storing a Set in Redis.
464
- class RedisSet
465
- def initialize(key)
466
- @key = key
467
- end
468
-
469
- def get
470
- redis.smembers(@key)
471
- rescue Redis::BaseConnectionError
472
- []
473
- end
474
-
475
- private
476
-
477
- def redis
478
- @redis ||= Redis.new
479
- end
480
- end
481
-
482
- class Product
483
- # This will simply return an empty array in the case of a Redis outage.
484
- def tags
485
- tags_set.get
486
- end
487
-
488
- private
489
-
490
- def tags_set
491
- @tags_set ||= RedisSet.new("product:tags:#{self.id}")
492
- end
493
- end
494
- ```
495
-
496
- These decorators can be resiliency tested with [Toxiproxy][toxiproxy]. You can
497
- provide fallbacks around your primary data store as well. In our case, we simply
498
- return `HTTP 500` in those cases unless it's cached because these pages aren't worth
499
- much without data from their primary data store.
500
-
501
- ## Monitoring
502
-
503
- With [`Semian::Instrumentable`][semian-instrumentable] clients can monitor
504
- Semian internals. For example to instrument just events with
505
- [`statsd-instrument`][statsd-instrument]:
506
-
507
- ```ruby
508
- # `event` is `success`, `busy`, `circuit_open`.
509
- # `resource` is the `Semian::Resource` object
510
- # `scope` is `connection` or `query` (others can be instrumented too from the adapter)
511
- # `adapter` is the name of the adapter (mysql2, redis, ..)
512
- Semian.subscribe do |event, resource, scope, adapter|
513
- StatsD.increment("Shopify.#{adapter}.semian.#{event}", 1, tags: [
514
- "resource:#{resource.name}",
515
- "total_tickets:#{resource.tickets}",
516
- "type:#{scope}",
517
- ])
518
- end
519
- ```
520
-
521
- # FAQ
522
-
523
- **How does Semian work with containers?** Semian uses [SysV semaphores][sysv] to
524
- coordinate access to a resource. The semaphore is only shared within the
525
- [IPC namespace][namespaces]. Unless you are running many workers inside every container,
526
- this leaves the bulkheading pattern effectively useless. We recommend sharing
527
- the IPC namespace between all containers on your host for the best ticket
528
- economy. If you are using Docker, this can be done with the [--ipc
529
- flag](https://docs.docker.com/reference/run/#ipc-settings).
530
-
531
- **Why isn't resource access shared across the entire cluster?** This implies a
532
- coordination data store. Semian would have to be resilient to failures of this
533
- data store as well, and fall back to other primitives. While it's nice to have
534
- all workers have the same view of the world, this greatly increases the
535
- complexity of the implementation which is not favourable for resiliency code.
536
-
537
- **Why isn't the circuit breaker implemented as a host-wide mechanism?** No good
538
- reason. Patches welcome!
539
-
540
- **Why is there no fallback mechanism in Semian?** Read the [Failing
541
- Gracefully](#failing-gracefully) section. In short, exceptions are exactly this.
542
- We did not want to put an extra level of abstraction on top of this. In the
543
- first internal implementation this was the case, but we later moved away from
544
- it.
545
-
546
- **Why does it not use normal Ruby semaphores?** To work properly the access
547
- control needs to be performed across many workers. With MRI that means having
548
- multiple processes, not threads. Thus we need a primitive outside of the
549
- interpreter. For other Ruby implementations a driver that uses Ruby semaphores
550
- could be used (and would be accepted as a PR).
551
-
552
- **Why are there three semaphores in the semaphore sets for each resource?** This
553
- has to do with being able to resize the number of tickets for a resource online.
554
-
555
- **Can I change the number of tickets freely?** Yes, the logic for this isn't
556
- trivial but it works well.
557
-
558
- **What is the performance overhead of Semian?** Extremely minimal in comparison
559
- to going to the network. Don't worry about it unless you're instrumenting
560
- non-IO.
561
-
562
- [hystrix]: https://github.com/Netflix/Hystrix
563
- [release-it]: https://pragprog.com/book/mnee/release-it
564
- [shopify]: http://www.shopify.com/
565
- [mysql-semian-adapter]: lib/semian/mysql2.rb
566
- [redis-semian-adapter]: lib/semian/redis.rb
567
- [semian-adapter]: lib/semian/adapter.rb
568
- [nethttp-semian-adapter]: lib/semian/net_http.rb
569
- [nethttp-default-errors]: lib/semian/net_http.rb#L35-L45
570
- [semian-instrumentable]: lib/semian/instrumentable.rb
571
- [statsd-instrument]: http://github.com/shopify/statsd-instrument
572
- [resiliency-blog-post]: http://www.shopify.com/technology/16906928-building-and-testing-resilient-ruby-on-rails-applications
573
- [toxiproxy]: https://github.com/Shopify/toxiproxy
574
- [sysv]: http://man7.org/linux/man-pages/man7/svipc.7.html
575
- [cbp]: https://en.wikipedia.org/wiki/Circuit_breaker_design_pattern
576
- [namespaces]: http://man7.org/linux/man-pages/man7/namespaces.7.html