swift-2.23.2-py3-none-any.whl → swift-2.35.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swift/__init__.py +29 -50
- swift/account/auditor.py +21 -118
- swift/account/backend.py +33 -28
- swift/account/reaper.py +37 -28
- swift/account/replicator.py +22 -0
- swift/account/server.py +60 -26
- swift/account/utils.py +28 -11
- swift-2.23.2.data/scripts/swift-account-audit → swift/cli/account_audit.py +23 -13
- swift-2.23.2.data/scripts/swift-config → swift/cli/config.py +2 -2
- swift/cli/container_deleter.py +5 -11
- swift-2.23.2.data/scripts/swift-dispersion-populate → swift/cli/dispersion_populate.py +8 -7
- swift/cli/dispersion_report.py +10 -9
- swift-2.23.2.data/scripts/swift-drive-audit → swift/cli/drive_audit.py +63 -21
- swift/cli/form_signature.py +3 -7
- swift-2.23.2.data/scripts/swift-get-nodes → swift/cli/get_nodes.py +8 -2
- swift/cli/info.py +183 -29
- swift/cli/manage_shard_ranges.py +708 -37
- swift-2.23.2.data/scripts/swift-oldies → swift/cli/oldies.py +25 -14
- swift-2.23.2.data/scripts/swift-orphans → swift/cli/orphans.py +7 -3
- swift/cli/recon.py +196 -67
- swift-2.23.2.data/scripts/swift-recon-cron → swift/cli/recon_cron.py +17 -20
- swift-2.23.2.data/scripts/swift-reconciler-enqueue → swift/cli/reconciler_enqueue.py +2 -3
- swift/cli/relinker.py +807 -126
- swift/cli/reload.py +135 -0
- swift/cli/ringbuilder.py +217 -20
- swift/cli/ringcomposer.py +0 -1
- swift/cli/shard-info.py +4 -3
- swift/common/base_storage_server.py +9 -20
- swift/common/bufferedhttp.py +48 -74
- swift/common/constraints.py +20 -15
- swift/common/container_sync_realms.py +9 -11
- swift/common/daemon.py +25 -8
- swift/common/db.py +198 -127
- swift/common/db_auditor.py +168 -0
- swift/common/db_replicator.py +95 -55
- swift/common/digest.py +141 -0
- swift/common/direct_client.py +144 -33
- swift/common/error_limiter.py +93 -0
- swift/common/exceptions.py +25 -1
- swift/common/header_key_dict.py +2 -9
- swift/common/http_protocol.py +373 -0
- swift/common/internal_client.py +129 -59
- swift/common/linkat.py +3 -4
- swift/common/manager.py +284 -67
- swift/common/memcached.py +396 -147
- swift/common/middleware/__init__.py +4 -0
- swift/common/middleware/account_quotas.py +211 -46
- swift/common/middleware/acl.py +3 -8
- swift/common/middleware/backend_ratelimit.py +230 -0
- swift/common/middleware/bulk.py +22 -34
- swift/common/middleware/catch_errors.py +1 -3
- swift/common/middleware/cname_lookup.py +6 -11
- swift/common/middleware/container_quotas.py +1 -1
- swift/common/middleware/container_sync.py +39 -17
- swift/common/middleware/copy.py +12 -0
- swift/common/middleware/crossdomain.py +22 -9
- swift/common/middleware/crypto/__init__.py +2 -1
- swift/common/middleware/crypto/crypto_utils.py +11 -15
- swift/common/middleware/crypto/decrypter.py +28 -11
- swift/common/middleware/crypto/encrypter.py +12 -17
- swift/common/middleware/crypto/keymaster.py +8 -15
- swift/common/middleware/crypto/kms_keymaster.py +2 -1
- swift/common/middleware/dlo.py +15 -11
- swift/common/middleware/domain_remap.py +5 -4
- swift/common/middleware/etag_quoter.py +128 -0
- swift/common/middleware/formpost.py +73 -70
- swift/common/middleware/gatekeeper.py +8 -1
- swift/common/middleware/keystoneauth.py +33 -3
- swift/common/middleware/list_endpoints.py +4 -4
- swift/common/middleware/listing_formats.py +85 -49
- swift/common/middleware/memcache.py +4 -81
- swift/common/middleware/name_check.py +3 -2
- swift/common/middleware/proxy_logging.py +160 -92
- swift/common/middleware/ratelimit.py +17 -10
- swift/common/middleware/read_only.py +6 -4
- swift/common/middleware/recon.py +59 -22
- swift/common/middleware/s3api/acl_handlers.py +25 -3
- swift/common/middleware/s3api/acl_utils.py +6 -1
- swift/common/middleware/s3api/controllers/__init__.py +6 -0
- swift/common/middleware/s3api/controllers/acl.py +3 -2
- swift/common/middleware/s3api/controllers/bucket.py +242 -137
- swift/common/middleware/s3api/controllers/logging.py +2 -2
- swift/common/middleware/s3api/controllers/multi_delete.py +43 -20
- swift/common/middleware/s3api/controllers/multi_upload.py +219 -133
- swift/common/middleware/s3api/controllers/obj.py +112 -8
- swift/common/middleware/s3api/controllers/object_lock.py +44 -0
- swift/common/middleware/s3api/controllers/s3_acl.py +2 -2
- swift/common/middleware/s3api/controllers/tagging.py +57 -0
- swift/common/middleware/s3api/controllers/versioning.py +36 -7
- swift/common/middleware/s3api/etree.py +22 -9
- swift/common/middleware/s3api/exception.py +0 -4
- swift/common/middleware/s3api/s3api.py +113 -41
- swift/common/middleware/s3api/s3request.py +384 -218
- swift/common/middleware/s3api/s3response.py +126 -23
- swift/common/middleware/s3api/s3token.py +16 -17
- swift/common/middleware/s3api/schema/delete.rng +1 -1
- swift/common/middleware/s3api/subresource.py +7 -10
- swift/common/middleware/s3api/utils.py +27 -10
- swift/common/middleware/slo.py +665 -358
- swift/common/middleware/staticweb.py +64 -37
- swift/common/middleware/symlink.py +52 -19
- swift/common/middleware/tempauth.py +76 -58
- swift/common/middleware/tempurl.py +192 -174
- swift/common/middleware/versioned_writes/__init__.py +51 -0
- swift/common/middleware/{versioned_writes.py → versioned_writes/legacy.py} +27 -26
- swift/common/middleware/versioned_writes/object_versioning.py +1482 -0
- swift/common/middleware/x_profile/exceptions.py +1 -4
- swift/common/middleware/x_profile/html_viewer.py +18 -19
- swift/common/middleware/x_profile/profile_model.py +1 -2
- swift/common/middleware/xprofile.py +10 -10
- swift-2.23.2.data/scripts/swift-container-server → swift/common/recon.py +13 -8
- swift/common/registry.py +147 -0
- swift/common/request_helpers.py +324 -57
- swift/common/ring/builder.py +67 -25
- swift/common/ring/composite_builder.py +1 -1
- swift/common/ring/ring.py +177 -51
- swift/common/ring/utils.py +1 -1
- swift/common/splice.py +10 -6
- swift/common/statsd_client.py +205 -0
- swift/common/storage_policy.py +49 -44
- swift/common/swob.py +86 -102
- swift/common/{utils.py → utils/__init__.py} +2191 -2762
- swift/common/utils/base.py +131 -0
- swift/common/utils/config.py +433 -0
- swift/common/utils/ipaddrs.py +256 -0
- swift/common/utils/libc.py +345 -0
- swift/common/utils/logs.py +859 -0
- swift/common/utils/timestamp.py +412 -0
- swift/common/wsgi.py +555 -536
- swift/container/auditor.py +14 -100
- swift/container/backend.py +552 -227
- swift/container/reconciler.py +126 -37
- swift/container/replicator.py +96 -22
- swift/container/server.py +397 -176
- swift/container/sharder.py +1580 -639
- swift/container/sync.py +94 -88
- swift/container/updater.py +53 -32
- swift/obj/auditor.py +153 -35
- swift/obj/diskfile.py +466 -217
- swift/obj/expirer.py +406 -124
- swift/obj/mem_diskfile.py +7 -4
- swift/obj/mem_server.py +1 -0
- swift/obj/reconstructor.py +523 -262
- swift/obj/replicator.py +249 -188
- swift/obj/server.py +213 -122
- swift/obj/ssync_receiver.py +145 -85
- swift/obj/ssync_sender.py +113 -54
- swift/obj/updater.py +653 -139
- swift/obj/watchers/__init__.py +0 -0
- swift/obj/watchers/dark_data.py +213 -0
- swift/proxy/controllers/account.py +11 -11
- swift/proxy/controllers/base.py +848 -604
- swift/proxy/controllers/container.py +452 -86
- swift/proxy/controllers/info.py +3 -2
- swift/proxy/controllers/obj.py +1009 -490
- swift/proxy/server.py +185 -112
- swift-2.35.0.dist-info/AUTHORS +501 -0
- swift-2.35.0.dist-info/LICENSE +202 -0
- {swift-2.23.2.dist-info → swift-2.35.0.dist-info}/METADATA +52 -61
- swift-2.35.0.dist-info/RECORD +201 -0
- {swift-2.23.2.dist-info → swift-2.35.0.dist-info}/WHEEL +1 -1
- {swift-2.23.2.dist-info → swift-2.35.0.dist-info}/entry_points.txt +43 -0
- swift-2.35.0.dist-info/pbr.json +1 -0
- swift/locale/de/LC_MESSAGES/swift.po +0 -1216
- swift/locale/en_GB/LC_MESSAGES/swift.po +0 -1207
- swift/locale/es/LC_MESSAGES/swift.po +0 -1085
- swift/locale/fr/LC_MESSAGES/swift.po +0 -909
- swift/locale/it/LC_MESSAGES/swift.po +0 -894
- swift/locale/ja/LC_MESSAGES/swift.po +0 -965
- swift/locale/ko_KR/LC_MESSAGES/swift.po +0 -964
- swift/locale/pt_BR/LC_MESSAGES/swift.po +0 -881
- swift/locale/ru/LC_MESSAGES/swift.po +0 -891
- swift/locale/tr_TR/LC_MESSAGES/swift.po +0 -832
- swift/locale/zh_CN/LC_MESSAGES/swift.po +0 -833
- swift/locale/zh_TW/LC_MESSAGES/swift.po +0 -838
- swift-2.23.2.data/scripts/swift-account-auditor +0 -23
- swift-2.23.2.data/scripts/swift-account-info +0 -51
- swift-2.23.2.data/scripts/swift-account-reaper +0 -23
- swift-2.23.2.data/scripts/swift-account-replicator +0 -34
- swift-2.23.2.data/scripts/swift-account-server +0 -23
- swift-2.23.2.data/scripts/swift-container-auditor +0 -23
- swift-2.23.2.data/scripts/swift-container-info +0 -51
- swift-2.23.2.data/scripts/swift-container-reconciler +0 -21
- swift-2.23.2.data/scripts/swift-container-replicator +0 -34
- swift-2.23.2.data/scripts/swift-container-sharder +0 -33
- swift-2.23.2.data/scripts/swift-container-sync +0 -23
- swift-2.23.2.data/scripts/swift-container-updater +0 -23
- swift-2.23.2.data/scripts/swift-dispersion-report +0 -24
- swift-2.23.2.data/scripts/swift-form-signature +0 -20
- swift-2.23.2.data/scripts/swift-init +0 -119
- swift-2.23.2.data/scripts/swift-object-auditor +0 -29
- swift-2.23.2.data/scripts/swift-object-expirer +0 -33
- swift-2.23.2.data/scripts/swift-object-info +0 -60
- swift-2.23.2.data/scripts/swift-object-reconstructor +0 -33
- swift-2.23.2.data/scripts/swift-object-relinker +0 -41
- swift-2.23.2.data/scripts/swift-object-replicator +0 -37
- swift-2.23.2.data/scripts/swift-object-server +0 -27
- swift-2.23.2.data/scripts/swift-object-updater +0 -23
- swift-2.23.2.data/scripts/swift-proxy-server +0 -23
- swift-2.23.2.data/scripts/swift-recon +0 -24
- swift-2.23.2.data/scripts/swift-ring-builder +0 -24
- swift-2.23.2.data/scripts/swift-ring-builder-analyzer +0 -22
- swift-2.23.2.data/scripts/swift-ring-composer +0 -22
- swift-2.23.2.dist-info/DESCRIPTION.rst +0 -166
- swift-2.23.2.dist-info/RECORD +0 -220
- swift-2.23.2.dist-info/metadata.json +0 -1
- swift-2.23.2.dist-info/pbr.json +0 -1
- {swift-2.23.2.dist-info → swift-2.35.0.dist-info}/top_level.txt +0 -0
swift/container/sharder.py
CHANGED
@@ -12,31 +12,38 @@
 # implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import collections
 import errno
 import json
+import logging
+import operator
+from optparse import OptionParser
 import time
 from collections import defaultdict
+from operator import itemgetter
 from random import random
 
 import os
-import
+from urllib.parse import quote
 from eventlet import Timeout
+from contextlib import contextmanager
 
 from swift.common import internal_client
-from swift.common.constraints import check_drive
+from swift.common.constraints import check_drive, AUTO_CREATE_ACCOUNT_PREFIX
 from swift.common.direct_client import (direct_put_container,
                                         DirectClientException)
-from swift.common.
+from swift.common.daemon import run_daemon
+from swift.common.request_helpers import USE_REPLICATION_NETWORK_HEADER
 from swift.common.ring.utils import is_local_device
 from swift.common.swob import str_to_wsgi
 from swift.common.utils import get_logger, config_true_value, \
     dump_recon_cache, whataremyips, Timestamp, ShardRange, GreenAsyncPile, \
-
-
+    config_positive_int_value, quorum_size, parse_override_options, \
+    Everything, config_auto_int_value, ShardRangeList, config_percent_value, \
+    node_to_string, parse_options
 from swift.container.backend import ContainerBroker, \
     RECORD_TYPE_SHARD, UNSHARDED, SHARDING, SHARDED, COLLAPSED, \
-    SHARD_UPDATE_STATES
+    SHARD_UPDATE_STATES, sift_shard_ranges, SHARD_UPDATE_STAT_STATES
 from swift.container.replicator import ContainerReplicator
 
 
@@ -44,6 +51,8 @@ CLEAVE_SUCCESS = 0
 CLEAVE_FAILED = 1
 CLEAVE_EMPTY = 2
 
+DEFAULT_PERIODIC_WARNINGS_INTERVAL = 24 * 3600
+
 
 def sharding_enabled(broker):
     # NB all shards will by default have been created with
@@ -55,7 +64,7 @@ def sharding_enabled(broker):
     # if broker has been marked deleted it will have lost sysmeta, but we still
     # need to process the broker (for example, to shrink any shard ranges) so
     # fallback to checking if it has any shard ranges
-    if broker.
+    if broker.has_other_shard_ranges():
         return True
     return False
 
@@ -75,61 +84,166 @@ def make_shard_ranges(broker, shard_data, shards_account_prefix):
     return shard_ranges
 
 
-def
+def _find_discontinuity(paths, start):
+    # select the path that reaches furthest from start into the namespace
+    start_paths = [path for path in paths if path.lower == start]
+    start_paths.sort(key=lambda p: p.upper)
+    longest_start_path = start_paths[-1]
+    # search for paths that end further into the namespace (note: these must
+    # have a lower that differs from the start_path upper, otherwise they would
+    # be part of the start_path longer!)
+    end_paths = [path for path in paths
+                 if path.upper > longest_start_path.upper]
+    if end_paths:
+        # select those that begin nearest the start of the namespace
+        end_paths.sort(key=lambda p: p.lower)
+        end_paths = [p for p in end_paths if p.lower == end_paths[0].lower]
+        # select the longest of those
+        end_paths.sort(key=lambda p: p.upper)
+        longest_end_path = end_paths[-1]
+    else:
+        longest_end_path = None
+    return longest_start_path, longest_end_path
+
+
+def find_paths_with_gaps(shard_ranges, within_range=None):
     """
-    Find
-
+    Find gaps in the shard ranges and pairs of shard range paths that lead to
+    and from those gaps. For each gap a single pair of adjacent paths is
+    selected. The concatenation of all selected paths and gaps will span the
+    entire namespace with no overlaps.
+
+    :param shard_ranges: a list of instances of ShardRange.
+    :param within_range: an optional ShardRange that constrains the search
+        space; the method will only return gaps within this range. The default
+        is the entire namespace.
+    :return: A list of tuples of ``(start_path, gap_range, end_path)`` where
+        ``start_path`` is a list of ShardRanges leading to the gap,
+        ``gap_range`` is a ShardRange synthesized to describe the namespace
+        gap, and ``end_path`` is a list of ShardRanges leading from the gap.
+        When gaps start or end at the namespace minimum or maximum bounds,
+        ``start_path`` and ``end_path`` may be 'null' paths that contain a
+        single ShardRange covering either the minimum or maximum of the
+        namespace.
+    """
+    timestamp = Timestamp.now()
+    within_range = within_range or ShardRange('entire/namespace', timestamp)
+    shard_ranges = ShardRangeList(shard_ranges)
+    # note: find_paths results do not include shrinking ranges
+    paths = find_paths(shard_ranges)
+    # add paths covering no namespace at start and end of namespace to ensure
+    # that a start_path and end_path is always found even when there is a gap
+    # at the start or end of the namespace
+    null_start = ShardRange('null/start', timestamp,
+                            lower=ShardRange.MIN,
+                            upper=ShardRange.MIN,
+                            state=ShardRange.FOUND)
+    null_end = ShardRange('null/end', timestamp,
+                          lower=ShardRange.MAX,
+                          upper=ShardRange.MAX,
+                          state=ShardRange.FOUND)
+    paths.extend([ShardRangeList([null_start]), ShardRangeList([null_end])])
+    paths_with_gaps = []
+    start = null_start.lower
+    while True:
+        start_path, end_path = _find_discontinuity(paths, start)
+        if end_path is None:
+            # end of namespace reached
+            break
+        start = end_path.lower
+        if start_path.upper > end_path.lower:
+            # overlap
+            continue
+        gap_range = ShardRange('gap/index_%06d' % len(paths_with_gaps),
+                               timestamp,
+                               lower=start_path.upper,
+                               upper=end_path.lower)
+        if gap_range.overlaps(within_range):
+            paths_with_gaps.append((start_path, gap_range, end_path))
+    return paths_with_gaps
 
-
-
+
+def _is_parent_or_child(shard_range, other, time_period):
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Test if shard range ``shard_range`` is the parent or a child of another
+    shard range ``other`` within past time period ``time_period``. This method
+    is limited to work only within the scope of the same user-facing account
+    (with and without shard prefix).
+
+    :param shard_range: an instance of ``ShardRange``.
+    :param other: an instance of ``ShardRange``.
+    :param time_period: the specified past time period in seconds. Value of
+        0 means all time in the past.
+    :return: True if ``shard_range`` is the parent or a child of ``other``
+        within past time period, False otherwise, assuming that they are within
+        the same account.
+    """
+    exclude_age = (time.time() - float(time_period)) if time_period > 0 else 0
+    if shard_range.is_child_of(other) and shard_range.timestamp >= exclude_age:
+        return True
+    if other.is_child_of(shard_range) and other.timestamp >= exclude_age:
+        return True
+    return False
+
+
+def find_overlapping_ranges(
+        shard_ranges, exclude_parent_child=False, time_period=0):
     """
     Find all pairs of overlapping ranges in the given list.
 
     :param shard_ranges: A list of :class:`~swift.utils.ShardRange`
+    :param exclude_parent_child: If True then overlapping pairs that have a
+        parent-child relationship within the past time period
+        ``time_period`` are excluded from the returned set. Default is
+        False.
+    :param time_period: the specified past time period in seconds. Value of
+        0 means all time in the past.
     :return: a set of tuples, each tuple containing ranges that overlap with
         each other.
     """
     result = set()
-    for shard_range in shard_ranges:
-
-
+    for i, shard_range in enumerate(shard_ranges):
+        if exclude_parent_child:
+            overlapping = [
+                sr for sr in shard_ranges[i + 1:]
+                if shard_range.name != sr.name and shard_range.overlaps(sr) and
+                not _is_parent_or_child(shard_range, sr, time_period)]
+        else:
+            overlapping = [
+                sr for sr in shard_ranges[i + 1:]
+                if shard_range.name != sr.name and shard_range.overlaps(sr)]
         if overlapping:
             overlapping.append(shard_range)
-            overlapping.sort()
+            overlapping.sort(key=ShardRange.sort_key)
             result.add(tuple(overlapping))
 
     return result
 
 
 def is_sharding_candidate(shard_range, threshold):
+    # note: use *object* count as the condition for sharding: tombstones will
+    # eventually be reclaimed so should not trigger sharding
     return (shard_range.state == ShardRange.ACTIVE and
             shard_range.object_count >= threshold)
 
 
+def is_shrinking_candidate(shard_range, shrink_threshold, expansion_limit,
+                           states=None):
+    # typically shrink_threshold < expansion_limit but check both just in case
+    # note: use *row* count (objects plus tombstones) as the condition for
+    # shrinking to avoid inadvertently moving large numbers of tombstones into
+    # an acceptor
+    states = states or (ShardRange.ACTIVE,)
+    return (shard_range.state in states and
+            shard_range.row_count < shrink_threshold and
+            shard_range.row_count <= expansion_limit)
+
+
 def find_sharding_candidates(broker, threshold, shard_ranges=None):
     # this should only execute on root containers; the goal is to find
     # large shard containers that should be sharded.
     # First cut is simple: assume root container shard usage stats are good
     # enough to make decision.
-    # TODO: object counts may well not be the appropriate metric for
-    # deciding to shrink because a shard with low object_count may have a
-    # large number of deleted object rows that will need to be merged with
-    # a neighbour. We may need to expose row count as well as object count.
     if shard_ranges is None:
         shard_ranges = broker.get_shard_ranges(states=[ShardRange.ACTIVE])
     candidates = []
@@ -143,63 +257,376 @@ def find_sharding_candidates(broker, threshold, shard_ranges=None):
     return candidates
 
 
-def find_shrinking_candidates(broker, shrink_threshold,
+def find_shrinking_candidates(broker, shrink_threshold, expansion_limit):
+    # this is only here to preserve a legacy public function signature;
+    # superseded by find_compactible_shard_sequences
+    merge_pairs = {}
+    # restrict search to sequences with one donor
+    results = find_compactible_shard_sequences(broker, shrink_threshold,
+                                               expansion_limit, 1, -1,
+                                               include_shrinking=True)
+    for sequence in results:
+        # map acceptor -> donor list
+        merge_pairs[sequence[-1]] = sequence[-2]
+    return merge_pairs
+
+
+def find_compactible_shard_sequences(broker,
+                                     shrink_threshold,
+                                     expansion_limit,
+                                     max_shrinking,
+                                     max_expanding,
+                                     include_shrinking=False):
+    """
+    Find sequences of shard ranges that could be compacted into a single
+    acceptor shard range.
+
+    This function does not modify shard ranges.
+
+    :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+    :param shrink_threshold: the number of rows below which a shard may be
+        considered for shrinking into another shard
+    :param expansion_limit: the maximum number of rows that an acceptor shard
+        range should have after other shard ranges have been compacted into it
+    :param max_shrinking: the maximum number of shard ranges that should be
+        compacted into each acceptor; -1 implies unlimited.
+    :param max_expanding: the maximum number of acceptors to be found (i.e. the
+        maximum number of sequences to be returned); -1 implies unlimited.
+    :param include_shrinking: if True then existing compactible sequences are
+        included in the results; default is False.
+    :returns: A list of :class:`~swift.common.utils.ShardRangeList` each
+        containing a sequence of neighbouring shard ranges that may be
+        compacted; the final shard range in the list is the acceptor
+    """
     # this should only execute on root containers that have sharded; the
     # goal is to find small shard containers that could be retired by
     # merging with a neighbour.
     # First cut is simple: assume root container shard usage stats are good
     # enough to make decision; only merge with upper neighbour so that
     # upper bounds never change (shard names include upper bound).
-    # TODO: object counts may well not be the appropriate metric for
-    # deciding to shrink because a shard with low object_count may have a
-    # large number of deleted object rows that will need to be merged with
-    # a neighbour. We may need to expose row count as well as object count.
     shard_ranges = broker.get_shard_ranges()
     own_shard_range = broker.get_own_shard_range()
-    if len(shard_ranges) == 1:
-        # special case to enable final shard to shrink into root
-        shard_ranges.append(own_shard_range)
 
-
-
-
-
-
-
-
-
-
+    def sequence_complete(sequence):
+        # a sequence is considered complete if any of the following are true:
+        # - the final shard range has more objects than the shrink_threshold,
+        #   so should not be shrunk (this shard will be the acceptor)
+        # - the max number of shard ranges to be compacted (max_shrinking) has
+        #   been reached
+        # - the total number of objects in the sequence has reached the
+        #   expansion_limit
+        if (sequence and
+                (not is_shrinking_candidate(
+                    sequence[-1], shrink_threshold, expansion_limit,
+                    states=(ShardRange.ACTIVE, ShardRange.SHRINKING)) or
+                 0 < max_shrinking < len(sequence) or
+                 sequence.row_count >= expansion_limit)):
+            return True
+        return False
+
+    compactible_sequences = []
+    index = 0
+    expanding = 0
+    while ((max_expanding < 0 or expanding < max_expanding) and
+           index < len(shard_ranges)):
+        if not is_shrinking_candidate(
+                shard_ranges[index], shrink_threshold, expansion_limit,
+                states=(ShardRange.ACTIVE, ShardRange.SHRINKING)):
+            # this shard range cannot be the start of a new or existing
+            # compactible sequence, move on
+            index += 1
             continue
-
-
-
+
+        # start of a *possible* sequence
+        sequence = ShardRangeList([shard_ranges[index]])
+        for shard_range in shard_ranges[index + 1:]:
+            # attempt to add contiguous shard ranges to the sequence
+            if sequence.upper < shard_range.lower:
+                # found a gap! break before consuming this range because it
+                # could become the first in the next sequence
+                break
+
+            if shard_range.state not in (ShardRange.ACTIVE,
+                                         ShardRange.SHRINKING):
+                # found? created? sharded? don't touch it
+                break
+
+            if shard_range.state == ShardRange.SHRINKING:
+                # already shrinking: add to sequence unconditionally
+                sequence.append(shard_range)
+            elif (sequence.row_count + shard_range.row_count
+                    <= expansion_limit):
+                # add to sequence: could be a donor or acceptor
+                sequence.append(shard_range)
+                if sequence_complete(sequence):
+                    break
+            else:
+                break
+
+        index += len(sequence)
+        if (index == len(shard_ranges) and
+                len(shard_ranges) == len(sequence) and
+                not sequence_complete(sequence) and
+                sequence.includes(own_shard_range)):
+            # special case: only one sequence has been found, which consumes
+            # all shard ranges, encompasses the entire namespace, has no more
+            # than expansion_limit records and whose shard ranges are all
+            # shrinkable; all the shards in the sequence can be shrunk to the
+            # root, so append own_shard_range to the sequence to act as an
+            # acceptor; note: only shrink to the root when *all* the remaining
+            # shard ranges can be simultaneously shrunk to the root.
+            sequence.append(own_shard_range)
+
+        if len(sequence) < 2 or sequence[-1].state not in (ShardRange.ACTIVE,
+                                                           ShardRange.SHARDED):
+            # this sequence doesn't end with a suitable acceptor shard range
+            continue
+
+        # all valid sequences are counted against the max_expanding allowance
+        # even if the sequence is already shrinking
+        expanding += 1
+        if (all([sr.state != ShardRange.SHRINKING for sr in sequence]) or
+                include_shrinking):
+            compactible_sequences.append(sequence)
+
+    return compactible_sequences
+
+
+def finalize_shrinking(broker, acceptor_ranges, donor_ranges, timestamp):
+    """
+    Update donor shard ranges to shrinking state and merge donors and acceptors
+    to broker.
+
+    :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+    :param acceptor_ranges: A list of :class:`~swift.common.utils.ShardRange`
+        that are to be acceptors.
+    :param donor_ranges: A list of :class:`~swift.common.utils.ShardRange`
+        that are to be donors; these will have their state and timestamp
+        updated.
+    :param timestamp: timestamp to use when updating donor state
+    """
+    for donor in donor_ranges:
+        if donor.update_state(ShardRange.SHRINKING):
+            # Set donor state to shrinking state_timestamp defines new epoch
+            donor.epoch = donor.state_timestamp = timestamp
+    broker.merge_shard_ranges(acceptor_ranges + donor_ranges)
+
+
+def process_compactible_shard_sequences(broker, sequences):
+    """
+    Transform the given sequences of shard ranges into a list of acceptors and
+    a list of shrinking donors. For each given sequence the final ShardRange in
+    the sequence (the acceptor) is expanded to accommodate the other
+    ShardRanges in the sequence (the donors). The donors and acceptors are then
+    merged into the broker.
+
+    :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+    :param sequences: A list of :class:`~swift.common.utils.ShardRangeList`
+    """
+    timestamp = Timestamp.now()
+    acceptor_ranges = []
+    shrinking_ranges = []
+    for sequence in sequences:
+        donors = sequence[:-1]
+        shrinking_ranges.extend(donors)
+        # Update the acceptor container with its expanded bounds to prevent it
+        # treating objects cleaved from the donor as misplaced.
+        acceptor = sequence[-1]
+        if acceptor.expand(donors):
+            # Update the acceptor container with its expanded bounds to prevent
+            # it treating objects cleaved from the donor as misplaced.
+            acceptor.timestamp = timestamp
+        if acceptor.update_state(ShardRange.ACTIVE):
+            # Ensure acceptor state is ACTIVE (when acceptor is root)
+            acceptor.state_timestamp = timestamp
+        acceptor_ranges.append(acceptor)
+    finalize_shrinking(broker, acceptor_ranges, shrinking_ranges, timestamp)
+
+
+def find_paths(shard_ranges):
+    """
+    Returns a list of all continuous paths through the shard ranges. An
+    individual path may not necessarily span the entire namespace, but it will
+    span a continuous namespace without gaps.
+
+    :param shard_ranges: A list of :class:`~swift.common.utils.ShardRange`.
+    :return: A list of :class:`~swift.common.utils.ShardRangeList`.
+    """
+    # A node is a point in the namespace that is used as a bound of any shard
+    # range. Shard ranges form the edges between nodes.
+
+    # First build a dict mapping nodes to a list of edges that leave that node
+    # (in other words, shard ranges whose lower bound equals the node)
+    node_successors = collections.defaultdict(list)
+    for shard_range in shard_ranges:
+        if shard_range.state == ShardRange.SHRINKING:
+            # shrinking shards are not a viable edge in any path
             continue
-
-
+        node_successors[shard_range.lower].append(shard_range)
+
+    paths = []
+
+    def clone_path(other=None):
+        # create a new path, possibly cloning another path, and add it to the
+        # list of all paths through the shards
+        path = ShardRangeList() if other is None else ShardRangeList(other)
+        paths.append(path)
+        return path
+
+    # we need to keep track of every path that ends at each node so that when
+    # we visit the node we can extend those paths, or clones of them, with the
+    # edges that leave the node
+    paths_to_node = collections.defaultdict(list)
+
+    # visit the nodes in ascending order by name...
+    for node, edges in sorted(node_successors.items()):
+        if not edges:
+            # this node is a dead-end, so there's no path updates to make
             continue
+        if not paths_to_node[node]:
+            # this is either the first node to be visited, or it has no paths
+            # leading to it, so we need to start a new path here
+            paths_to_node[node].append(clone_path([]))
+        for path_to_node in paths_to_node[node]:
+            # extend each path that arrives at this node with all of the
+            # possible edges that leave the node; if more than edge leaves the
+            # node then we will make clones of the path to the node and extend
+            # those clones, adding to the collection of all paths though the
+            # shards
+            for i, edge in enumerate(edges):
+                if i == len(edges) - 1:
+                    # the last edge is used to extend the original path to the
+                    # node; there is nothing special about the last edge, but
+                    # doing this last means the original path to the node can
+                    # be cloned for all other edges before being modified here
+                    path = path_to_node
+                else:
+                    # for all but one of the edges leaving the node we need to
+                    # make a clone the original path
+                    path = clone_path(path_to_node)
+                # extend the path with the edge
+                path.append(edge)
+                # keep track of which node this path now arrives at
+                paths_to_node[edge.upper].append(path)
+    return paths
+
+
+def rank_paths(paths, shard_range_to_span):
+    """
+    Sorts the given list of paths such that the most preferred path is the
+    first item in the list.
+
+    :param paths: A list of :class:`~swift.common.utils.ShardRangeList`.
+    :param shard_range_to_span: An instance of
+        :class:`~swift.common.utils.ShardRange` that describes the namespace
+        that would ideally be spanned by a path. Paths that include this
+        namespace will be preferred over those that do not.
+    :return: A sorted list of :class:`~swift.common.utils.ShardRangeList`.
+    """
+    def sort_key(path):
+        # defines the order of preference for paths through shards
+        return (
+            # complete path for the namespace
+            path.includes(shard_range_to_span),
+            # most cleaving progress
+            path.find_lower(lambda sr: sr.state not in (
+                ShardRange.CLEAVED, ShardRange.ACTIVE)),
+            # largest object count
+            path.object_count,
+            # fewest timestamps
+            -1 * len(path.timestamps),
+            # newest timestamp
+            sorted(path.timestamps)[-1]
+        )
+
+    paths.sort(key=sort_key, reverse=True)
+    return paths
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-    return
+
+def combine_shard_ranges(new_shard_ranges, existing_shard_ranges):
+    """
+    Combines new and existing shard ranges based on most recent state.
+
+    :param new_shard_ranges: a list of ShardRange instances.
+    :param existing_shard_ranges: a list of ShardRange instances.
+    :return: a list of ShardRange instances.
+    """
+    new_shard_ranges = [dict(sr) for sr in new_shard_ranges]
+    existing_shard_ranges = [dict(sr) for sr in existing_shard_ranges]
+    to_add, to_delete = sift_shard_ranges(
+        new_shard_ranges,
+        dict((sr['name'], sr) for sr in existing_shard_ranges))
+    result = [ShardRange.from_dict(existing)
+              for existing in existing_shard_ranges
+              if existing['name'] not in to_delete]
+    result.extend([ShardRange.from_dict(sr) for sr in to_add])
+    return sorted([sr for sr in result if not sr.deleted],
+                  key=ShardRange.sort_key)
+
+
+def update_own_shard_range_stats(broker, own_shard_range):
+    """
+    Update the ``own_shard_range`` with the up-to-date object stats from
+    the ``broker``.
+
+    Note: this method does not persist the updated ``own_shard_range``;
+    callers should use ``broker.merge_shard_ranges`` if the updated stats
+    need to be persisted.
+
+    :param broker: an instance of ``ContainerBroker``.
+    :param own_shard_range: and instance of ``ShardRange``.
+    :returns: ``own_shard_range`` with up-to-date ``object_count``
+        and ``bytes_used``.
+    """
+    info = broker.get_info()
+    own_shard_range.update_meta(
+        info['object_count'], info['bytes_used'])
+    return own_shard_range
 
 
 class CleavingContext(object):
+    """
+    Encapsulates metadata associated with the process of cleaving a retiring
+    DB. This metadata includes:
+
+    * ``ref``: The unique part of the key that is used when persisting a
+      serialized ``CleavingContext`` as sysmeta in the DB. The unique part of
+      the key is based off the DB id. This ensures that each context is
+      associated with a specific DB file. The unique part of the key is
+      included in the ``CleavingContext`` but should not be modified by any
+      caller.
+
+    * ``cursor``: the upper bound of the last shard range to have been
+      cleaved from the retiring DB.
+
+    * ``max_row``: the retiring DB's max row; this is updated to the value of
+      the retiring DB's ``max_row`` every time a ``CleavingContext`` is
+      loaded for that DB, and may change during the process of cleaving the
+      DB.
+
+    * ``cleave_to_row``: the value of ``max_row`` at the moment when cleaving
+      starts for the DB. When cleaving completes (i.e. the cleave cursor has
+      reached the upper bound of the cleaving namespace), ``cleave_to_row``
+      is compared to the current ``max_row``: if the two values are not equal
+      then rows have been added to the DB which may not have been cleaved, in
+      which case the ``CleavingContext`` is ``reset`` and cleaving is
+      re-started.
+
+    * ``last_cleave_to_row``: the minimum DB row from which cleaving should
+      select objects to cleave; this is initially set to None i.e. all rows
+      should be cleaved. If the ``CleavingContext`` is ``reset`` then the
+      ``last_cleave_to_row`` is set to the current value of
+      ``cleave_to_row``, which in turn is set to the current value of
+      ``max_row`` by a subsequent call to ``start``. The repeated cleaving
+      therefore only selects objects in rows greater than the
+      ``last_cleave_to_row``, rather than cleaving the whole DB again.
+
+    * ``ranges_done``: the number of shard ranges that have been cleaved from
+      the retiring DB.
+
+    * ``ranges_todo``: the number of shard ranges that are yet to be
+      cleaved from the retiring DB.
+    """
     def __init__(self, ref, cursor='', max_row=None, cleave_to_row=None,
                  last_cleave_to_row=None, cleaving_done=False,
                  misplaced_done=False, ranges_done=0, ranges_todo=0):
@@ -229,18 +656,13 @@ class CleavingContext(object):
         return '%s(%s)' % (self.__class__.__name__, ', '.join(
             '%s=%r' % prop for prop in self))
 
-    def _encode(cls, value):
-        if value is not None and six.PY2 and isinstance(value, six.text_type):
-            return value.encode('utf-8')
-        return value
-
     @property
     def cursor(self):
         return self._cursor
 
     @cursor.setter
     def cursor(self, value):
-        self._cursor =
+        self._cursor = value
 
     @property
     def marker(self):
@@ -253,37 +675,33 @@ class CleavingContext(object):
     @classmethod
     def load_all(cls, broker):
         """
-        Returns all cleaving contexts stored in the broker.
+        Returns all cleaving contexts stored in the broker's DB.
 
-        :param broker:
+        :param broker: an instance of :class:`ContainerBroker`
         :return: list of tuples of (CleavingContext, timestamp)
         """
         brokers = broker.get_brokers()
         sysmeta = brokers[-1].get_sharding_sysmeta_with_timestamps()
 
+        contexts = []
         for key, (val, timestamp) in sysmeta.items():
-            # If the value is
+            # If the value is blank, then the metadata is
             # marked for deletion
-            if key.startswith("Context-") and
+            if key.startswith("Context-") and val:
                 try:
-
+                    contexts.append((cls(**json.loads(val)), timestamp))
                 except ValueError:
                     continue
+        return contexts
 
     @classmethod
     def load(cls, broker):
         """
-        Returns a
-        broker's
-
-
-
-        modified such that its max row changes then a different context, or no
-        context, will be loaded.
-
-        :return: A dict to which cleave progress metadata may be added. The
-            dict initially has a key ``ref`` which should not be modified by
-            any caller.
+        Returns a CleavingContext tracking the cleaving progress of the given
+        broker's DB.
+
+        :param broker: an instances of :class:`ContainerBroker`
+        :return: An instance of :class:`CleavingContext`.
         """
         brokers = broker.get_brokers()
         ref = cls._make_ref(brokers[0])
@@ -294,6 +712,12 @@ class CleavingContext(object):
         return cls(**data)
 
     def store(self, broker):
+        """
+        Persists the serialized ``CleavingContext`` as sysmeta in the given
+        broker's DB.
+
+        :param broker: an instances of :class:`ContainerBroker`
+        """
         broker.set_sharding_sysmeta('Context-' + self.ref,
                                     json.dumps(dict(self)))
 
@@ -312,6 +736,11 @@ class CleavingContext(object):
         self.cleaving_done = False
         self.cleave_to_row = self.max_row
 
+    def range_done(self, new_cursor):
+        self.ranges_done += 1
+        self.ranges_todo -= 1
+        self.cursor = new_cursor
+
     def done(self):
         return all((self.misplaced_done, self.cleaving_done,
                     self.max_row == self.cleave_to_row))
@@ -322,51 +751,108 @@ class CleavingContext(object):
         broker.set_sharding_sysmeta('Context-' + self.ref, '')
 
 
-
-
-
+class ContainerSharderConf(object):
+    def __init__(self, conf=None):
+        conf = conf if conf else {}
+
+        def get_val(key, validator, default):
+            """
+            Get a value from conf and validate it.
+
+            :param key: key to lookup value in the ``conf`` dict.
+            :param validator: A function that will passed the value from the
+                ``conf`` dict and should return the value to be set. This
+                function should raise a ValueError if the ``conf`` value if not
+                valid.
+            :param default: value to use if ``key`` is not found in ``conf``.
+            :raises: ValueError if the value read from ``conf`` is invalid.
+            :returns: the configuration value.
+            """
+            try:
+                return validator(conf.get(key, default))
+            except ValueError as err:
+                raise ValueError('Error setting %s: %s' % (key, err))
+
+        self.shard_container_threshold = get_val(
+            'shard_container_threshold', config_positive_int_value, 1000000)
+        self.max_shrinking = get_val(
+            'max_shrinking', int, 1)
+        self.max_expanding = get_val(
+            'max_expanding', int, -1)
+        self.shard_scanner_batch_size = get_val(
+            'shard_scanner_batch_size', config_positive_int_value, 10)
+        self.cleave_batch_size = get_val(
+            'cleave_batch_size', config_positive_int_value, 2)
+        self.cleave_row_batch_size = get_val(
+            'cleave_row_batch_size', config_positive_int_value, 10000)
+        self.broker_timeout = get_val(
+            'broker_timeout', config_positive_int_value, 60)
+        self.recon_candidates_limit = get_val(
+            'recon_candidates_limit', int, 5)
+        self.recon_sharded_timeout = get_val(
+            'recon_sharded_timeout', int, 43200)
+        self.container_sharding_timeout = get_val(
+            'container_sharding_timeout', int, 172800)
+        self.conn_timeout = get_val(
+            'conn_timeout', float, 5)
+        self.auto_shard = get_val(
+            'auto_shard', config_true_value, False)
+        # deprecated percent options still loaded...
+        self.shrink_threshold = get_val(
+            'shard_shrink_point', self.percent_of_threshold, 10)
+        self.expansion_limit = get_val(
+            'shard_shrink_merge_point', self.percent_of_threshold, 75)
+        # ...but superseded by absolute options if present in conf
+        self.shrink_threshold = get_val(
+            'shrink_threshold', int, self.shrink_threshold)
+        self.expansion_limit = get_val(
+            'expansion_limit', int, self.expansion_limit)
+        self.rows_per_shard = get_val(
+            'rows_per_shard', config_positive_int_value,
+            max(self.shard_container_threshold // 2, 1))
+        self.minimum_shard_size = get_val(
+            'minimum_shard_size', config_positive_int_value,
+            max(self.rows_per_shard // 5, 1))
+
+    def percent_of_threshold(self, val):
+        return int(config_percent_value(val) * self.shard_container_threshold)
+
+    @classmethod
+    def validate_conf(cls, namespace):
+        ops = {'<': operator.lt,
+               '<=': operator.le}
+        checks = (('minimum_shard_size', '<=', 'rows_per_shard'),
+                  ('shrink_threshold', '<=', 'minimum_shard_size'),
+                  ('rows_per_shard', '<', 'shard_container_threshold'),
+                  ('expansion_limit', '<', 'shard_container_threshold'))
+        for key1, op, key2 in checks:
+            try:
+                val1 = getattr(namespace, key1)
+                val2 = getattr(namespace, key2)
+            except AttributeError:
+                # swift-manage-shard-ranges uses a subset of conf options for
+                # each command so only validate those actually in the namespace
+                continue
+            if not ops[op](val1, val2):
+                raise ValueError('%s (%d) must be %s %s (%d)'
+                                 % (key1, val1, op, key2, val2))
+
 
+DEFAULT_SHARDER_CONF = vars(ContainerSharderConf())
 
-
+
+class ContainerSharder(ContainerSharderConf, ContainerReplicator):
     """Shards containers."""
+    log_route = 'container-sharder'
 
     def __init__(self, conf, logger=None):
-        logger = logger or get_logger(conf, log_route=
-
-        self
-
-
-        def percent_value(key, default):
-            try:
-                value = conf.get(key, default)
-                return config_float_value(value, 0, 100) / 100.0
-            except ValueError as err:
-                raise ValueError("%s: %s" % (str(err), key))
-
-        self.shard_shrink_point = percent_value('shard_shrink_point',
-                                                DEFAULT_SHARD_SHRINK_POINT)
-        self.shrink_merge_point = percent_value('shard_shrink_merge_point',
-                                                DEFAULT_SHARD_MERGE_POINT)
-        self.shard_container_threshold = config_positive_int_value(
-            conf.get('shard_container_threshold',
-                     DEFAULT_SHARD_CONTAINER_THRESHOLD))
-        self.shrink_size = (self.shard_container_threshold *
-                            self.shard_shrink_point)
-        self.merge_size = (self.shard_container_threshold *
-                           self.shrink_merge_point)
-        self.split_size = self.shard_container_threshold // 2
-        self.scanner_batch_size = config_positive_int_value(
-            conf.get('shard_scanner_batch_size', 10))
-        self.cleave_batch_size = config_positive_int_value(
-            conf.get('cleave_batch_size', 2))
-        self.cleave_row_batch_size = config_positive_int_value(
-            conf.get('cleave_row_batch_size', 10000))
-        self.auto_shard = config_true_value(conf.get('auto_shard', False))
+        logger = logger or get_logger(conf, log_route=self.log_route)
+        ContainerReplicator.__init__(self, conf, logger=logger)
+        ContainerSharderConf.__init__(self, conf)
+        ContainerSharderConf.validate_conf(self)
+        self.shards_account_prefix = (AUTO_CREATE_ACCOUNT_PREFIX + 'shards_')
         self.sharding_candidates = []
-        self.
-            conf.get('recon_candidates_limit', 5))
-        self.broker_timeout = config_positive_int_value(
-            conf.get('broker_timeout', 60))
+        self.shrinking_candidates = []
         replica_count = self.ring.replica_count
         quorum = quorum_size(replica_count)
         self.shard_replication_quorum = config_auto_int_value(
@@ -388,7 +874,6 @@ class ContainerSharder(ContainerReplicator):
         self.existing_shard_replication_quorum = replica_count
 
         # internal client
-        self.conn_timeout = float(conf.get('conn_timeout', 5))
         request_tries = config_positive_int_value(
             conf.get('request_tries', 3))
         internal_client_conf_path = conf.get('internal_client_conf_path',
@@ -398,7 +883,9 @@ class ContainerSharder(ContainerReplicator):
                 internal_client_conf_path,
                 'Swift Container Sharder',
                 request_tries,
-
+                use_replication_network=True,
+                global_conf={'log_name': '%s-ic' % conf.get(
+                    'log_name', self.log_route)})
         except (OSError, IOError) as err:
             if err.errno != errno.ENOENT and \
                     not str(err).endswith(' not found'):
@@ -406,7 +893,67 @@ class ContainerSharder(ContainerReplicator):
                 raise SystemExit(
                     'Unable to load internal client from config: %r (%s)' %
                     (internal_client_conf_path, err))
+        self.stats_interval = float(conf.get('stats_interval', '3600'))
         self.reported = 0
+        self.periodic_warnings_interval = float(
+            conf.get('periodic_warnings_interval',
+                     DEFAULT_PERIODIC_WARNINGS_INTERVAL))
+        self.periodic_warnings_start = time.time()
+        self.periodic_warnings = set()
+
+    def _get_broker_details(self, broker):
+        try:
+            db_file = broker.db_file
+        except Exception:  # noqa
+            db_file = ''
+        try:
+            path = broker.path
+        except Exception:  # noqa
+            path = ''
+        return db_file, path
+
+    def _format_log_msg(self, broker, msg, *args):
+        # make best effort to include broker properties...
+        db_file, path = self._get_broker_details(broker)
+        if args:
+            msg = msg % args
+        return '%s, path: %s, db: %s' % (msg, quote(path), db_file)
+
+    def _log(self, level, broker, msg, *args):
+        if not self.logger.isEnabledFor(level):
+            return
+
+        self.logger.log(level, self._format_log_msg(broker, msg, *args))
+
+    def debug(self, broker, msg, *args, **kwargs):
+        self._log(logging.DEBUG, broker, msg, *args, **kwargs)
+
+    def info(self, broker, msg, *args, **kwargs):
+        self._log(logging.INFO, broker, msg, *args, **kwargs)
+
+    def warning(self, broker, msg, *args, **kwargs):
+        self._log(logging.WARNING, broker, msg, *args, **kwargs)
+
+    def periodic_warning(self, broker, msg, *args, **kwargs):
+        now = time.time()
+        if now - self.periodic_warnings_start >= \
+                self.periodic_warnings_interval:
+            self.periodic_warnings.clear()
+            self.periodic_warnings_start = now
+
+        db_file, path = self._get_broker_details(broker)
+        key = (db_file, msg)
+        if key not in self.periodic_warnings:
+            self.periodic_warnings.add(key)
+            self._log(logging.WARNING, broker, msg, *args, **kwargs)
+
+    def error(self, broker, msg, *args, **kwargs):
+        self._log(logging.ERROR, broker, msg, *args, **kwargs)
+
+    def exception(self, broker, msg, *args, **kwargs):
+        if not self.logger.isEnabledFor(logging.ERROR):
+            return
+        self.logger.exception(self._format_log_msg(broker, msg, *args))
 
     def _zero_stats(self):
         """Zero out the stats."""
@@ -415,6 +962,7 @@ class ContainerSharder(ContainerReplicator):
         # stats are maintained under the 'sharding' key in self.stats
         self.stats['sharding'] = defaultdict(lambda: defaultdict(int))
         self.sharding_candidates = []
+        self.shrinking_candidates = []
 
     def _append_stat(self, category, key, value):
         if not self.stats['sharding'][category][key]:
@@ -435,11 +983,15 @@ class ContainerSharder(ContainerReplicator):
         else:
             self.stats['sharding'][category][key] = max(current, value)
 
-    def _increment_stat(self, category, key,
-        self.
-
-
-
+    def _increment_stat(self, category, key, statsd=False):
+        self._update_stat(category, key, step=1, statsd=statsd)
+
+    def _update_stat(self, category, key, step=1, statsd=False):
+        if step:
+            self.stats['sharding'][category][key] += step
+            if statsd:
+                statsd_key = '%s_%s' % (category, key)
+                self.logger.update_stats(statsd_key, step)
 
     def _make_stats_info(self, broker, node, own_shard_range):
         try:
@@ -458,40 +1010,90 @@ class ContainerSharder(ContainerReplicator):

     def _identify_sharding_candidate(self, broker, node):
         own_shard_range = broker.get_own_shard_range()
+        update_own_shard_range_stats(broker, own_shard_range)
         if is_sharding_candidate(
                 own_shard_range, self.shard_container_threshold):
             self.sharding_candidates.append(
                 self._make_stats_info(broker, node, own_shard_range))

-    def
-
-
+    def _identify_shrinking_candidate(self, broker, node):
+        sequences = find_compactible_shard_sequences(
+            broker, self.shrink_threshold, self.expansion_limit,
+            self.max_shrinking, self.max_expanding)
+        # compactible_ranges are all apart from final acceptor in each sequence
+        compactible_ranges = sum(len(seq) - 1 for seq in sequences)
+
+        if compactible_ranges:
+            own_shard_range = broker.get_own_shard_range()
+            update_own_shard_range_stats(broker, own_shard_range)
+            shrink_candidate = self._make_stats_info(
+                broker, node, own_shard_range)
+            # The number of ranges/donors that can be shrunk if the
+            # tool is used with the current max_shrinking, max_expanding
+            # settings.
+            shrink_candidate['compactible_ranges'] = compactible_ranges
+            self.shrinking_candidates.append(shrink_candidate)
+
+    def _transform_candidate_stats(self, category, candidates, sort_keys):
         category['found'] = len(candidates)
-        candidates.sort(key=
+        candidates.sort(key=itemgetter(*sort_keys), reverse=True)
         if self.recon_candidates_limit >= 0:
             category['top'] = candidates[:self.recon_candidates_limit]
         else:
             category['top'] = candidates

     def _record_sharding_progress(self, broker, node, error):
+        db_state = broker.get_db_state()
+        if db_state not in (UNSHARDED, SHARDING, SHARDED):
+            return
         own_shard_range = broker.get_own_shard_range()
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if own_shard_range.state not in ShardRange.CLEAVING_STATES:
+            return
+
+        if db_state == SHARDED:
+            contexts = CleavingContext.load_all(broker)
+            if not contexts:
+                return
+            context_ts = max(float(ts) for c, ts in contexts)
+            if context_ts + self.recon_sharded_timeout \
+                    < float(Timestamp.now()):
+                # last context timestamp too old for the
+                # broker to be recorded
+                return
+
+        update_own_shard_range_stats(broker, own_shard_range)
+        info = self._make_stats_info(broker, node, own_shard_range)
+        info['state'] = own_shard_range.state_text
+        info['db_state'] = broker.get_db_state()
+        states = [ShardRange.FOUND, ShardRange.CREATED,
+                  ShardRange.CLEAVED, ShardRange.ACTIVE]
+        shard_ranges = broker.get_shard_ranges(states=states)
+        state_count = {}
+        for state in states:
+            state_count[ShardRange.STATES[state]] = 0
+        for shard_range in shard_ranges:
+            state_count[shard_range.state_text] += 1
+        info.update(state_count)
+        info['error'] = error and str(error)
+        self._append_stat('sharding_in_progress', 'all', info)
+
+        if broker.sharding_required() and (
+                own_shard_range.epoch is not None) and (
+                    float(own_shard_range.epoch) +
+                    self.container_sharding_timeout <
+                    time.time()):
+            # Note: There is no requirement that own_shard_range.epoch equals
+            # the time at which the own_shard_range was merged into the
+            # container DB, which predicates sharding starting. But s-m-s-r and
+            # auto-sharding do set epoch and then merge, so we use it to tell
+            # whether sharding has been taking too long or not.
+            self.warning(
+                broker, 'Cleaving has not completed in %.2f seconds since %s. '
+                'DB state: %s, own_shard_range state: %s, state count of '
+                'shard ranges: %s' %
+                (time.time() - float(own_shard_range.epoch),
+                 own_shard_range.epoch.isoformat, db_state,
+                 own_shard_range.state_text, str(state_count)))

     def _report_stats(self):
         # report accumulated stats since start of one sharder cycle
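`_transform_candidate_stats` above sorts candidate stats dicts on one or more keys, largest first, and keeps only the top N for recon. A small standalone sketch of that sorting/truncation (sample data and the limit value are made up):

```python
from operator import itemgetter

candidates = [
    {'path': 'a/c1', 'object_count': 900000, 'compactible_ranges': 0},
    {'path': 'a/c2', 'object_count': 1500000, 'compactible_ranges': 4},
    {'path': 'a/c3', 'object_count': 1200000, 'compactible_ranges': 2},
]

def transform_candidate_stats(category, candidates, sort_keys, limit=5):
    # record how many were found, then keep only the top N after sorting
    # on the given keys, descending
    category['found'] = len(candidates)
    candidates.sort(key=itemgetter(*sort_keys), reverse=True)
    category['top'] = candidates[:limit] if limit >= 0 else candidates

sharding = {}
transform_candidate_stats(sharding, list(candidates),
                          sort_keys=('object_count',))
print([c['path'] for c in sharding['top']])  # ['a/c2', 'a/c3', 'a/c1']
```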
@@ -502,7 +1104,7 @@ class ContainerSharder(ContainerReplicator):
             ('created', default_stats),
             ('cleaved', default_stats + ('min_time', 'max_time',)),
             ('misplaced', default_stats + ('found', 'placed', 'unplaced')),
-            ('audit_root', default_stats),
+            ('audit_root', default_stats + ('has_overlap', 'num_overlap')),
             ('audit_shard', default_stats),
         )

@@ -515,7 +1117,16 @@ class ContainerSharder(ContainerReplicator):
             msg = ' '.join(['%s:%s' % (k, str(stats[k])) for k in keys])
             self.logger.info('Since %s %s - %s', last_report, category, msg)

-
+        # transform the sharding and shrinking candidate states
+        # first sharding
+        category = self.stats['sharding']['sharding_candidates']
+        self._transform_candidate_stats(category, self.sharding_candidates,
+                                        sort_keys=('object_count',))
+
+        # next shrinking
+        category = self.stats['sharding']['shrinking_candidates']
+        self._transform_candidate_stats(category, self.shrinking_candidates,
+                                        sort_keys=('compactible_ranges',))

         dump_recon_cache(
             {'sharding_stats': self.stats,
@@ -525,7 +1136,7 @@ class ContainerSharder(ContainerReplicator):
         self.reported = now

     def _periodic_report_stats(self):
-        if (time.time() - self.reported) >=
+        if (time.time() - self.reported) >= self.stats_interval:
             self._report_stats()

     def _check_node(self, node):
@@ -553,64 +1164,67 @@ class ContainerSharder(ContainerReplicator):
         params = params or {}
         params.setdefault('format', 'json')
         headers = {'X-Backend-Record-Type': 'shard',
+                   'X-Backend-Record-Shard-Format': 'full',
                    'X-Backend-Override-Deleted': 'true',
                    'X-Backend-Include-Deleted': str(include_deleted)}
         if newest:
             headers['X-Newest'] = 'true'
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-                broker.root_path, err)
-            return None
-
-        try:
-            data = json.loads(resp.body)
-            if not isinstance(data, list):
-                raise ValueError('not a list')
-            return [ShardRange.from_dict(shard_range)
-                    for shard_range in data]
-        except (ValueError, TypeError, KeyError) as err:
-            self.logger.error(
-                "Failed to get shard ranges from %s: invalid data: %r",
-                broker.root_path, err)
+            resp = self.int_client.make_request(
+                'GET', path, headers, acceptable_statuses=(2,),
+                params=params)
+        except internal_client.UnexpectedResponse as err:
+            self.warning(broker, "Failed to get shard ranges from %s: %s",
+                         quote(broker.root_path), err)
+            return None
+        record_type = resp.headers.get('x-backend-record-type')
+        if record_type != 'shard':
+            err = 'unexpected record type %r' % record_type
+            self.error(broker, "Failed to get shard ranges from %s: %s",
+                       quote(broker.root_path), err)
             return None
-        finally:
-            self.logger.txn_id = None

-
+        try:
+            data = json.loads(resp.body)
+            if not isinstance(data, list):
+                raise ValueError('not a list')
+            return [ShardRange.from_dict(shard_range)
+                    for shard_range in data]
+        except (ValueError, TypeError, KeyError) as err:
+            self.error(broker,
+                       "Failed to get shard ranges from %s: invalid data: %r",
+                       quote(broker.root_path), err)
+            return None
+
+    def _put_container(self, broker, node, part, account, container, headers,
+                       body):
         try:
             direct_put_container(node, part, account, container,
                                  conn_timeout=self.conn_timeout,
                                  response_timeout=self.node_timeout,
                                  headers=headers, contents=body)
         except DirectClientException as err:
-            self.
-
-
+            self.warning(broker,
+                         'Failed to put shard ranges to %s %s/%s: %s',
+                         node_to_string(node, replication=True),
+                         quote(account), quote(container), err.http_status)
         except (Exception, Timeout) as err:
-            self.
-
-
+            self.exception(broker,
+                           'Failed to put shard ranges to %s %s/%s: %s',
+                           node_to_string(node, replication=True),
+                           quote(account), quote(container), err)
         else:
             return True
         return False

-    def _send_shard_ranges(self, account, container, shard_ranges,
+    def _send_shard_ranges(self, broker, account, container, shard_ranges,
                            headers=None):
-        body = json.dumps([dict(sr
+        body = json.dumps([dict(sr, reported=0)
+                           for sr in shard_ranges]).encode('ascii')
         part, nodes = self.ring.get_nodes(account, container)
         headers = headers or {}
         headers.update({'X-Backend-Record-Type': RECORD_TYPE_SHARD,
+                        USE_REPLICATION_NETWORK_HEADER: 'True',
                         'User-Agent': 'container-sharder %s' % os.getpid(),
                         'X-Timestamp': Timestamp.now().normal,
                         'Content-Length': len(body),
@@ -618,7 +1232,7 @@ class ContainerSharder(ContainerReplicator):

         pool = GreenAsyncPile(len(nodes))
         for node in nodes:
-            pool.spawn(self._put_container, node, part, account,
+            pool.spawn(self._put_container, broker, node, part, account,
                        container, headers, body)

         results = pool.waitall(None)
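The reworked `_fetch_shard_ranges` above now checks the backend record type header before trusting the JSON body. A simplified sketch of that validation order, using a plain stand-in response object rather than Swift's internal client response:

```python
import json
from types import SimpleNamespace

def parse_shard_ranges(resp):
    """Check the record-type header first, then parse the JSON list body;
    return None on any mismatch or malformed data (illustrative only)."""
    if resp.headers.get('x-backend-record-type') != 'shard':
        return None
    try:
        data = json.loads(resp.body)
        if not isinstance(data, list):
            raise ValueError('not a list')
        return data
    except (ValueError, TypeError, KeyError):
        return None

ok = SimpleNamespace(headers={'x-backend-record-type': 'shard'},
                     body=json.dumps([{'name': '.shards_a/c-1'}]))
bad = SimpleNamespace(headers={'x-backend-record-type': 'object'}, body='[]')
print(parse_shard_ranges(ok))   # [{'name': '.shards_a/c-1'}]
print(parse_shard_ranges(bad))  # None
```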
@@ -634,20 +1248,19 @@ class ContainerSharder(ContainerReplicator):
         :param shard_range: a :class:`~swift.common.utils.ShardRange`
         :param root_path: the path of the shard's root container
         :param policy_index: the storage policy index
-        :returns: a tuple of ``(part, broker, node_id)`` where
-            shard container's partition,
+        :returns: a tuple of ``(part, broker, node_id, put_timestamp)`` where
+            ``part`` is the shard container's partition,
+            ``broker`` is an instance of
             :class:`~swift.container.backend.ContainerBroker`,
-            ``node_id`` is the id of the selected node
+            ``node_id`` is the id of the selected node,
+            ``put_timestamp`` is the put_timestamp if the broker needed to
+            be initialized.
         """
         part = self.ring.get_part(shard_range.account, shard_range.container)
         node = self.find_local_handoff_for_part(part)
-        put_timestamp = Timestamp.now().internal
-        if not node:
-            raise DeviceUnavailable(
-                'No mounted devices found suitable for creating shard broker '
-                'for %s in partition %s' % (shard_range.name, part))

-
+        put_timestamp = Timestamp.now().internal
+        shard_broker, initialized = ContainerBroker.create_broker(
             os.path.join(self.root, node['device']), part, shard_range.account,
             shard_range.container, epoch=shard_range.epoch,
             storage_policy_index=policy_index, put_timestamp=put_timestamp)
@@ -655,11 +1268,19 @@ class ContainerSharder(ContainerReplicator):
         # Get the valid info into the broker.container, etc
         shard_broker.get_info()
         shard_broker.merge_shard_ranges(shard_range)
-        shard_broker.set_sharding_sysmeta('Root', root_path)
+        shard_broker.set_sharding_sysmeta('Quoted-Root', quote(root_path))
+        # NB: we *used* to do
+        #   shard_broker.set_sharding_sysmeta('Root', root_path)
+        # but that isn't safe for container names with nulls or newlines (or
+        # possibly some other characters). We consciously *don't* make any
+        # attempt to set the old meta; during an upgrade, some shards may think
+        # they are in fact roots, but it cleans up well enough once everyone's
+        # upgraded.
         shard_broker.update_metadata({
             'X-Container-Sysmeta-Sharding':
                 ('True', Timestamp.now().internal)})

+        put_timestamp = put_timestamp if initialized else None
         return part, shard_broker, node['id'], put_timestamp

     def _audit_root_container(self, broker):
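The switch above from `Root` to `Quoted-Root` sysmeta relies on percent-encoding to make arbitrary container names safe to round-trip. Swift has its own `quote` helper; the standard-library call below illustrates the same idea with hypothetical names:

```python
from urllib.parse import quote, unquote

# Hypothetical root paths; quoting makes them safe to carry in headers or
# sysmeta even when they contain newlines or NUL bytes.
root_paths = ['acct/plain-container', 'acct/evil\ncontainer', 'acct/nul\x00name']
for path in root_paths:
    quoted = quote(path)            # e.g. 'acct/evil%0Acontainer'
    assert unquote(quoted) == path  # lossless round trip
    print(quoted)
```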
@@ -669,105 +1290,276 @@ class ContainerSharder(ContainerReplicator):
         warnings = []
         own_shard_range = broker.get_own_shard_range()

-        if own_shard_range.state in
-            shard_ranges = broker.get_shard_ranges()
-
-
+        if own_shard_range.state in ShardRange.SHARDING_STATES:
+            shard_ranges = [sr for sr in broker.get_shard_ranges()
+                            if sr.state != ShardRange.SHRINKING]
+            paths_with_gaps = find_paths_with_gaps(shard_ranges)
+            if paths_with_gaps:
                 warnings.append(
                     'missing range(s): %s' %
-                    ' '.join(['%s-%s' % (lower, upper)
-                              for
+                    ' '.join(['%s-%s' % (gap.lower, gap.upper)
+                              for (_, gap, _) in paths_with_gaps]))

         for state in ShardRange.STATES:
-
-
-
+            if state == ShardRange.SHRINKING:
+                # Shrinking is how we resolve overlaps; we've got to
+                # allow multiple shards in that state
+                continue
+            shard_ranges = broker.get_shard_ranges(states=[state])
+            # Transient overlaps can occur during the period immediately after
+            # sharding if a root learns about new child shards before it learns
+            # that the parent has sharded. These overlaps are normally
+            # corrected as an up-to-date version of the parent shard range is
+            # replicated to the root. Parent-child overlaps are therefore
+            # ignored for a reclaim age after the child was created. After
+            # that, parent-child overlaps may indicate that there is
+            # permanently stale parent shard range data, perhaps from a node
+            # that has been offline, so these are reported.
+            overlaps = find_overlapping_ranges(
+                shard_ranges, exclude_parent_child=True,
+                time_period=self.reclaim_age)
+            if overlaps:
+                self._increment_stat('audit_root', 'has_overlap')
+                self._update_stat('audit_root', 'num_overlap',
+                                  step=len(overlaps))
+                all_overlaps = ', '.join(
+                    [' '.join(['%s-%s' % (sr.lower, sr.upper)
+                               for sr in overlapping_ranges])
+                     for overlapping_ranges in sorted(list(overlaps))])
                 warnings.append(
-                    'overlapping ranges in state %
-                    (ShardRange.STATES[state],
-
-
+                    'overlapping ranges in state %r: %s' %
+                    (ShardRange.STATES[state], all_overlaps))
+
+        # We've seen a case in production where the roots own_shard_range
+        # epoch is reset to None, and state set to ACTIVE (like re-defaulted)
+        # Epoch it important to sharding so we want to detect if this happens
+        # 1. So we can alert, and 2. to see how common it is.
+        if own_shard_range.epoch is None and broker.db_epoch:
+            warnings.append('own_shard_range reset to None should be %s'
+                            % broker.db_epoch)

         if warnings:
-            self.
-
-                (broker.db_file, broker.path, ', '.join(warnings)))
+            self.warning(broker, 'Audit failed for root: %s',
+                         ', '.join(warnings))
             self._increment_stat('audit_root', 'failure', statsd=True)
             return False

         self._increment_stat('audit_root', 'success', statsd=True)
         return True

-    def
-
-
+    def _merge_shard_ranges_from_root(self, broker, shard_ranges,
+                                      own_shard_range):
+        """
+        Merge appropriate items from the given ``shard_ranges`` into the
+        ``broker``. The selection of items that are merged will depend upon the
+        state of the shard.
+
+        :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+        :param shard_ranges: A list of instances of
+            :class:`~swift.common.utils.ShardRange` describing the shard ranges
+            fetched from the root container.
+        :param own_shard_range: A :class:`~swift.common.utils.ShardRange`
+            describing the shard's own shard range.
+        :return: a tuple of ``own_shard_range, own_shard_range_from_root``. The
+            returned ``own_shard_range`` will have been updated if the matching
+            ``own_shard_range_from_root`` has newer data.
+            ``own_shard_range_from_root`` will be None if no such matching
+            shard range is found in ``shard_ranges``.
+        """
+        own_shard_range_from_root = None
+        children_shard_ranges = []
+        other_shard_ranges = []
+        for shard_range in shard_ranges:
+            # look for this shard range in the list of shard ranges received
+            # from root; the root may have different lower and upper bounds for
+            # this shard (e.g. if this shard has been expanded in the root to
+            # accept a shrinking shard) so we only match on name.
+            if shard_range.name == own_shard_range.name:
+                # If we find our own shard range in the root response, merge
+                # it and reload own shard range (note: own_range_from_root may
+                # not necessarily be 'newer' than the own shard range we
+                # already have, but merging will get us to the 'newest' state)
+                self.debug(broker, 'Updating own shard range from root')
+                own_shard_range_from_root = shard_range
+                broker.merge_shard_ranges(own_shard_range_from_root)
+                orig_own_shard_range = own_shard_range
+                own_shard_range = broker.get_own_shard_range()
+                if (orig_own_shard_range != own_shard_range or
+                        orig_own_shard_range.state != own_shard_range.state):
+                    self.info(broker,
+                              'Updated own shard range from %s to %s',
+                              orig_own_shard_range, own_shard_range)
+            elif shard_range.is_child_of(own_shard_range):
+                children_shard_ranges.append(shard_range)
+            else:
+                other_shard_ranges.append(shard_range)
+
+        if children_shard_ranges and not broker.is_sharded():
+            # Merging shard ranges from the root is only necessary until this
+            # DB is fully cleaved and reaches SHARDED DB state, after which it
+            # is useful for debugging for the set of sub-shards to which a
+            # shards has sharded to be frozen.
+            self.debug(broker, 'Updating %d children shard ranges from root',
+                       len(children_shard_ranges))
+            broker.merge_shard_ranges(children_shard_ranges)
+
+        if (other_shard_ranges
+                and own_shard_range.state in ShardRange.CLEAVING_STATES
+                and not broker.is_sharded()):
+            # Other shard ranges returned from the root may need to be merged
+            # for the purposes of sharding or shrinking this shard:
+            #
+            # Shrinking states: If the up-to-date state is shrinking, the
+            # shards fetched from root may contain shards into which this shard
+            # is to shrink itself. Shrinking is initiated by modifying multiple
+            # neighboring shard range states *in the root*, rather than
+            # modifying a shard directly. We therefore need to learn about
+            # *other* neighboring shard ranges from the root, possibly
+            # including the root itself. We need to include shrunk state too,
+            # because one replica of a shard may already have moved the
+            # own_shard_range state to shrunk while another replica may still
+            # be in the process of shrinking.
+            #
+            # Sharding states: Normally a shard will shard to its own children.
+            # However, in some circumstances a shard may need to shard to other
+            # non-children sub-shards. For example, a shard range repair may
+            # cause a child sub-shard to be deleted and its namespace covered
+            # by another 'acceptor' shard.
+            #
+            # Therefore, if the up-to-date own_shard_range state indicates that
+            # sharding or shrinking is in progress, then other shard ranges
+            # will be merged, with the following caveats: we never expect a
+            # shard to shard to any ancestor shard range including the root,
+            # but containers might ultimately *shrink* to root; we never want
+            # to cleave to a container that is itself sharding or shrinking;
+            # the merged shard ranges should not result in gaps or overlaps in
+            # the namespace of this shard.
+            #
+            # Note: the search for ancestors is guaranteed to find the parent
+            # and root *if they are present*, but if any ancestor is missing
+            # then there is a chance that older generations in the
+            # other_shard_ranges will not be filtered and could be merged. That
+            # is only a problem if they are somehow still in ACTIVE state, and
+            # no overlap is detected, so the ancestor is merged.
+            ancestor_names = [
+                sr.name for sr in own_shard_range.find_ancestors(shard_ranges)]
+            filtered_other_shard_ranges = [
+                sr for sr in other_shard_ranges
+                if (sr.name not in ancestor_names
+                    and (sr.state not in ShardRange.CLEAVING_STATES
+                         or sr.deleted))
+            ]
+            if own_shard_range.state in ShardRange.SHRINKING_STATES:
+                root_shard_range = own_shard_range.find_root(
+                    other_shard_ranges)
+                if (root_shard_range and
+                        root_shard_range.state == ShardRange.ACTIVE):
+                    filtered_other_shard_ranges.append(root_shard_range)
+            existing_shard_ranges = broker.get_shard_ranges()
+            combined_shard_ranges = combine_shard_ranges(
+                filtered_other_shard_ranges, existing_shard_ranges)
+            overlaps = find_overlapping_ranges(combined_shard_ranges)
+            paths_with_gaps = find_paths_with_gaps(
+                combined_shard_ranges, own_shard_range)
+            if not (overlaps or paths_with_gaps):
+                # only merge if shard ranges appear to be *good*
+                self.debug(broker,
+                           'Updating %s other shard range(s) from root',
+                           len(filtered_other_shard_ranges))
+                broker.merge_shard_ranges(filtered_other_shard_ranges)
+
+        return own_shard_range, own_shard_range_from_root
+
+    def _delete_shard_container(self, broker, own_shard_range):
+        """
+        Mark a shard container as deleted if it was sharded or shrunk more than
+        reclaim_age in the past. (The DB file will be removed by the replicator
+        after a further reclaim_age.)
+
+        :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+        :param own_shard_range: A :class:`~swift.common.utils.ShardRange`
+            describing the shard's own shard range.
+        """
+        delete_age = time.time() - self.reclaim_age
+        deletable_states = (ShardRange.SHARDED, ShardRange.SHRUNK)
+        if (own_shard_range.state in deletable_states and
+                own_shard_range.deleted and
+                own_shard_range.timestamp < delete_age and
+                broker.empty()):
+            broker.delete_db(Timestamp.now().internal)
+            self.debug(broker, 'Marked shard container as deleted')
+
+    def _do_audit_shard_container(self, broker):
         warnings = []
-        errors = []
         if not broker.account.startswith(self.shards_account_prefix):
             warnings.append('account not in shards namespace %r' %
                             self.shards_account_prefix)

         own_shard_range = broker.get_own_shard_range(no_default=True)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if not own_shard_range:
+            self.warning(broker, 'Audit failed for shard: missing own shard '
+                         'range (skipping)')
+            return False, warnings
+
+        # Get the root view of the world, at least that part of the world
+        # that overlaps with this shard's namespace. The
+        # 'states=auditing' parameter will cause the root to include
+        # its own shard range in the response, which is necessary for the
+        # particular case when this shard should be shrinking to the root
+        # container; when not shrinking to root, but to another acceptor,
+        # the root range should be in sharded state and will not interfere
+        # with cleaving, listing or updating behaviour.
+        shard_ranges = self._fetch_shard_ranges(
+            broker, newest=True,
+            params={'marker': str_to_wsgi(own_shard_range.lower_str),
+                    'end_marker': str_to_wsgi(own_shard_range.upper_str),
+                    'states': 'auditing'},
+            include_deleted=True)
+        if shard_ranges:
+            own_shard_range, own_shard_range_from_root = \
+                self._merge_shard_ranges_from_root(
+                    broker, shard_ranges, own_shard_range)
+            if not own_shard_range_from_root:
+                # this is not necessarily an error - some replicas of the
+                # root may not yet know about this shard container, or the
+                # shard's own shard range could become deleted and
+                # reclaimed from the root under rare conditions
+                warnings.append('root has no matching shard range')
+        elif not own_shard_range.deleted:
+            warnings.append('unable to get shard ranges from root')
+        # else, our shard range is deleted, so root may have reclaimed it
+
+        self._delete_shard_container(broker, own_shard_range)
+
+        return True, warnings

+    def _audit_shard_container(self, broker):
+        self._increment_stat('audit_shard', 'attempted')
+        success, warnings = self._do_audit_shard_container(broker)
         if warnings:
-            self.
-
-
-
-
-            self.logger.warning(
-                'Audit failed for shard %s (%s) - skipping: %s' %
-                (broker.db_file, broker.path, ', '.join(errors)))
-            self._increment_stat('audit_shard', 'failure', statsd=True)
-            return False
-
-        if shard_range:
-            self.logger.debug('Updating shard from root %s', dict(shard_range))
-            broker.merge_shard_ranges(shard_range)
-        own_shard_range = broker.get_own_shard_range()
-        delete_age = time.time() - self.reclaim_age
-        if (own_shard_range.state == ShardRange.SHARDED and
-                own_shard_range.deleted and
-                own_shard_range.timestamp < delete_age and
-                broker.empty()):
-            broker.delete_db(Timestamp.now().internal)
-            self.logger.debug('Deleted shard container %s (%s)',
-                              broker.db_file, broker.path)
-        self._increment_stat('audit_shard', 'success', statsd=True)
-        return True
+            self.warning(broker, 'Audit warnings for shard: %s',
+                         ', '.join(warnings))
+        self._increment_stat(
+            'audit_shard', 'success' if success else 'failure', statsd=True)
+        return success

     def _audit_cleave_contexts(self, broker):
         now = Timestamp.now()
         for context, last_mod in CleavingContext.load_all(broker):
-
-
+            last_mod = Timestamp(last_mod)
+            is_done = context.done() and last_mod.timestamp + \
+                self.recon_sharded_timeout < now.timestamp
+            is_stale = last_mod.timestamp + self.reclaim_age < now.timestamp
+            if is_done or is_stale:
                 context.delete(broker)

     def _audit_container(self, broker):
         if broker.is_deleted():
+            if broker.is_old_enough_to_reclaim(time.time(), self.reclaim_age) \
+                    and not broker.is_empty_enough_to_reclaim():
+                self.periodic_warning(
+                    broker, 'Reclaimable db stuck waiting for shrinking')
             # if the container has been marked as deleted, all metadata will
             # have been erased so no point auditing. But we want it to pass, in
             # case any objects exist inside it.
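The root audit above reports namespace gaps and overlapping shard ranges per state. A toy illustration of overlap detection over (lower, upper) bounds; unlike Swift's `find_overlapping_ranges` it only compares neighbours after sorting and knows nothing about parent/child shards or reclaim age:

```python
def find_overlaps(ranges):
    """Return pairs of neighbouring ranges that claim the same namespace.
    Checking only sorted neighbours is enough when ranges are not nested."""
    overlaps = []
    ordered = sorted(ranges, key=lambda r: r[0])
    for (lo1, up1), (lo2, up2) in zip(ordered, ordered[1:]):
        # adjacent ranges may share a bound (up1 == lo2); anything beyond
        # that means two shards cover part of the same namespace
        if lo2 < up1:
            overlaps.append(((lo1, up1), (lo2, up2)))
    return overlaps

shards = [('a', 'cat'), ('cat', 'giraffe'), ('fox', 'pig'), ('pig', 'z')]
print(find_overlaps(shards))  # [(('cat', 'giraffe'), ('fox', 'pig'))]
```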
@@ -777,18 +1569,32 @@ class ContainerSharder(ContainerReplicator):
             return self._audit_root_container(broker)
         return self._audit_shard_container(broker)

-    def yield_objects(self, broker, src_shard_range, since_row=None
+    def yield_objects(self, broker, src_shard_range, since_row=None,
+                      batch_size=None):
         """
-        Iterates through all
-        yielding them in lists of up to
+        Iterates through all object rows in ``src_shard_range`` in name order
+        yielding them in lists of up to ``batch_size`` in length. All batches
+        of rows that are not marked deleted are yielded before all batches of
+        rows that are marked deleted.

         :param broker: A :class:`~swift.container.backend.ContainerBroker`.
         :param src_shard_range: A :class:`~swift.common.utils.ShardRange`
             describing the source range.
-        :param since_row: include only
-            the given row id; by default all rows are included.
-        :
+        :param since_row: include only object rows whose ROWID is greater than
+            the given row id; by default all object rows are included.
+        :param batch_size: The maximum number of object rows to include in each
+            yielded batch; defaults to cleave_row_batch_size.
+        :return: a generator of tuples of (list of rows, broker info dict)
         """
+        if (src_shard_range.lower == ShardRange.MAX or
+                src_shard_range.upper == ShardRange.MIN):
+            # this is an unexpected condition but handled with an early return
+            # just in case, because:
+            #   lower == ShardRange.MAX -> marker == ''
+            # which could result in rows being erroneously yielded.
+            return
+
+        batch_size = batch_size or self.cleave_row_batch_size
         for include_deleted in (False, True):
             marker = src_shard_range.lower_str
             while True:
@@ -796,87 +1602,82 @@ class ContainerSharder(ContainerReplicator):
                 info['max_row'] = broker.get_max_row()
                 start = time.time()
                 objects = broker.get_objects(
-
+                    limit=batch_size,
                     marker=marker,
                     end_marker=src_shard_range.end_marker,
                     include_deleted=include_deleted,
                     since_row=since_row)
+                self.debug(broker, 'got %s rows (deleted=%s) in %ss',
+                           len(objects), include_deleted, time.time() - start)
                 if objects:
-                    self.logger.debug('got %s objects from %s in %ss',
-                                      len(objects), broker.db_file,
-                                      time.time() - start)
                     yield objects, info

-                if len(objects) <
+                if len(objects) < batch_size:
                     break
                 marker = objects[-1]['name']

     def yield_objects_to_shard_range(self, broker, src_shard_range,
                                      dest_shard_ranges):
         """
-        Iterates through all
-        destination shard ranges provided by the ``
-        Yields tuples of (object
-
-
+        Iterates through all object rows in ``src_shard_range`` to place them
+        in destination shard ranges provided by the ``dest_shard_ranges``
+        function. Yields tuples of ``(batch of object rows, destination shard
+        range in which those object rows belong, broker info)``.
+
+        If no destination shard range exists for a batch of object rows then
+        tuples are yielded of ``(batch of object rows, None, broker info)``.
+        This indicates to the caller that there are a non-zero number of object
+        rows for which no destination shard range was found.
+
+        Note that the same destination shard range may be referenced in more
+        than one yielded tuple.

         :param broker: A :class:`~swift.container.backend.ContainerBroker`.
         :param src_shard_range: A :class:`~swift.common.utils.ShardRange`
             describing the source range.
         :param dest_shard_ranges: A function which should return a list of
-            destination shard ranges in
-
-
+            destination shard ranges sorted in the order defined by
+            :meth:`~swift.common.utils.ShardRange.sort_key`.
+        :return: a generator of tuples of ``(object row list, shard range,
+            broker info dict)`` where ``shard_range`` may be ``None``.
         """
-
-
-
-
+        # calling dest_shard_ranges() may result in a request to fetch shard
+        # ranges, so first check that the broker actually has misplaced object
+        # rows in the source namespace
+        for _ in self.yield_objects(broker, src_shard_range, batch_size=1):
+            break
+        else:
+            return

-
-
-
-
-
-
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-                # yield the objects in current dest_shard_range
-                yield (objs[last_index:next_index],
-                       dest_shard_range,
-                       info)
-                last_index = next_index
-                dest_shard_range = next_or_none(dest_shard_range_iter)
-                next_index += 1
-
-            if next_index != last_index:
-                # yield tail of current batch of objects
-                # NB there may be more objects for the current
-                # dest_shard_range in the next batch from yield_objects
-                yield (objs[last_index:next_index],
-                       None if unplaced else dest_shard_range,
-                       info)
+        dest_shard_range_iter = iter(dest_shard_ranges())
+        src_shard_range_marker = src_shard_range.lower
+        for dest_shard_range in dest_shard_range_iter:
+            if dest_shard_range.upper <= src_shard_range.lower:
+                continue
+
+            if dest_shard_range.lower > src_shard_range_marker:
+                # no destination for a sub-namespace of the source namespace
+                sub_src_range = src_shard_range.copy(
+                    lower=src_shard_range_marker, upper=dest_shard_range.lower)
+                for objs, info in self.yield_objects(broker, sub_src_range):
+                    yield objs, None, info
+
+            sub_src_range = src_shard_range.copy(
+                lower=max(dest_shard_range.lower, src_shard_range.lower),
+                upper=min(dest_shard_range.upper, src_shard_range.upper))
+            for objs, info in self.yield_objects(broker, sub_src_range):
+                yield objs, dest_shard_range, info
+
+            src_shard_range_marker = dest_shard_range.upper
+            if dest_shard_range.upper >= src_shard_range.upper:
+                # the entire source namespace has been traversed
+                break
+        else:
+            # dest_shard_ranges_iter was exhausted before reaching the end of
+            # the source namespace
+            sub_src_range = src_shard_range.copy(lower=src_shard_range_marker)
+            for objs, info in self.yield_objects(broker, sub_src_range):
+                yield objs, None, info

     def _post_replicate_hook(self, broker, info, responses):
         # override superclass behaviour
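The new `yield_objects_to_shard_range` walks the source namespace against a sorted list of destination ranges, yielding uncovered sub-spans with a `None` destination. A simplified sketch of that walk over integer bounds (it assumes the destination ranges overlap the source namespace, as the sharder's fetcher provides):

```python
def partition_source(src_lower, src_upper, dest_ranges):
    """Assign sub-spans of [src_lower, src_upper) to sorted (lower, upper)
    destination pairs; spans with no destination map to None."""
    marker = src_lower
    for lower, upper in dest_ranges:
        if upper <= src_lower:
            continue                                  # entirely below source
        if lower > marker:
            yield (marker, min(lower, src_upper)), None   # uncovered gap
        yield (max(lower, src_lower), min(upper, src_upper)), (lower, upper)
        marker = upper
        if upper >= src_upper:
            return                                    # source fully covered
    if marker < src_upper:
        yield (marker, src_upper), None               # tail with no destination

for span, dest in partition_source(0, 50, [(10, 20), (30, 40)]):
    print(span, '->', dest)
# (0, 10) -> None
# (10, 20) -> (10, 20)
# (20, 30) -> None
# (30, 40) -> (30, 40)
# (40, 50) -> None
```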
@@ -886,11 +1687,15 @@ class ContainerSharder(ContainerReplicator):
                               dest_broker, node_id, info):
         success, responses = self._replicate_object(
             part, dest_broker.db_file, node_id)
+        replication_successes = responses.count(True)
         quorum = quorum_size(self.ring.replica_count)
-        if not success and
-            self.
-                'Failed to sufficiently replicate misplaced objects
-
+        if not success and replication_successes < quorum:
+            self.warning(
+                broker, 'Failed to sufficiently replicate misplaced objects '
+                'shard %s in state %s: %s successes, %s required '
+                '(not removing objects), shard db: %s',
+                dest_shard_range.name, dest_shard_range.state_text,
+                replication_successes, quorum, dest_broker.db_file)
             return False

         if broker.get_info()['id'] != info['id']:
@@ -908,9 +1713,9 @@ class ContainerSharder(ContainerReplicator):
             success = True

         if not success:
-            self.
-
-
+            self.warning(broker, 'Refused to remove misplaced objects for '
+                         'dest %s in state %s',
+                         dest_shard_range.name, dest_shard_range.state_text)
         return success

     def _move_objects(self, src_broker, src_shard_range, policy_index,
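`_replicate_and_delete` above only removes misplaced rows once a majority of replicas accepted the destination DB. A minimal sketch of that quorum arithmetic, assuming the usual majority definition:

```python
def quorum_size(replica_count):
    # majority quorum: more than half of the replicas
    return replica_count // 2 + 1

def replicated_ok(responses, replica_count):
    """responses is a list of per-node True/False results; replication is
    sufficient when a majority of replicas accepted the DB."""
    return responses.count(True) >= quorum_size(replica_count)

print(quorum_size(3))                          # 2
print(replicated_ok([True, False, True], 3))   # True
print(replicated_ok([True, False, False], 3))  # False
```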
@@ -928,16 +1733,19 @@ class ContainerSharder(ContainerReplicator):
                 continue

             if dest_shard_range.name == src_broker.path:
-                self.
-
+                self.debug(src_broker,
+                           'Skipping source as misplaced objects destination')
                 # in shrinking context, the misplaced objects might actually be
                 # correctly placed if the root has expanded this shard but this
                 # broker has not yet been updated
                 continue

             if dest_shard_range not in dest_brokers:
-                part, dest_broker, node_id,
-
+                part, dest_broker, node_id, put_timestamp = \
+                    self._get_shard_broker(
+                        dest_shard_range, src_broker.root_path, policy_index)
+                stat = 'db_exists' if put_timestamp is None else 'db_created'
+                self._increment_stat('misplaced', stat, statsd=True)
                 # save the broker info that was sampled prior to the *first*
                 # yielded objects for this destination
                 destination = {'part': part,
@@ -951,20 +1759,20 @@ class ContainerSharder(ContainerReplicator):
             placed += len(objs)

         if unplaced:
-            self.
-
-                'in %s' % (unplaced, src_broker.path))
+            self.warning(src_broker, 'Failed to find destination for at least '
+                         '%s misplaced objects', unplaced)

         # TODO: consider executing the replication jobs concurrently
         for dest_shard_range, dest_args in dest_brokers.items():
-            self.
-
+            self.debug(src_broker,
+                       'moving misplaced objects found in range %s',
+                       dest_shard_range)
             success &= self._replicate_and_delete(
                 src_broker, dest_shard_range, **dest_args)

-        self.
-        self.
-        return success, placed
+        self._update_stat('misplaced', 'placed', step=placed, statsd=True)
+        self._update_stat('misplaced', 'unplaced', step=unplaced, statsd=True)
+        return success, placed, unplaced

     def _make_shard_range_fetcher(self, broker, src_shard_range):
         # returns a function that will lazy load shard ranges on demand;
@@ -1005,12 +1813,12 @@ class ContainerSharder(ContainerReplicator):

     def _make_misplaced_object_bounds(self, broker):
         bounds = []
-
-        if
+        db_state = broker.get_db_state()
+        if db_state == SHARDED:
             # Anything in the object table is treated as a misplaced object.
             bounds.append(('', ''))

-        if not bounds and
+        if not bounds and db_state == SHARDING:
             # Objects outside of this container's own range are misplaced.
             # Objects in already cleaved shard ranges are also misplaced.
             cleave_context = CleavingContext.load(broker)
@@ -1038,8 +1846,7 @@ class ContainerSharder(ContainerReplicator):
         :return: True if all misplaced objects were sufficiently replicated to
             their correct shard containers, False otherwise
         """
-        self.
-            broker.path, broker.db_file)
+        self.debug(broker, 'Looking for misplaced objects')
         self._increment_stat('misplaced', 'attempted')
         src_broker = src_broker or broker
         if src_bounds is None:
@@ -1047,22 +1854,27 @@ class ContainerSharder(ContainerReplicator):
         # (ab)use ShardRange instances to encapsulate source namespaces
         src_ranges = [ShardRange('dont/care', Timestamp.now(), lower, upper)
                       for lower, upper in src_bounds]
-        self.
+        self.debug(broker, 'misplaced object source bounds %s', src_bounds)
         policy_index = broker.storage_policy_index
         success = True
-
+        num_placed = num_unplaced = 0
         for src_shard_range in src_ranges:
-            part_success,
+            part_success, part_placed, part_unplaced = self._move_objects(
                 src_broker, src_shard_range, policy_index,
                 self._make_shard_range_fetcher(broker, src_shard_range))
             success &= part_success
-
+            num_placed += part_placed
+            num_unplaced += part_unplaced

-        if
+        if num_placed or num_unplaced:
+            # the found stat records the number of DBs in which any misplaced
+            # rows were found, not the total number of misplaced rows
             self._increment_stat('misplaced', 'found', statsd=True)
-            self.
-
-        self.
+            self.debug(broker, 'Placed %s misplaced objects (%s unplaced)',
+                       num_placed, num_unplaced)
+        self._increment_stat('misplaced', 'success' if success else 'failure',
+                             statsd=True)
+        self.debug(broker, 'Finished handling misplaced objects')
         return success

     def _find_shard_ranges(self, broker):
@@ -1078,25 +1890,26 @@ class ContainerSharder(ContainerReplicator):
         own_shard_range = broker.get_own_shard_range()
         shard_ranges = broker.get_shard_ranges()
         if shard_ranges and shard_ranges[-1].upper >= own_shard_range.upper:
-            self.
+            self.debug(broker, 'Scan for shard ranges already completed')
             return 0

-        self.
+        self.info(broker, 'Starting scan for shard ranges')
         self._increment_stat('scanned', 'attempted')

         start = time.time()
         shard_data, last_found = broker.find_shard_ranges(
-            self.
-            existing_ranges=shard_ranges
+            self.rows_per_shard, limit=self.shard_scanner_batch_size,
+            existing_ranges=shard_ranges,
+            minimum_shard_size=self.minimum_shard_size)
         elapsed = time.time() - start

         if not shard_data:
             if last_found:
-                self.
+                self.info(broker, "Already found all shard ranges")
                 self._increment_stat('scanned', 'success', statsd=True)
             else:
                 # we didn't find anything
-                self.
+                self.warning(broker, "No shard ranges found")
                 self._increment_stat('scanned', 'failure', statsd=True)
             return 0

@@ -1104,14 +1917,14 @@ class ContainerSharder(ContainerReplicator):
             broker, shard_data, self.shards_account_prefix)
         broker.merge_shard_ranges(shard_ranges)
         num_found = len(shard_ranges)
-        self.
-
-        self.
+        self.info(broker, "Completed scan for shard ranges: %d found",
+                  num_found)
+        self._update_stat('scanned', 'found', step=num_found)
         self._min_stat('scanned', 'min_time', round(elapsed / num_found, 3))
         self._max_stat('scanned', 'max_time', round(elapsed / num_found, 3))

         if last_found:
-            self.
+            self.info(broker, "Final shard range reached.")
             self._increment_stat('scanned', 'success', statsd=True)
         return num_found

@@ -1119,26 +1932,34 @@ class ContainerSharder(ContainerReplicator):
         # Create shard containers that are ready to receive redirected object
         # updates. Do this now, so that redirection can begin immediately
        # without waiting for cleaving to complete.
-        found_ranges = broker.get_shard_ranges(states=ShardRange.FOUND)
+        found_ranges = broker.get_shard_ranges(states=[ShardRange.FOUND])
         created_ranges = []
         for shard_range in found_ranges:
             self._increment_stat('created', 'attempted')
             shard_range.update_state(ShardRange.CREATED)
             headers = {
                 'X-Backend-Storage-Policy-Index': broker.storage_policy_index,
-                'X-Container-Sysmeta-Shard-Root':
-
+                'X-Container-Sysmeta-Shard-Quoted-Root': quote(
+                    broker.root_path),
+                'X-Container-Sysmeta-Sharding': 'True',
+                'X-Backend-Auto-Create': 'True'}
+            # NB: we *used* to send along
+            #     'X-Container-Sysmeta-Shard-Root': broker.root_path
+            # but that isn't safe for container names with nulls or newlines
+            # (or possibly some other characters). We consciously *don't* make
+            # any attempt to set the old meta; during an upgrade, some shards
+            # may think they are in fact roots, but it cleans up well enough
+            # once everyone's upgraded.
             success = self._send_shard_ranges(
-                shard_range.account, shard_range.container,
+                broker, shard_range.account, shard_range.container,
                 [shard_range], headers=headers)
             if success:
-                self.
-
+                self.debug(broker, 'PUT new shard range container for %s',
+                           shard_range)
                 self._increment_stat('created', 'success', statsd=True)
             else:
-                self.
-
-                    shard_range, broker.path)
+                self.error(broker, 'PUT of new shard container %r failed',
+                           shard_range)
                 self._increment_stat('created', 'failure', statsd=True)
                 # break, not continue, because elsewhere it is assumed that
                 # finding and cleaving shard ranges progresses linearly, so we
@@ -1150,31 +1971,17 @@ class ContainerSharder(ContainerReplicator):
         if created_ranges:
             broker.merge_shard_ranges(created_ranges)
             if not broker.is_root_container():
-                self._send_shard_ranges(
-
-                self.
-
-                    len(created_ranges))
+                self._send_shard_ranges(broker, broker.root_account,
+                                        broker.root_container, created_ranges)
+            self.info(broker, "Completed creating %d shard range containers",
+                      len(created_ranges))
         return len(created_ranges)

-    def
-
-
-
-        self._increment_stat('cleaved', 'attempted')
+    def _cleave_shard_broker(self, broker, cleaving_context, shard_range,
+                             own_shard_range, shard_broker, put_timestamp,
+                             shard_part, node_id):
+        result = CLEAVE_SUCCESS
         start = time.time()
-        policy_index = broker.storage_policy_index
-        try:
-            shard_part, shard_broker, node_id, put_timestamp = \
-                self._get_shard_broker(shard_range, broker.root_path,
-                                       policy_index)
-        except DeviceUnavailable as duex:
-            self.logger.warning(str(duex))
-            self._increment_stat('cleaved', 'failure', statsd=True)
-            return CLEAVE_FAILED
-
-        own_shard_range = broker.get_own_shard_range()
-
         # only cleave from the retiring db - misplaced objects handler will
         # deal with any objects in the fresh db
         source_broker = broker.get_brokers()[0]
@@ -1193,23 +2000,15 @@ class ContainerSharder(ContainerReplicator):
                 since_row=sync_from_row):
             shard_broker.merge_items(objects)
         if objects is None:
-            self.
-
+            self.info(broker, "Cleaving %r - zero objects found",
+                      shard_range)
             if shard_broker.get_info()['put_timestamp'] == put_timestamp:
                 # This was just created; don't need to replicate this
                 # SR because there was nothing there. So cleanup and
                 # remove the shard_broker from its hand off location.
-                self.delete_db(shard_broker)
-                cleaving_context.cursor = shard_range.upper_str
-                cleaving_context.ranges_done += 1
-                cleaving_context.ranges_todo -= 1
-                if shard_range.upper >= own_shard_range.upper:
-                    # cleaving complete
-                    cleaving_context.cleaving_done = True
-                cleaving_context.store(broker)
                 # Because nothing was here we wont count it in the shard
                 # batch count.
-
+                result = CLEAVE_EMPTY
             # Else, it wasn't newly created by us, and
             # we don't know what's in it or why. Let it get
             # replicated and counted in the batch count.
@@ -1225,20 +2024,25 @@ class ContainerSharder(ContainerReplicator):
                 [{'sync_point': source_max_row, 'remote_id': source_db_id}] +
                 source_broker.get_syncs())
         else:
-            self.
-
+            self.debug(broker, "Cleaving %r - shard db already in sync",
+                       shard_range)

         replication_quorum = self.existing_shard_replication_quorum
-        if
-
-
-
-
-
-
-
-
-
+        if own_shard_range.state in ShardRange.SHRINKING_STATES:
+            if shard_range.includes(own_shard_range):
+                # When shrinking to a single acceptor that completely encloses
+                # this shard's namespace, include deleted own (donor) shard
+                # range in the replicated db so that when acceptor next updates
+                # root it will atomically update its namespace *and* delete the
+                # donor. This reduces the chance of a temporary listing gap if
+                # this shard fails to update the root with its SHRUNK/deleted
+                # state. Don't do this when sharding a shard or shrinking to
+                # multiple acceptors because in those cases the donor namespace
+                # should not be deleted until *all* shards are cleaved.
+                if own_shard_range.update_state(ShardRange.SHRUNK):
+                    own_shard_range.set_deleted()
+                    broker.merge_shard_ranges(own_shard_range)
+                shard_broker.merge_shard_ranges(own_shard_range)
         elif shard_range.state == ShardRange.CREATED:
             # The shard range object stats may have changed since the shard
             # range was found, so update with stats of objects actually
@@ -1247,53 +2051,74 @@ class ContainerSharder(ContainerReplicator):
             info = shard_broker.get_info()
             shard_range.update_meta(
                 info['object_count'], info['bytes_used'])
+            # Update state to CLEAVED; only do this when sharding, not when
+            # shrinking
            shard_range.update_state(ShardRange.CLEAVED)
            shard_broker.merge_shard_ranges(shard_range)
            replication_quorum = self.shard_replication_quorum

-
-
-
-
-
-
+        if result == CLEAVE_EMPTY:
+            self.delete_db(shard_broker)
+        else:  # result == CLEAVE_SUCCESS:
+            self.info(broker, 'Replicating new shard container %s for %s',
+                      quote(shard_broker.path), own_shard_range)
+
+            success, responses = self._replicate_object(
+                shard_part, shard_broker.db_file, node_id)
+
+            replication_successes = responses.count(True)
+            if (not success and (not responses or
+                                 replication_successes < replication_quorum)):
+                # insufficient replication or replication not even attempted;
+                # break because we don't want to progress the cleave cursor
+                # until each shard range has been successfully cleaved
+                self.warning(
+                    broker, 'Failed to sufficiently replicate cleaved shard '
+                    '%s in state %s: %s successes, %s required, '
+                    'shard db: %s',
+                    shard_broker.path, shard_range.state_text,
+                    replication_successes, replication_quorum,
+                    shard_broker.db_file)
+                self._increment_stat('cleaved', 'failure', statsd=True)
+                result = CLEAVE_FAILED
+            else:
+                elapsed = round(time.time() - start, 3)
+                self._min_stat('cleaved', 'min_time', elapsed)
+                self._max_stat('cleaved', 'max_time', elapsed)
+                self.info(broker, 'Cleaved %s in %gs', shard_range,
+                          elapsed)
+                self._increment_stat('cleaved', 'success', statsd=True)
+
+        if result in (CLEAVE_SUCCESS, CLEAVE_EMPTY):
+            broker.merge_shard_ranges(shard_range)
+            cleaving_context.range_done(shard_range.upper_str)
+            if shard_range.upper >= own_shard_range.upper:
+                # cleaving complete
+                cleaving_context.cleaving_done = True
+            cleaving_context.store(broker)
+        return result

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self._max_stat('cleaved', 'max_time', elapsed)
-        broker.merge_shard_ranges(shard_range)
-        cleaving_context.cursor = shard_range.upper_str
-        cleaving_context.ranges_done += 1
-        cleaving_context.ranges_todo -= 1
-        if shard_range.upper >= own_shard_range.upper:
-            # cleaving complete
-            cleaving_context.cleaving_done = True
-        cleaving_context.store(broker)
-        self.logger.info(
-            'Cleaved %s for shard range %s in %gs.',
-            broker.path, shard_range, elapsed)
-        self._increment_stat('cleaved', 'success', statsd=True)
-        return CLEAVE_SUCCESS
+    def _cleave_shard_range(self, broker, cleaving_context, shard_range,
+                            own_shard_range):
+        self.info(broker, "Cleaving from row %s into %s for %r",
+                  cleaving_context.last_cleave_to_row,
+                  quote(shard_range.name), shard_range)
+        self._increment_stat('cleaved', 'attempted')
+        policy_index = broker.storage_policy_index
+        shard_part, shard_broker, node_id, put_timestamp = \
+            self._get_shard_broker(shard_range, broker.root_path,
+                                   policy_index)
+        stat = 'db_exists' if put_timestamp is None else 'db_created'
+        self._increment_stat('cleaved', stat, statsd=True)
+        return self._cleave_shard_broker(
+            broker, cleaving_context, shard_range, own_shard_range,
+            shard_broker, put_timestamp, shard_part, node_id)

     def _cleave(self, broker):
         # Returns True if misplaced objects have been moved and the entire
         # container namespace has been successfully cleaved, False otherwise
         if broker.is_sharded():
-            self.
-                broker.account, broker.container)
+            self.debug(broker, 'Passing over already sharded container')
             return True

         cleaving_context = CleavingContext.load(broker)
@@ -1301,9 +2126,8 @@ class ContainerSharder(ContainerReplicator):
         # ensure any misplaced objects in the source broker are moved; note
         # that this invocation of _move_misplaced_objects is targetted at
         # the *retiring* db.
-        self.
-
-            broker.path)
+        self.debug(broker,
+                   'Moving any misplaced objects from sharding container')
         bounds = self._make_default_misplaced_object_bounds(broker)
         cleaving_context.misplaced_done = self._move_misplaced_objects(
             broker, src_broker=broker.get_brokers()[0],
@@ -1311,53 +2135,78 @@ class ContainerSharder(ContainerReplicator):
         cleaving_context.store(broker)
 
         if cleaving_context.cleaving_done:
-            self.
-                       broker.path)
+            self.debug(broker, 'Cleaving already complete for container')
             return cleaving_context.misplaced_done
 
-
+        shard_ranges = broker.get_shard_ranges(marker=cleaving_context.marker)
+        # Ignore shrinking shard ranges: we never want to cleave objects to a
+        # shrinking shard. Shrinking shard ranges are to be expected in a root;
+        # shrinking shard ranges (other than own shard range) are not normally
+        # expected in a shard but can occur if there is an overlapping shard
+        # range that has been discovered from the root.
+        ranges_todo = [sr for sr in shard_ranges
+                       if sr.state != ShardRange.SHRINKING]
         if cleaving_context.cursor:
-            # always update ranges_todo in case
-            #
+            # always update ranges_todo in case shard ranges have changed since
+            # last visit
            cleaving_context.ranges_todo = len(ranges_todo)
-            self.
-
-
-                       broker.path)
+            self.debug(broker, 'Continuing to cleave (%s done, %s todo)',
+                       cleaving_context.ranges_done,
+                       cleaving_context.ranges_todo)
         else:
             cleaving_context.start()
+            own_shard_range = broker.get_own_shard_range()
+            cleaving_context.cursor = own_shard_range.lower_str
             cleaving_context.ranges_todo = len(ranges_todo)
-            self.
-
+            self.info(broker, 'Starting to cleave (%s todo)',
+                      cleaving_context.ranges_todo)
+
+        own_shard_range = broker.get_own_shard_range(no_default=True)
+        if own_shard_range is None:
+            # A default should never be SHRINKING or SHRUNK but because we
+            # may write own_shard_range back to broker, let's make sure
+            # it can't be defaulted.
+            self.warning(broker, 'Failed to get own_shard_range')
+            ranges_todo = []  # skip cleaving
 
         ranges_done = []
         for shard_range in ranges_todo:
-            if
+            if cleaving_context.cleaving_done:
+                # note: there may still be ranges_todo, for example: if this
+                # shard is shrinking and has merged a root shard range in
+                # sharded state along with an active acceptor shard range, but
+                # the root range is irrelevant
                 break
-
-
-                                     ShardRange.ACTIVE):
-                cleave_result = self._cleave_shard_range(
-                    broker, cleaving_context, shard_range)
-                if cleave_result == CLEAVE_SUCCESS:
-                    ranges_done.append(shard_range)
-                    if len(ranges_done) == self.cleave_batch_size:
-                        break
-                elif cleave_result == CLEAVE_FAILED:
-                    break
-                # else, no errors, but no rows found either. keep going,
-                # and don't count it against our batch size
-            else:
-                self.logger.warning('Unexpected shard range state for cleave',
-                                    shard_range.state)
+
+            if len(ranges_done) == self.cleave_batch_size:
                 break
 
- [6 removed lines not captured in this view]
+            if shard_range.lower > cleaving_context.cursor:
+                self.info(broker, 'Stopped cleave at gap: %r - %r' %
+                          (cleaving_context.cursor, shard_range.lower))
+                break
+
+            if shard_range.state not in (ShardRange.CREATED,
+                                         ShardRange.CLEAVED,
+                                         ShardRange.ACTIVE):
+                self.info(broker, 'Stopped cleave at unready %s', shard_range)
+                break
+
+            cleave_result = self._cleave_shard_range(
+                broker, cleaving_context, shard_range, own_shard_range)
+
+            if cleave_result == CLEAVE_SUCCESS:
+                ranges_done.append(shard_range)
+            elif cleave_result == CLEAVE_FAILED:
+                break
+            # else: CLEAVE_EMPTY: no errors, but no rows found either. keep
+            # going, and don't count it against our batch size
+
+        # _cleave_shard_range always store()s the context on success; *also* do
+        # that here in case we hit a failure right off the bat or ended loop
+        # with skipped ranges
+        cleaving_context.store(broker)
+        self.debug(broker, 'Cleaved %s shard ranges', len(ranges_done))
         return (cleaving_context.misplaced_done and
                 cleaving_context.cleaving_done)
 
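The rewritten `_cleave` loop above layers several guards before each cleave: it skips SHRINKING ranges entirely, stops at the batch size, stops at a namespace gap (the next range's lower bound is beyond the cursor), and stops at ranges that are not yet in a ready state. A condensed, self-contained sketch of that control flow using toy tuples in place of broker DBs and `ShardRange` objects:

```python
# Toy namespace ranges: (lower, upper, state); stand-ins for ShardRange.
READY_STATES = {'created', 'cleaved', 'active'}


def cleave_batch(ranges, cursor, batch_size):
    """Return (ranges cleaved this pass, new cursor)."""
    # never cleave objects to a shrinking range
    todo = [r for r in ranges if r[2] != 'shrinking']
    done = []
    for lower, upper, state in todo:
        if len(done) == batch_size:
            break                      # batch limit reached for this visit
        if lower > cursor:
            break                      # gap in the namespace; stop here
        if state not in READY_STATES:
            break                      # next range is not ready to receive rows
        done.append((lower, upper, state))
        cursor = upper                 # advance past the cleaved range
    return done, cursor


ranges = [('', 'g', 'created'), ('g', 'p', 'active'), ('p', 'z', 'shrinking')]
print(cleave_batch(ranges, cursor='', batch_size=2))
```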
@@ -1367,13 +2216,23 @@ class ContainerSharder(ContainerReplicator):
             # Move all CLEAVED shards to ACTIVE state and if a shard then
             # delete own shard range; these changes will be simultaneously
             # reported in the next update to the root container.
- [6 removed lines not captured in this view]
+            own_shard_range = broker.get_own_shard_range(no_default=True)
+            if own_shard_range is None:
+                # This is more of a belts and braces, not sure we could even
+                # get this far with without an own_shard_range. But because
+                # we will be writing own_shard_range back, we need to make sure
+                self.warning(broker, 'Failed to get own_shard_range')
+                return False
             own_shard_range.update_meta(0, 0)
+            if own_shard_range.state in ShardRange.SHRINKING_STATES:
+                own_shard_range.update_state(ShardRange.SHRUNK)
+                modified_shard_ranges = []
+            else:
+                own_shard_range.update_state(ShardRange.SHARDED)
+                modified_shard_ranges = broker.get_shard_ranges(
+                    states=[ShardRange.CLEAVED])
+            for sr in modified_shard_ranges:
+                sr.update_state(ShardRange.ACTIVE)
             if (not broker.is_root_container() and not
                     own_shard_range.deleted):
                 own_shard_range = own_shard_range.copy(
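The added block above chooses the final state for the retiring container's own shard range: SHRUNK if it was shrinking, otherwise SHARDED, in which case all CLEAVED shard ranges are promoted to ACTIVE in the same update. A small sketch of that decision, using plain strings instead of swift's `ShardRange` state machine:

```python
SHRINKING_STATES = {'shrinking', 'shrunk'}


def finalize_states(own_state, shard_states):
    """Return (new own state, updated shard states) when cleaving completes."""
    if own_state in SHRINKING_STATES:
        # a shrinking container just becomes shrunk; its shards are untouched
        return 'shrunk', shard_states
    # a sharding container becomes sharded and its cleaved shards go active
    promoted = ['active' if s == 'cleaved' else s for s in shard_states]
    return 'sharded', promoted


print(finalize_states('sharding', ['cleaved', 'cleaved', 'created']))
print(finalize_states('shrinking', ['active']))
```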
@@ -1381,16 +2240,12 @@ class ContainerSharder(ContainerReplicator):
                 modified_shard_ranges.append(own_shard_range)
             broker.merge_shard_ranges(modified_shard_ranges)
             if broker.set_sharded_state():
-                cleaving_context.delete(broker)
                 return True
             else:
-                self.
-                    'Failed to remove retiring db file for %s',
-                    broker.path)
+                self.warning(broker, 'Failed to remove retiring db file')
         else:
-            self.
-
-                % (broker.db_files[0], dict(cleaving_context)))
+            self.warning(broker, 'Repeat cleaving required, context: %s',
+                         dict(cleaving_context))
             cleaving_context.reset()
             cleaving_context.store(broker)
 
@@ -1400,102 +2255,138 @@ class ContainerSharder(ContainerReplicator):
         candidates = find_sharding_candidates(
             broker, self.shard_container_threshold, shard_ranges)
         if candidates:
-            self.
-
+            self.debug(broker, 'Identified %s sharding candidates',
+                       len(candidates))
             broker.merge_shard_ranges(candidates)
 
     def _find_and_enable_shrinking_candidates(self, broker):
         if not broker.is_sharded():
-            self.
-                broker.path)
+            self.warning(broker, 'Cannot shrink a not yet sharded container')
             return
 
-
-            broker, self.
-
+        compactible_sequences = find_compactible_shard_sequences(
+            broker, self.shrink_threshold, self.expansion_limit,
+            self.max_shrinking, self.max_expanding, include_shrinking=True)
+        self.debug(broker, 'Found %s compactible sequences of length(s) %s' %
+                   (len(compactible_sequences),
+                    [len(s) for s in compactible_sequences]))
+        process_compactible_shard_sequences(broker, compactible_sequences)
         own_shard_range = broker.get_own_shard_range()
-        for
-
-
-
+        for sequence in compactible_sequences:
+            acceptor = sequence[-1]
+            donors = ShardRangeList(sequence[:-1])
+            self.debug(broker,
+                       'shrinking %d objects from %d shard ranges into %s' %
+                       (donors.object_count, len(donors), acceptor))
             if acceptor.name != own_shard_range.name:
-                self._send_shard_ranges(
-
-                acceptor.increment_meta(
-            else:
-                # no need to change namespace or stats
-                acceptor.update_state(ShardRange.ACTIVE,
-                                      state_timestamp=Timestamp.now())
+                self._send_shard_ranges(broker, acceptor.account,
+                                        acceptor.container, [acceptor])
+                acceptor.increment_meta(donors.object_count, donors.bytes_used)
             # Now send a copy of the expanded acceptor, with an updated
-            # timestamp, to
+            # timestamp, to each donor container. This forces each donor to
             # asynchronously cleave its entire contents to the acceptor and
             # delete itself. The donor will pass its own deleted shard range to
             # the acceptor when cleaving. Subsequent updates from the donor or
             # the acceptor will then update the root to have the deleted donor
             # shard range.
-
-
+            for donor in donors:
+                self._send_shard_ranges(broker, donor.account,
+                                        donor.container, [donor, acceptor])
 
     def _update_root_container(self, broker):
         own_shard_range = broker.get_own_shard_range(no_default=True)
         if not own_shard_range:
             return
 
-        #
-
+        # Don't update the osr stats including tombstones unless its CLEAVED+
+        if own_shard_range.state in SHARD_UPDATE_STAT_STATES:
+            # do a reclaim *now* in order to get best estimate of tombstone
+            # count that is consistent with the current object_count
+            reclaimer = self._reclaim(broker)
+            tombstones = reclaimer.get_tombstone_count()
+            self.debug(broker, 'tombstones = %d', tombstones)
+            # shrinking candidates are found in the root DB so that's the only
+            # place we need up to date tombstone stats.
+            own_shard_range.update_tombstones(tombstones)
+            update_own_shard_range_stats(broker, own_shard_range)
+
+            if not own_shard_range.reported:
+                broker.merge_shard_ranges(own_shard_range)
+
+        # we can't use `state not in SHARD_UPDATE_STAT_STATES` to return
+        # because there are cases we still want to update root even if the
+        # stats are wrong. Such as it's a new shard or something else has
+        # decided to remove the latch to update root.
+        if own_shard_range.reported:
+            return
+
         # now get a consistent list of own and other shard ranges
         shard_ranges = broker.get_shard_ranges(
             include_own=True,
             include_deleted=True)
         # send everything
-        self._send_shard_ranges(
-
-
+        if self._send_shard_ranges(broker, broker.root_account,
+                                   broker.root_container, shard_ranges,
+                                   {'Referer': quote(broker.path)}):
+            # on success, mark ourselves as reported so we don't keep
+            # hammering the root
+            own_shard_range.reported = True
+            broker.merge_shard_ranges(own_shard_range)
+            self.debug(broker, 'updated root objs=%d, tombstones=%s',
+                       own_shard_range.object_count,
+                       own_shard_range.tombstones)
 
     def _process_broker(self, broker, node, part):
         broker.get_info()  # make sure account/container are populated
-
-
-
+        db_state = broker.get_db_state()
+        is_deleted = broker.is_deleted()
+        self.debug(broker, 'Starting processing, state %s%s', db_state,
+                   ' (deleted)' if is_deleted else '')
 
         if not self._audit_container(broker):
             return
 
         # now look and deal with misplaced objects.
+        move_start_ts = time.time()
         self._move_misplaced_objects(broker)
+        self.logger.timing_since(
+            'sharder.sharding.move_misplaced', move_start_ts)
 
-
-            # This container is deleted so we can skip it. We still want
-            # deleted containers to go via misplaced items because they may
-            # have new objects sitting in them that may need to move.
-            return
+        is_leader = node['index'] == 0 and self.auto_shard and not is_deleted
 
-
-        if state in (UNSHARDED, COLLAPSED):
+        if db_state in (UNSHARDED, COLLAPSED):
             if is_leader and broker.is_root_container():
                 # bootstrap sharding of root container
+                own_shard_range = broker.get_own_shard_range()
+                update_own_shard_range_stats(broker, own_shard_range)
                 self._find_and_enable_sharding_candidates(
-                    broker, shard_ranges=[
+                    broker, shard_ranges=[own_shard_range])
 
             own_shard_range = broker.get_own_shard_range()
-            if own_shard_range.state in
-
-                                        ShardRange.SHARDED):
-                if broker.get_shard_ranges():
+            if own_shard_range.state in ShardRange.CLEAVING_STATES:
+                if broker.has_other_shard_ranges():
                     # container has been given shard ranges rather than
-                    # found them e.g. via replication or a shrink event
+                    # found them e.g. via replication or a shrink event,
+                    # or manually triggered cleaving.
+                    db_start_ts = time.time()
                     if broker.set_sharding_state():
-
+                        db_state = SHARDING
+                        self.info(broker, 'Kick off container cleaving, '
+                                  'own shard range in state %r',
+                                  own_shard_range.state_text)
+                        self.logger.timing_since(
+                            'sharder.sharding.set_state', db_start_ts)
                 elif is_leader:
                     if broker.set_sharding_state():
-
+                        db_state = SHARDING
                else:
-                    self.
-
-
-
+                    self.debug(broker,
+                               'Own shard range in state %r but no shard '
+                               'ranges and not leader; remaining unsharded',
+                               own_shard_range.state_text)
 
-        if
+        if db_state == SHARDING:
+            cleave_start_ts = time.time()
             if is_leader:
                 num_found = self._find_shard_ranges(broker)
             else:
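Worth noting in the `_update_root_container` hunk above: shard ranges are only pushed to the root when `own_shard_range.reported` is false, and the flag is flipped back to true only after a successful send, so a failed update is simply retried on the next cycle. A minimal sketch of that "report until acknowledged" latch, with a stubbed-out sender in place of `_send_shard_ranges`:

```python
class ToyShard:
    # Stand-in for the only piece of state the latch needs; not a ShardRange.
    def __init__(self):
        self.reported = False


def update_root(shard, send_to_root):
    if shard.reported:
        return False              # nothing new to tell the root
    if send_to_root():            # returns True only on success
        shard.reported = True     # latch: stop hammering the root
        return True
    return False                  # leave unreported; retried next cycle


shard = ToyShard()
print(update_root(shard, send_to_root=lambda: False))  # failed send -> still unreported
print(update_root(shard, send_to_root=lambda: True))   # success -> latched
print(update_root(shard, send_to_root=lambda: True))   # already reported -> skipped
```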
@@ -1510,38 +2401,53 @@ class ContainerSharder(ContainerReplicator):
 
             # always try to cleave any pending shard ranges
             cleave_complete = self._cleave(broker)
+            self.logger.timing_since(
+                'sharder.sharding.cleave', cleave_start_ts)
 
             if cleave_complete:
-                self.logger.info('Completed cleaving of %s', broker.path)
                 if self._complete_sharding(broker):
-
+                    db_state = SHARDED
                     self._increment_stat('visited', 'completed', statsd=True)
+                    self.info(broker, 'Completed cleaving, DB set to sharded '
+                              'state')
+                    self.logger.timing_since(
+                        'sharder.sharding.completed',
+                        float(broker.get_own_shard_range().epoch))
                 else:
-                    self.
-
+                    self.info(broker, 'Completed cleaving, DB remaining in '
+                              'sharding state')
+
+        if not broker.is_deleted():
+            if db_state == SHARDED and broker.is_root_container():
+                # look for shrink stats
+                send_start_ts = time.time()
+                self._identify_shrinking_candidate(broker, node)
+                if is_leader:
+                    self._find_and_enable_shrinking_candidates(broker)
+                    self._find_and_enable_sharding_candidates(broker)
+                    for shard_range in broker.get_shard_ranges(
+                            states=[ShardRange.SHARDING]):
+                        self._send_shard_ranges(broker, shard_range.account,
+                                                shard_range.container,
+                                                [shard_range])
+                self.logger.timing_since(
+                    'sharder.sharding.send_sr', send_start_ts)
 
- [15 removed lines not captured in this view]
-            # shards move to ACTIVE state and the sharded shard
-            # simultaneously become deleted.
-            self._update_root_container(broker)
-
-        self.logger.debug('Finished processing %s/%s state %s',
-                          broker.account, broker.container,
-                          broker.get_db_state())
+            if not broker.is_root_container():
+                # Update the root container with this container's shard range
+                # info; do this even when sharded in case previous attempts
+                # failed; don't do this if there is no own shard range. When
+                # sharding a shard, this is when the root will see the new
+                # shards move to ACTIVE state and the sharded shard
+                # simultaneously become deleted.
+                update_start_ts = time.time()
+                self._update_root_container(broker)
+                self.logger.timing_since(
+                    'sharder.sharding.update_root', update_start_ts)
+
+        self.debug(broker,
+                   'Finished processing, state %s%s',
+                   broker.get_db_state(), ' (deleted)' if is_deleted else '')
 
     def _one_shard_cycle(self, devices_to_shard, partitions_to_shard):
         """
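Several of the added lines in `_process_broker` wrap each phase (misplaced-object moves, the sharding-state transition, cleaving, shard-range sends, root updates) with `self.logger.timing_since(...)`, so the elapsed time of each phase is emitted as a statsd timing metric. A rough sketch of that pattern with a stand-in emitter (swift's logger provides `timing_since`; the `emit` callable here is only illustrative):

```python
import time


def timing_since(metric, start_ts, emit=print):
    # emit the elapsed milliseconds for `metric`, statsd-style
    emit('%s:%dms' % (metric, int((time.time() - start_ts) * 1000)))


move_start_ts = time.time()
time.sleep(0.01)                       # stand-in for _move_misplaced_objects()
timing_since('sharder.sharding.move_misplaced', move_start_ts)

cleave_start_ts = time.time()
time.sleep(0.01)                       # stand-in for _cleave()
timing_since('sharder.sharding.cleave', cleave_start_ts)
```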
@@ -1555,6 +2461,7 @@ class ContainerSharder(ContainerReplicator):
         - if not a root container, reports shard range stats to the root
           container
         """
+
         self.logger.info('Container sharder cycle starting, auto-sharding %s',
                         self.auto_shard)
         if isinstance(devices_to_shard, (list, tuple)):
@@ -1564,9 +2471,9 @@ class ContainerSharder(ContainerReplicator):
             self.logger.info('(Override partitions: %s)',
                              ', '.join(str(p) for p in partitions_to_shard))
         self._zero_stats()
-        self._local_device_ids =
+        self._local_device_ids = {}
         dirs = []
-        self.ips = whataremyips(
+        self.ips = whataremyips(self.bind_ip)
         for node in self.ring.devs:
             device_path = self._check_node(node)
             if not device_path:
@@ -1575,7 +2482,7 @@ class ContainerSharder(ContainerReplicator):
             if os.path.isdir(datadir):
                 # Populate self._local_device_ids so we can find devices for
                 # shard containers later
-                self._local_device_ids
+                self._local_device_ids[node['id']] = node
                 if node['device'] not in devices_to_shard:
                     continue
                 part_filt = self._partition_dir_filter(
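The one-line change above turns `_local_device_ids` into an id-to-node mapping, so the full ring node entry can be looked up later when placing shard containers. In isolation the pattern is just a dict keyed by device id (the node dicts below are illustrative, not swift's ring API):

```python
# Illustrative ring device entries
ring_devs = [
    {'id': 0, 'device': 'sda', 'ip': '10.0.0.1'},
    {'id': 1, 'device': 'sdb', 'ip': '10.0.0.2'},
]

# id -> node mapping, populated as local devices are discovered
local_device_ids = {node['id']: node for node in ring_devs}

print(local_device_ids[1]['device'])   # look up the node entry by id later
```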
@@ -1583,7 +2490,7 @@ class ContainerSharder(ContainerReplicator):
                     partitions_to_shard)
                 dirs.append((datadir, node, part_filt))
         if not dirs:
-            self.logger.
+            self.logger.info('Found no containers directories')
         for part, path, node in self.roundrobin_datadirs(dirs):
             # NB: get_part_nodes always provides an 'index' key;
             # this will be used in leader selection
@@ -1608,36 +2515,47 @@ class ContainerSharder(ContainerReplicator):
                     self._increment_stat('visited', 'skipped')
             except (Exception, Timeout) as err:
                 self._increment_stat('visited', 'failure', statsd=True)
-                self.
-
+                self.exception(broker, 'Unhandled exception while processing: '
+                               '%s', err)
                 error = err
             try:
                 self._record_sharding_progress(broker, node, error)
             except (Exception, Timeout) as error:
-                self.
-
-                    path, error)
+                self.exception(broker, 'Unhandled exception while dumping '
+                               'progress: %s', error)
             self._periodic_report_stats()
 
         self._report_stats()
 
+    @contextmanager
+    def _set_auto_shard_from_command_line(self, **kwargs):
+        conf_auto_shard = self.auto_shard
+        auto_shard = kwargs.get('auto_shard', None)
+        if auto_shard is not None:
+            self.auto_shard = config_true_value(auto_shard)
+        try:
+            yield
+        finally:
+            self.auto_shard = conf_auto_shard
+
     def run_forever(self, *args, **kwargs):
         """Run the container sharder until stopped."""
-        self.
- [14 removed lines not captured in this view]
+        with self._set_auto_shard_from_command_line(**kwargs):
+            self.reported = time.time()
+            time.sleep(random() * self.interval)
+            while True:
+                begin = time.time()
+                try:
+                    self._one_shard_cycle(devices_to_shard=Everything(),
+                                          partitions_to_shard=Everything())
+                except (Exception, Timeout):
+                    self.logger.increment('errors')
+                    self.logger.exception('Exception in sharder')
+                elapsed = time.time() - begin
+                self.logger.info(
+                    'Container sharder cycle completed: %.02fs', elapsed)
+                if elapsed < self.interval:
+                    time.sleep(self.interval - elapsed)
 
     def run_once(self, *args, **kwargs):
         """Run the container sharder once."""
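The new `_set_auto_shard_from_command_line` helper is a context manager that temporarily overrides `self.auto_shard` from a command-line keyword and restores the configured value afterwards, and `run_forever` now sleeps only for whatever is left of the interval after a cycle. Both patterns in a compact, standalone sketch (a toy daemon, not the sharder itself):

```python
import time
from contextlib import contextmanager


class ToyDaemon:
    def __init__(self, interval=2, auto_shard=False):
        self.interval = interval
        self.auto_shard = auto_shard

    @contextmanager
    def _override_auto_shard(self, **kwargs):
        # temporarily apply a command-line override, then restore the config value
        conf_auto_shard = self.auto_shard
        override = kwargs.get('auto_shard')
        if override is not None:
            self.auto_shard = bool(override)
        try:
            yield
        finally:
            self.auto_shard = conf_auto_shard

    def run_cycles(self, cycles=2, **kwargs):
        with self._override_auto_shard(**kwargs):
            for _ in range(cycles):
                begin = time.time()
                # ... one shard cycle would run here ...
                elapsed = time.time() - begin
                if elapsed < self.interval:
                    # pad the cycle out to the full interval
                    time.sleep(self.interval - elapsed)


ToyDaemon(interval=0.01).run_cycles(auto_shard=True)
```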
@@ -1645,9 +2563,32 @@ class ContainerSharder(ContainerReplicator):
         override_options = parse_override_options(once=True, **kwargs)
         devices_to_shard = override_options.devices or Everything()
         partitions_to_shard = override_options.partitions or Everything()
- [6 removed lines not captured in this view]
+        with self._set_auto_shard_from_command_line(**kwargs):
+            begin = self.reported = time.time()
+            self._one_shard_cycle(devices_to_shard=devices_to_shard,
+                                  partitions_to_shard=partitions_to_shard)
+            elapsed = time.time() - begin
+            self.logger.info(
+                'Container sharder "once" mode completed: %.02fs', elapsed)
+
+
+def main():
+    parser = OptionParser("%prog CONFIG [options]")
+    parser.add_option('-d', '--devices',
+                      help='Shard containers only on given devices. '
+                           'Comma-separated list. '
+                           'Only has effect if --once is used.')
+    parser.add_option('-p', '--partitions',
+                      help='Shard containers only in given partitions. '
+                           'Comma-separated list. '
+                           'Only has effect if --once is used.')
+    parser.add_option('--no-auto-shard', action='store_false',
+                      dest='auto_shard', default=None,
+                      help='Disable auto-sharding. Overrides the auto_shard '
+                           'value in the config file.')
+    conf_file, options = parse_options(parser=parser, once=True)
+    run_daemon(ContainerSharder, conf_file, **options)
+
+
+if __name__ == '__main__':
+    main()