swift 2.23.3-py3-none-any.whl → 2.35.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swift/__init__.py +29 -50
- swift/account/auditor.py +21 -118
- swift/account/backend.py +33 -28
- swift/account/reaper.py +37 -28
- swift/account/replicator.py +22 -0
- swift/account/server.py +60 -26
- swift/account/utils.py +28 -11
- swift-2.23.3.data/scripts/swift-account-audit → swift/cli/account_audit.py +23 -13
- swift-2.23.3.data/scripts/swift-config → swift/cli/config.py +2 -2
- swift/cli/container_deleter.py +5 -11
- swift-2.23.3.data/scripts/swift-dispersion-populate → swift/cli/dispersion_populate.py +8 -7
- swift/cli/dispersion_report.py +10 -9
- swift-2.23.3.data/scripts/swift-drive-audit → swift/cli/drive_audit.py +63 -21
- swift/cli/form_signature.py +3 -7
- swift-2.23.3.data/scripts/swift-get-nodes → swift/cli/get_nodes.py +8 -2
- swift/cli/info.py +154 -14
- swift/cli/manage_shard_ranges.py +705 -37
- swift-2.23.3.data/scripts/swift-oldies → swift/cli/oldies.py +25 -14
- swift-2.23.3.data/scripts/swift-orphans → swift/cli/orphans.py +7 -3
- swift/cli/recon.py +196 -67
- swift-2.23.3.data/scripts/swift-recon-cron → swift/cli/recon_cron.py +17 -20
- swift-2.23.3.data/scripts/swift-reconciler-enqueue → swift/cli/reconciler_enqueue.py +2 -3
- swift/cli/relinker.py +807 -126
- swift/cli/reload.py +135 -0
- swift/cli/ringbuilder.py +217 -20
- swift/cli/ringcomposer.py +0 -1
- swift/cli/shard-info.py +4 -3
- swift/common/base_storage_server.py +9 -20
- swift/common/bufferedhttp.py +48 -74
- swift/common/constraints.py +20 -15
- swift/common/container_sync_realms.py +9 -11
- swift/common/daemon.py +25 -8
- swift/common/db.py +195 -128
- swift/common/db_auditor.py +168 -0
- swift/common/db_replicator.py +95 -55
- swift/common/digest.py +141 -0
- swift/common/direct_client.py +144 -33
- swift/common/error_limiter.py +93 -0
- swift/common/exceptions.py +25 -1
- swift/common/header_key_dict.py +2 -9
- swift/common/http_protocol.py +373 -0
- swift/common/internal_client.py +129 -59
- swift/common/linkat.py +3 -4
- swift/common/manager.py +284 -67
- swift/common/memcached.py +390 -145
- swift/common/middleware/__init__.py +4 -0
- swift/common/middleware/account_quotas.py +211 -46
- swift/common/middleware/acl.py +3 -8
- swift/common/middleware/backend_ratelimit.py +230 -0
- swift/common/middleware/bulk.py +22 -34
- swift/common/middleware/catch_errors.py +1 -3
- swift/common/middleware/cname_lookup.py +6 -11
- swift/common/middleware/container_quotas.py +1 -1
- swift/common/middleware/container_sync.py +39 -17
- swift/common/middleware/copy.py +12 -0
- swift/common/middleware/crossdomain.py +22 -9
- swift/common/middleware/crypto/__init__.py +2 -1
- swift/common/middleware/crypto/crypto_utils.py +11 -15
- swift/common/middleware/crypto/decrypter.py +28 -11
- swift/common/middleware/crypto/encrypter.py +12 -17
- swift/common/middleware/crypto/keymaster.py +8 -15
- swift/common/middleware/crypto/kms_keymaster.py +2 -1
- swift/common/middleware/dlo.py +15 -11
- swift/common/middleware/domain_remap.py +5 -4
- swift/common/middleware/etag_quoter.py +128 -0
- swift/common/middleware/formpost.py +73 -70
- swift/common/middleware/gatekeeper.py +8 -1
- swift/common/middleware/keystoneauth.py +33 -3
- swift/common/middleware/list_endpoints.py +4 -4
- swift/common/middleware/listing_formats.py +85 -49
- swift/common/middleware/memcache.py +4 -95
- swift/common/middleware/name_check.py +3 -2
- swift/common/middleware/proxy_logging.py +160 -92
- swift/common/middleware/ratelimit.py +17 -10
- swift/common/middleware/read_only.py +6 -4
- swift/common/middleware/recon.py +59 -22
- swift/common/middleware/s3api/acl_handlers.py +25 -3
- swift/common/middleware/s3api/acl_utils.py +6 -1
- swift/common/middleware/s3api/controllers/__init__.py +6 -0
- swift/common/middleware/s3api/controllers/acl.py +3 -2
- swift/common/middleware/s3api/controllers/bucket.py +242 -137
- swift/common/middleware/s3api/controllers/logging.py +2 -2
- swift/common/middleware/s3api/controllers/multi_delete.py +43 -20
- swift/common/middleware/s3api/controllers/multi_upload.py +219 -133
- swift/common/middleware/s3api/controllers/obj.py +112 -8
- swift/common/middleware/s3api/controllers/object_lock.py +44 -0
- swift/common/middleware/s3api/controllers/s3_acl.py +2 -2
- swift/common/middleware/s3api/controllers/tagging.py +57 -0
- swift/common/middleware/s3api/controllers/versioning.py +36 -7
- swift/common/middleware/s3api/etree.py +22 -9
- swift/common/middleware/s3api/exception.py +0 -4
- swift/common/middleware/s3api/s3api.py +113 -41
- swift/common/middleware/s3api/s3request.py +384 -218
- swift/common/middleware/s3api/s3response.py +126 -23
- swift/common/middleware/s3api/s3token.py +16 -17
- swift/common/middleware/s3api/schema/delete.rng +1 -1
- swift/common/middleware/s3api/subresource.py +7 -10
- swift/common/middleware/s3api/utils.py +27 -10
- swift/common/middleware/slo.py +665 -358
- swift/common/middleware/staticweb.py +64 -37
- swift/common/middleware/symlink.py +51 -18
- swift/common/middleware/tempauth.py +76 -58
- swift/common/middleware/tempurl.py +191 -173
- swift/common/middleware/versioned_writes/__init__.py +51 -0
- swift/common/middleware/{versioned_writes.py → versioned_writes/legacy.py} +27 -26
- swift/common/middleware/versioned_writes/object_versioning.py +1482 -0
- swift/common/middleware/x_profile/exceptions.py +1 -4
- swift/common/middleware/x_profile/html_viewer.py +18 -19
- swift/common/middleware/x_profile/profile_model.py +1 -2
- swift/common/middleware/xprofile.py +10 -10
- swift-2.23.3.data/scripts/swift-container-server → swift/common/recon.py +13 -8
- swift/common/registry.py +147 -0
- swift/common/request_helpers.py +324 -57
- swift/common/ring/builder.py +67 -25
- swift/common/ring/composite_builder.py +1 -1
- swift/common/ring/ring.py +177 -51
- swift/common/ring/utils.py +1 -1
- swift/common/splice.py +10 -6
- swift/common/statsd_client.py +205 -0
- swift/common/storage_policy.py +49 -44
- swift/common/swob.py +86 -102
- swift/common/{utils.py → utils/__init__.py} +2163 -2772
- swift/common/utils/base.py +131 -0
- swift/common/utils/config.py +433 -0
- swift/common/utils/ipaddrs.py +256 -0
- swift/common/utils/libc.py +345 -0
- swift/common/utils/logs.py +859 -0
- swift/common/utils/timestamp.py +412 -0
- swift/common/wsgi.py +553 -535
- swift/container/auditor.py +14 -100
- swift/container/backend.py +490 -231
- swift/container/reconciler.py +126 -37
- swift/container/replicator.py +96 -22
- swift/container/server.py +358 -165
- swift/container/sharder.py +1540 -684
- swift/container/sync.py +94 -88
- swift/container/updater.py +53 -32
- swift/obj/auditor.py +153 -35
- swift/obj/diskfile.py +466 -217
- swift/obj/expirer.py +406 -124
- swift/obj/mem_diskfile.py +7 -4
- swift/obj/mem_server.py +1 -0
- swift/obj/reconstructor.py +523 -262
- swift/obj/replicator.py +249 -188
- swift/obj/server.py +207 -122
- swift/obj/ssync_receiver.py +145 -85
- swift/obj/ssync_sender.py +113 -54
- swift/obj/updater.py +652 -139
- swift/obj/watchers/__init__.py +0 -0
- swift/obj/watchers/dark_data.py +213 -0
- swift/proxy/controllers/account.py +11 -11
- swift/proxy/controllers/base.py +848 -604
- swift/proxy/controllers/container.py +433 -92
- swift/proxy/controllers/info.py +3 -2
- swift/proxy/controllers/obj.py +1000 -489
- swift/proxy/server.py +185 -112
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/AUTHORS +58 -11
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/METADATA +51 -56
- swift-2.35.0.dist-info/RECORD +201 -0
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/WHEEL +1 -1
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/entry_points.txt +43 -0
- swift-2.35.0.dist-info/pbr.json +1 -0
- swift/locale/de/LC_MESSAGES/swift.po +0 -1216
- swift/locale/en_GB/LC_MESSAGES/swift.po +0 -1207
- swift/locale/es/LC_MESSAGES/swift.po +0 -1085
- swift/locale/fr/LC_MESSAGES/swift.po +0 -909
- swift/locale/it/LC_MESSAGES/swift.po +0 -894
- swift/locale/ja/LC_MESSAGES/swift.po +0 -965
- swift/locale/ko_KR/LC_MESSAGES/swift.po +0 -964
- swift/locale/pt_BR/LC_MESSAGES/swift.po +0 -881
- swift/locale/ru/LC_MESSAGES/swift.po +0 -891
- swift/locale/tr_TR/LC_MESSAGES/swift.po +0 -832
- swift/locale/zh_CN/LC_MESSAGES/swift.po +0 -833
- swift/locale/zh_TW/LC_MESSAGES/swift.po +0 -838
- swift-2.23.3.data/scripts/swift-account-auditor +0 -23
- swift-2.23.3.data/scripts/swift-account-info +0 -51
- swift-2.23.3.data/scripts/swift-account-reaper +0 -23
- swift-2.23.3.data/scripts/swift-account-replicator +0 -34
- swift-2.23.3.data/scripts/swift-account-server +0 -23
- swift-2.23.3.data/scripts/swift-container-auditor +0 -23
- swift-2.23.3.data/scripts/swift-container-info +0 -55
- swift-2.23.3.data/scripts/swift-container-reconciler +0 -21
- swift-2.23.3.data/scripts/swift-container-replicator +0 -34
- swift-2.23.3.data/scripts/swift-container-sharder +0 -37
- swift-2.23.3.data/scripts/swift-container-sync +0 -23
- swift-2.23.3.data/scripts/swift-container-updater +0 -23
- swift-2.23.3.data/scripts/swift-dispersion-report +0 -24
- swift-2.23.3.data/scripts/swift-form-signature +0 -20
- swift-2.23.3.data/scripts/swift-init +0 -119
- swift-2.23.3.data/scripts/swift-object-auditor +0 -29
- swift-2.23.3.data/scripts/swift-object-expirer +0 -33
- swift-2.23.3.data/scripts/swift-object-info +0 -60
- swift-2.23.3.data/scripts/swift-object-reconstructor +0 -33
- swift-2.23.3.data/scripts/swift-object-relinker +0 -41
- swift-2.23.3.data/scripts/swift-object-replicator +0 -37
- swift-2.23.3.data/scripts/swift-object-server +0 -27
- swift-2.23.3.data/scripts/swift-object-updater +0 -23
- swift-2.23.3.data/scripts/swift-proxy-server +0 -23
- swift-2.23.3.data/scripts/swift-recon +0 -24
- swift-2.23.3.data/scripts/swift-ring-builder +0 -24
- swift-2.23.3.data/scripts/swift-ring-builder-analyzer +0 -22
- swift-2.23.3.data/scripts/swift-ring-composer +0 -22
- swift-2.23.3.dist-info/RECORD +0 -220
- swift-2.23.3.dist-info/pbr.json +0 -1
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/LICENSE +0 -0
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/top_level.txt +0 -0
swift/container/sharder.py
CHANGED
@@ -12,32 +12,38 @@
 # implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import collections
 import errno
 import json
+import logging
+import operator
+from optparse import OptionParser
 import time
 from collections import defaultdict
+from operator import itemgetter
 from random import random

 import os
-import
-from six.moves.urllib.parse import quote
+from urllib.parse import quote
 from eventlet import Timeout
+from contextlib import contextmanager

 from swift.common import internal_client
-from swift.common.constraints import check_drive
+from swift.common.constraints import check_drive, AUTO_CREATE_ACCOUNT_PREFIX
 from swift.common.direct_client import (direct_put_container,
                                         DirectClientException)
-from swift.common.
+from swift.common.daemon import run_daemon
+from swift.common.request_helpers import USE_REPLICATION_NETWORK_HEADER
 from swift.common.ring.utils import is_local_device
 from swift.common.swob import str_to_wsgi
 from swift.common.utils import get_logger, config_true_value, \
     dump_recon_cache, whataremyips, Timestamp, ShardRange, GreenAsyncPile, \
-
-
+    config_positive_int_value, quorum_size, parse_override_options, \
+    Everything, config_auto_int_value, ShardRangeList, config_percent_value, \
+    node_to_string, parse_options
 from swift.container.backend import ContainerBroker, \
     RECORD_TYPE_SHARD, UNSHARDED, SHARDING, SHARDED, COLLAPSED, \
-    SHARD_UPDATE_STATES
+    SHARD_UPDATE_STATES, sift_shard_ranges, SHARD_UPDATE_STAT_STATES
 from swift.container.replicator import ContainerReplicator

@@ -45,6 +51,8 @@ CLEAVE_SUCCESS = 0
 CLEAVE_FAILED = 1
 CLEAVE_EMPTY = 2

+DEFAULT_PERIODIC_WARNINGS_INTERVAL = 24 * 3600
+

 def sharding_enabled(broker):
     # NB all shards will by default have been created with
@@ -56,7 +64,7 @@ def sharding_enabled(broker):
     # if broker has been marked deleted it will have lost sysmeta, but we still
     # need to process the broker (for example, to shrink any shard ranges) so
     # fallback to checking if it has any shard ranges
-    if broker.
+    if broker.has_other_shard_ranges():
         return True
     return False

@@ -76,61 +84,166 @@ def make_shard_ranges(broker, shard_data, shards_account_prefix):
     return shard_ranges


-def
+def _find_discontinuity(paths, start):
+    # select the path that reaches furthest from start into the namespace
+    start_paths = [path for path in paths if path.lower == start]
+    start_paths.sort(key=lambda p: p.upper)
+    longest_start_path = start_paths[-1]
+    # search for paths that end further into the namespace (note: these must
+    # have a lower that differs from the start_path upper, otherwise they would
+    # be part of the start_path longer!)
+    end_paths = [path for path in paths
+                 if path.upper > longest_start_path.upper]
+    if end_paths:
+        # select those that begin nearest the start of the namespace
+        end_paths.sort(key=lambda p: p.lower)
+        end_paths = [p for p in end_paths if p.lower == end_paths[0].lower]
+        # select the longest of those
+        end_paths.sort(key=lambda p: p.upper)
+        longest_end_path = end_paths[-1]
+    else:
+        longest_end_path = None
+    return longest_start_path, longest_end_path
+
+
+def find_paths_with_gaps(shard_ranges, within_range=None):
     """
-    Find
-
+    Find gaps in the shard ranges and pairs of shard range paths that lead to
+    and from those gaps. For each gap a single pair of adjacent paths is
+    selected. The concatenation of all selected paths and gaps will span the
+    entire namespace with no overlaps.
+
+    :param shard_ranges: a list of instances of ShardRange.
+    :param within_range: an optional ShardRange that constrains the search
+        space; the method will only return gaps within this range. The default
+        is the entire namespace.
+    :return: A list of tuples of ``(start_path, gap_range, end_path)`` where
+        ``start_path`` is a list of ShardRanges leading to the gap,
+        ``gap_range`` is a ShardRange synthesized to describe the namespace
+        gap, and ``end_path`` is a list of ShardRanges leading from the gap.
+        When gaps start or end at the namespace minimum or maximum bounds,
+        ``start_path`` and ``end_path`` may be 'null' paths that contain a
+        single ShardRange covering either the minimum or maximum of the
+        namespace.
+    """
+    timestamp = Timestamp.now()
+    within_range = within_range or ShardRange('entire/namespace', timestamp)
+    shard_ranges = ShardRangeList(shard_ranges)
+    # note: find_paths results do not include shrinking ranges
+    paths = find_paths(shard_ranges)
+    # add paths covering no namespace at start and end of namespace to ensure
+    # that a start_path and end_path is always found even when there is a gap
+    # at the start or end of the namespace
+    null_start = ShardRange('null/start', timestamp,
+                            lower=ShardRange.MIN,
+                            upper=ShardRange.MIN,
+                            state=ShardRange.FOUND)
+    null_end = ShardRange('null/end', timestamp,
+                          lower=ShardRange.MAX,
+                          upper=ShardRange.MAX,
+                          state=ShardRange.FOUND)
+    paths.extend([ShardRangeList([null_start]), ShardRangeList([null_end])])
+    paths_with_gaps = []
+    start = null_start.lower
+    while True:
+        start_path, end_path = _find_discontinuity(paths, start)
+        if end_path is None:
+            # end of namespace reached
+            break
+        start = end_path.lower
+        if start_path.upper > end_path.lower:
+            # overlap
+            continue
+        gap_range = ShardRange('gap/index_%06d' % len(paths_with_gaps),
+                               timestamp,
+                               lower=start_path.upper,
+                               upper=end_path.lower)
+        if gap_range.overlaps(within_range):
+            paths_with_gaps.append((start_path, gap_range, end_path))
+    return paths_with_gaps

-
-
+
+def _is_parent_or_child(shard_range, other, time_period):
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    Test if shard range ``shard_range`` is the parent or a child of another
+    shard range ``other`` within past time period ``time_period``. This method
+    is limited to work only within the scope of the same user-facing account
+    (with and without shard prefix).
+
+    :param shard_range: an instance of ``ShardRange``.
+    :param other: an instance of ``ShardRange``.
+    :param time_period: the specified past time period in seconds. Value of
+        0 means all time in the past.
+    :return: True if ``shard_range`` is the parent or a child of ``other``
+        within past time period, False otherwise, assuming that they are within
+        the same account.
+    """
+    exclude_age = (time.time() - float(time_period)) if time_period > 0 else 0
+    if shard_range.is_child_of(other) and shard_range.timestamp >= exclude_age:
+        return True
+    if other.is_child_of(shard_range) and other.timestamp >= exclude_age:
+        return True
+    return False
+
+
+def find_overlapping_ranges(
+        shard_ranges, exclude_parent_child=False, time_period=0):
     """
     Find all pairs of overlapping ranges in the given list.

     :param shard_ranges: A list of :class:`~swift.utils.ShardRange`
+    :param exclude_parent_child: If True then overlapping pairs that have a
+        parent-child relationship within the past time period
+        ``time_period`` are excluded from the returned set. Default is
+        False.
+    :param time_period: the specified past time period in seconds. Value of
+        0 means all time in the past.
     :return: a set of tuples, each tuple containing ranges that overlap with
         each other.
     """
     result = set()
-    for shard_range in shard_ranges:
-
-
+    for i, shard_range in enumerate(shard_ranges):
+        if exclude_parent_child:
+            overlapping = [
+                sr for sr in shard_ranges[i + 1:]
+                if shard_range.name != sr.name and shard_range.overlaps(sr) and
+                not _is_parent_or_child(shard_range, sr, time_period)]
+        else:
+            overlapping = [
+                sr for sr in shard_ranges[i + 1:]
+                if shard_range.name != sr.name and shard_range.overlaps(sr)]
         if overlapping:
             overlapping.append(shard_range)
-            overlapping.sort()
+            overlapping.sort(key=ShardRange.sort_key)
             result.add(tuple(overlapping))

     return result


 def is_sharding_candidate(shard_range, threshold):
+    # note: use *object* count as the condition for sharding: tombstones will
+    # eventually be reclaimed so should not trigger sharding
     return (shard_range.state == ShardRange.ACTIVE and
             shard_range.object_count >= threshold)


+def is_shrinking_candidate(shard_range, shrink_threshold, expansion_limit,
+                           states=None):
+    # typically shrink_threshold < expansion_limit but check both just in case
+    # note: use *row* count (objects plus tombstones) as the condition for
+    # shrinking to avoid inadvertently moving large numbers of tombstones into
+    # an acceptor
+    states = states or (ShardRange.ACTIVE,)
+    return (shard_range.state in states and
+            shard_range.row_count < shrink_threshold and
+            shard_range.row_count <= expansion_limit)
+
+
 def find_sharding_candidates(broker, threshold, shard_ranges=None):
     # this should only execute on root containers; the goal is to find
     # large shard containers that should be sharded.
     # First cut is simple: assume root container shard usage stats are good
     # enough to make decision.
-    # TODO: object counts may well not be the appropriate metric for
-    # deciding to shrink because a shard with low object_count may have a
-    # large number of deleted object rows that will need to be merged with
-    # a neighbour. We may need to expose row count as well as object count.
     if shard_ranges is None:
         shard_ranges = broker.get_shard_ranges(states=[ShardRange.ACTIVE])
     candidates = []
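The gap- and overlap-detection helpers above are pure functions over lists of ShardRange objects, so they can be exercised outside the daemon. A minimal sketch follows; the account/container names and bounds are made up, and it assumes swift 2.35.0 is importable:

    # illustrative only: two ranges that leave the namespace ('m', 'p'] uncovered
    from swift.common.utils import ShardRange, Timestamp
    from swift.container.sharder import (
        find_overlapping_ranges, find_paths_with_gaps)

    now = Timestamp.now()
    shard_ranges = [
        ShardRange('.shards_a/c-0', now, lower='', upper='m',
                   state=ShardRange.ACTIVE),
        ShardRange('.shards_a/c-1', now, lower='p', upper='',
                   state=ShardRange.ACTIVE),
    ]

    # no pair of ranges overlaps, so this prints an empty set
    print(find_overlapping_ranges(shard_ranges))

    # one (start_path, gap_range, end_path) tuple describing the ('m', 'p'] gap
    for start_path, gap_range, end_path in find_paths_with_gaps(shard_ranges):
        print(gap_range.lower, gap_range.upper)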
@@ -144,63 +257,376 @@ def find_sharding_candidates(broker, threshold, shard_ranges=None):
     return candidates


-def find_shrinking_candidates(broker, shrink_threshold,
+def find_shrinking_candidates(broker, shrink_threshold, expansion_limit):
+    # this is only here to preserve a legacy public function signature;
+    # superseded by find_compactible_shard_sequences
+    merge_pairs = {}
+    # restrict search to sequences with one donor
+    results = find_compactible_shard_sequences(broker, shrink_threshold,
+                                               expansion_limit, 1, -1,
+                                               include_shrinking=True)
+    for sequence in results:
+        # map acceptor -> donor list
+        merge_pairs[sequence[-1]] = sequence[-2]
+    return merge_pairs
+
+
+def find_compactible_shard_sequences(broker,
+                                     shrink_threshold,
+                                     expansion_limit,
+                                     max_shrinking,
+                                     max_expanding,
+                                     include_shrinking=False):
+    """
+    Find sequences of shard ranges that could be compacted into a single
+    acceptor shard range.
+
+    This function does not modify shard ranges.
+
+    :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+    :param shrink_threshold: the number of rows below which a shard may be
+        considered for shrinking into another shard
+    :param expansion_limit: the maximum number of rows that an acceptor shard
+        range should have after other shard ranges have been compacted into it
+    :param max_shrinking: the maximum number of shard ranges that should be
+        compacted into each acceptor; -1 implies unlimited.
+    :param max_expanding: the maximum number of acceptors to be found (i.e. the
+        maximum number of sequences to be returned); -1 implies unlimited.
+    :param include_shrinking: if True then existing compactible sequences are
+        included in the results; default is False.
+    :returns: A list of :class:`~swift.common.utils.ShardRangeList` each
+        containing a sequence of neighbouring shard ranges that may be
+        compacted; the final shard range in the list is the acceptor
+    """
     # this should only execute on root containers that have sharded; the
     # goal is to find small shard containers that could be retired by
     # merging with a neighbour.
     # First cut is simple: assume root container shard usage stats are good
     # enough to make decision; only merge with upper neighbour so that
     # upper bounds never change (shard names include upper bound).
-    # TODO: object counts may well not be the appropriate metric for
-    # deciding to shrink because a shard with low object_count may have a
-    # large number of deleted object rows that will need to be merged with
-    # a neighbour. We may need to expose row count as well as object count.
     shard_ranges = broker.get_shard_ranges()
     own_shard_range = broker.get_own_shard_range()
-    if len(shard_ranges) == 1:
-        # special case to enable final shard to shrink into root
-        shard_ranges.append(own_shard_range)

-
-
-
-
-
-
-
-
-
+    def sequence_complete(sequence):
+        # a sequence is considered complete if any of the following are true:
+        #  - the final shard range has more objects than the shrink_threshold,
+        #    so should not be shrunk (this shard will be the acceptor)
+        #  - the max number of shard ranges to be compacted (max_shrinking) has
+        #    been reached
+        #  - the total number of objects in the sequence has reached the
+        #    expansion_limit
+        if (sequence and
+                (not is_shrinking_candidate(
+                    sequence[-1], shrink_threshold, expansion_limit,
+                    states=(ShardRange.ACTIVE, ShardRange.SHRINKING)) or
+                 0 < max_shrinking < len(sequence) or
+                 sequence.row_count >= expansion_limit)):
+            return True
+        return False
+
+    compactible_sequences = []
+    index = 0
+    expanding = 0
+    while ((max_expanding < 0 or expanding < max_expanding) and
+           index < len(shard_ranges)):
+        if not is_shrinking_candidate(
+                shard_ranges[index], shrink_threshold, expansion_limit,
+                states=(ShardRange.ACTIVE, ShardRange.SHRINKING)):
+            # this shard range cannot be the start of a new or existing
+            # compactible sequence, move on
+            index += 1
+            continue
+
+        # start of a *possible* sequence
+        sequence = ShardRangeList([shard_ranges[index]])
+        for shard_range in shard_ranges[index + 1:]:
+            # attempt to add contiguous shard ranges to the sequence
+            if sequence.upper < shard_range.lower:
+                # found a gap! break before consuming this range because it
+                # could become the first in the next sequence
+                break
+
+            if shard_range.state not in (ShardRange.ACTIVE,
+                                         ShardRange.SHRINKING):
+                # found? created? sharded? don't touch it
+                break
+
+            if shard_range.state == ShardRange.SHRINKING:
+                # already shrinking: add to sequence unconditionally
+                sequence.append(shard_range)
+            elif (sequence.row_count + shard_range.row_count
+                  <= expansion_limit):
+                # add to sequence: could be a donor or acceptor
+                sequence.append(shard_range)
+                if sequence_complete(sequence):
+                    break
+            else:
+                break
+
+        index += len(sequence)
+        if (index == len(shard_ranges) and
+                len(shard_ranges) == len(sequence) and
+                not sequence_complete(sequence) and
+                sequence.includes(own_shard_range)):
+            # special case: only one sequence has been found, which consumes
+            # all shard ranges, encompasses the entire namespace, has no more
+            # than expansion_limit records and whose shard ranges are all
+            # shrinkable; all the shards in the sequence can be shrunk to the
+            # root, so append own_shard_range to the sequence to act as an
+            # acceptor; note: only shrink to the root when *all* the remaining
+            # shard ranges can be simultaneously shrunk to the root.
+            sequence.append(own_shard_range)
+
+        if len(sequence) < 2 or sequence[-1].state not in (ShardRange.ACTIVE,
+                                                           ShardRange.SHARDED):
+            # this sequence doesn't end with a suitable acceptor shard range
             continue
-
-
-
+
+        # all valid sequences are counted against the max_expanding allowance
+        # even if the sequence is already shrinking
+        expanding += 1
+        if (all([sr.state != ShardRange.SHRINKING for sr in sequence]) or
+                include_shrinking):
+            compactible_sequences.append(sequence)
+
+    return compactible_sequences
+
+
+def finalize_shrinking(broker, acceptor_ranges, donor_ranges, timestamp):
+    """
+    Update donor shard ranges to shrinking state and merge donors and acceptors
+    to broker.
+
+    :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+    :param acceptor_ranges: A list of :class:`~swift.common.utils.ShardRange`
+        that are to be acceptors.
+    :param donor_ranges: A list of :class:`~swift.common.utils.ShardRange`
+        that are to be donors; these will have their state and timestamp
+        updated.
+    :param timestamp: timestamp to use when updating donor state
+    """
+    for donor in donor_ranges:
+        if donor.update_state(ShardRange.SHRINKING):
+            # Set donor state to shrinking state_timestamp defines new epoch
+            donor.epoch = donor.state_timestamp = timestamp
+    broker.merge_shard_ranges(acceptor_ranges + donor_ranges)
+
+
+def process_compactible_shard_sequences(broker, sequences):
+    """
+    Transform the given sequences of shard ranges into a list of acceptors and
+    a list of shrinking donors. For each given sequence the final ShardRange in
+    the sequence (the acceptor) is expanded to accommodate the other
+    ShardRanges in the sequence (the donors). The donors and acceptors are then
+    merged into the broker.
+
+    :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+    :param sequences: A list of :class:`~swift.common.utils.ShardRangeList`
+    """
+    timestamp = Timestamp.now()
+    acceptor_ranges = []
+    shrinking_ranges = []
+    for sequence in sequences:
+        donors = sequence[:-1]
+        shrinking_ranges.extend(donors)
+        # Update the acceptor container with its expanded bounds to prevent it
+        # treating objects cleaved from the donor as misplaced.
+        acceptor = sequence[-1]
+        if acceptor.expand(donors):
+            # Update the acceptor container with its expanded bounds to prevent
+            # it treating objects cleaved from the donor as misplaced.
+            acceptor.timestamp = timestamp
+        if acceptor.update_state(ShardRange.ACTIVE):
+            # Ensure acceptor state is ACTIVE (when acceptor is root)
+            acceptor.state_timestamp = timestamp
+        acceptor_ranges.append(acceptor)
+    finalize_shrinking(broker, acceptor_ranges, shrinking_ranges, timestamp)
+
+
+def find_paths(shard_ranges):
+    """
+    Returns a list of all continuous paths through the shard ranges. An
+    individual path may not necessarily span the entire namespace, but it will
+    span a continuous namespace without gaps.
+
+    :param shard_ranges: A list of :class:`~swift.common.utils.ShardRange`.
+    :return: A list of :class:`~swift.common.utils.ShardRangeList`.
+    """
+    # A node is a point in the namespace that is used as a bound of any shard
+    # range. Shard ranges form the edges between nodes.
+
+    # First build a dict mapping nodes to a list of edges that leave that node
+    # (in other words, shard ranges whose lower bound equals the node)
+    node_successors = collections.defaultdict(list)
+    for shard_range in shard_ranges:
+        if shard_range.state == ShardRange.SHRINKING:
+            # shrinking shards are not a viable edge in any path
             continue
-
-
+        node_successors[shard_range.lower].append(shard_range)
+
+    paths = []
+
+    def clone_path(other=None):
+        # create a new path, possibly cloning another path, and add it to the
+        # list of all paths through the shards
+        path = ShardRangeList() if other is None else ShardRangeList(other)
+        paths.append(path)
+        return path
+
+    # we need to keep track of every path that ends at each node so that when
+    # we visit the node we can extend those paths, or clones of them, with the
+    # edges that leave the node
+    paths_to_node = collections.defaultdict(list)
+
+    # visit the nodes in ascending order by name...
+    for node, edges in sorted(node_successors.items()):
+        if not edges:
+            # this node is a dead-end, so there's no path updates to make
             continue
+        if not paths_to_node[node]:
+            # this is either the first node to be visited, or it has no paths
+            # leading to it, so we need to start a new path here
+            paths_to_node[node].append(clone_path([]))
+        for path_to_node in paths_to_node[node]:
+            # extend each path that arrives at this node with all of the
+            # possible edges that leave the node; if more than edge leaves the
+            # node then we will make clones of the path to the node and extend
+            # those clones, adding to the collection of all paths though the
+            # shards
+            for i, edge in enumerate(edges):
+                if i == len(edges) - 1:
+                    # the last edge is used to extend the original path to the
+                    # node; there is nothing special about the last edge, but
+                    # doing this last means the original path to the node can
+                    # be cloned for all other edges before being modified here
+                    path = path_to_node
+                else:
+                    # for all but one of the edges leaving the node we need to
+                    # make a clone the original path
+                    path = clone_path(path_to_node)
+                # extend the path with the edge
+                path.append(edge)
+                # keep track of which node this path now arrives at
+                paths_to_node[edge.upper].append(path)
+    return paths

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+def rank_paths(paths, shard_range_to_span):
+    """
+    Sorts the given list of paths such that the most preferred path is the
+    first item in the list.
+
+    :param paths: A list of :class:`~swift.common.utils.ShardRangeList`.
+    :param shard_range_to_span: An instance of
+        :class:`~swift.common.utils.ShardRange` that describes the namespace
+        that would ideally be spanned by a path. Paths that include this
+        namespace will be preferred over those that do not.
+    :return: A sorted list of :class:`~swift.common.utils.ShardRangeList`.
+    """
+    def sort_key(path):
+        # defines the order of preference for paths through shards
+        return (
+            # complete path for the namespace
+            path.includes(shard_range_to_span),
+            # most cleaving progress
+            path.find_lower(lambda sr: sr.state not in (
+                ShardRange.CLEAVED, ShardRange.ACTIVE)),
+            # largest object count
+            path.object_count,
+            # fewest timestamps
+            -1 * len(path.timestamps),
+            # newest timestamp
+            sorted(path.timestamps)[-1]
+        )
+
+    paths.sort(key=sort_key, reverse=True)
+    return paths
+
+
+def combine_shard_ranges(new_shard_ranges, existing_shard_ranges):
+    """
+    Combines new and existing shard ranges based on most recent state.
+
+    :param new_shard_ranges: a list of ShardRange instances.
+    :param existing_shard_ranges: a list of ShardRange instances.
+    :return: a list of ShardRange instances.
+    """
+    new_shard_ranges = [dict(sr) for sr in new_shard_ranges]
+    existing_shard_ranges = [dict(sr) for sr in existing_shard_ranges]
+    to_add, to_delete = sift_shard_ranges(
+        new_shard_ranges,
+        dict((sr['name'], sr) for sr in existing_shard_ranges))
+    result = [ShardRange.from_dict(existing)
+              for existing in existing_shard_ranges
+              if existing['name'] not in to_delete]
+    result.extend([ShardRange.from_dict(sr) for sr in to_add])
+    return sorted([sr for sr in result if not sr.deleted],
+                  key=ShardRange.sort_key)
+
+
+def update_own_shard_range_stats(broker, own_shard_range):
+    """
+    Update the ``own_shard_range`` with the up-to-date object stats from
+    the ``broker``.
+
+    Note: this method does not persist the updated ``own_shard_range``;
+    callers should use ``broker.merge_shard_ranges`` if the updated stats
+    need to be persisted.
+
+    :param broker: an instance of ``ContainerBroker``.
+    :param own_shard_range: and instance of ``ShardRange``.
+    :returns: ``own_shard_range`` with up-to-date ``object_count``
+        and ``bytes_used``.
+    """
+    info = broker.get_info()
+    own_shard_range.update_meta(
+        info['object_count'], info['bytes_used'])
+    return own_shard_range


 class CleavingContext(object):
+    """
+    Encapsulates metadata associated with the process of cleaving a retiring
+    DB. This metadata includes:
+
+    * ``ref``: The unique part of the key that is used when persisting a
+      serialized ``CleavingContext`` as sysmeta in the DB. The unique part of
+      the key is based off the DB id. This ensures that each context is
+      associated with a specific DB file. The unique part of the key is
+      included in the ``CleavingContext`` but should not be modified by any
+      caller.
+
+    * ``cursor``: the upper bound of the last shard range to have been
+      cleaved from the retiring DB.
+
+    * ``max_row``: the retiring DB's max row; this is updated to the value of
+      the retiring DB's ``max_row`` every time a ``CleavingContext`` is
+      loaded for that DB, and may change during the process of cleaving the
+      DB.
+
+    * ``cleave_to_row``: the value of ``max_row`` at the moment when cleaving
+      starts for the DB. When cleaving completes (i.e. the cleave cursor has
+      reached the upper bound of the cleaving namespace), ``cleave_to_row``
+      is compared to the current ``max_row``: if the two values are not equal
+      then rows have been added to the DB which may not have been cleaved, in
+      which case the ``CleavingContext`` is ``reset`` and cleaving is
+      re-started.
+
+    * ``last_cleave_to_row``: the minimum DB row from which cleaving should
+      select objects to cleave; this is initially set to None i.e. all rows
+      should be cleaved. If the ``CleavingContext`` is ``reset`` then the
+      ``last_cleave_to_row`` is set to the current value of
+      ``cleave_to_row``, which in turn is set to the current value of
+      ``max_row`` by a subsequent call to ``start``. The repeated cleaving
+      therefore only selects objects in rows greater than the
+      ``last_cleave_to_row``, rather than cleaving the whole DB again.
+
+    * ``ranges_done``: the number of shard ranges that have been cleaved from
+      the retiring DB.
+
+    * ``ranges_todo``: the number of shard ranges that are yet to be
+      cleaved from the retiring DB.
+    """
     def __init__(self, ref, cursor='', max_row=None, cleave_to_row=None,
                  last_cleave_to_row=None, cleaving_done=False,
                  misplaced_done=False, ranges_done=0, ranges_todo=0):
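The new compaction helpers above are designed to be driven as a pair: find candidate sequences first, then hand them to process_compactible_shard_sequences, which is roughly the flow behind the expanded swift-manage-shard-ranges compact command. A sketch, with arbitrary threshold values and an in-memory broker standing in for a real root container DB:

    from swift.common.utils import Timestamp
    from swift.container.backend import ContainerBroker
    from swift.container.sharder import (
        find_compactible_shard_sequences, process_compactible_shard_sequences)

    broker = ContainerBroker(':memory:', account='a', container='c')
    broker.initialize(Timestamp.now().internal, 0)

    sequences = find_compactible_shard_sequences(
        broker,
        shrink_threshold=100000,   # donors must have fewer rows than this
        expansion_limit=500000,    # acceptors must stay below this after merging
        max_shrinking=1,           # at most one donor per acceptor
        max_expanding=-1)          # no limit on the number of acceptors
    if sequences:
        # mark donors SHRINKING, expand the acceptors, and merge both back
        # into the broker; replication then propagates the new states
        process_compactible_shard_sequences(broker, sequences)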
@@ -230,18 +656,13 @@
         return '%s(%s)' % (self.__class__.__name__, ', '.join(
             '%s=%r' % prop for prop in self))

-    def _encode(cls, value):
-        if value is not None and six.PY2 and isinstance(value, six.text_type):
-            return value.encode('utf-8')
-        return value
-
     @property
     def cursor(self):
         return self._cursor

     @cursor.setter
     def cursor(self, value):
-        self._cursor =
+        self._cursor = value

     @property
     def marker(self):
@@ -254,37 +675,33 @@
     @classmethod
     def load_all(cls, broker):
         """
-        Returns all cleaving contexts stored in the broker.
+        Returns all cleaving contexts stored in the broker's DB.

-        :param broker:
+        :param broker: an instance of :class:`ContainerBroker`
         :return: list of tuples of (CleavingContext, timestamp)
         """
         brokers = broker.get_brokers()
         sysmeta = brokers[-1].get_sharding_sysmeta_with_timestamps()

+        contexts = []
         for key, (val, timestamp) in sysmeta.items():
-            # If the value is
+            # If the value is blank, then the metadata is
             # marked for deletion
-            if key.startswith("Context-") and
+            if key.startswith("Context-") and val:
                 try:
-
+                    contexts.append((cls(**json.loads(val)), timestamp))
                 except ValueError:
                     continue
+        return contexts

     @classmethod
     def load(cls, broker):
         """
-        Returns a
-        broker's
-
-
-
-        modified such that its max row changes then a different context, or no
-        context, will be loaded.
-
-        :return: A dict to which cleave progress metadata may be added. The
-            dict initially has a key ``ref`` which should not be modified by
-            any caller.
+        Returns a CleavingContext tracking the cleaving progress of the given
+        broker's DB.
+
+        :param broker: an instances of :class:`ContainerBroker`
+        :return: An instance of :class:`CleavingContext`.
         """
         brokers = broker.get_brokers()
         ref = cls._make_ref(brokers[0])
@@ -295,6 +712,12 @@
         return cls(**data)

     def store(self, broker):
+        """
+        Persists the serialized ``CleavingContext`` as sysmeta in the given
+        broker's DB.
+
+        :param broker: an instances of :class:`ContainerBroker`
+        """
         broker.set_sharding_sysmeta('Context-' + self.ref,
                                     json.dumps(dict(self)))

@@ -316,8 +739,7 @@
     def range_done(self, new_cursor):
         self.ranges_done += 1
         self.ranges_todo -= 1
-
-        self.cursor = new_cursor
+        self.cursor = new_cursor

     def done(self):
         return all((self.misplaced_done, self.cleaving_done,
@@ -329,51 +751,108 @@
         broker.set_sharding_sysmeta('Context-' + self.ref, '')


-
-
-
+class ContainerSharderConf(object):
+    def __init__(self, conf=None):
+        conf = conf if conf else {}

+        def get_val(key, validator, default):
+            """
+            Get a value from conf and validate it.
+
+            :param key: key to lookup value in the ``conf`` dict.
+            :param validator: A function that will passed the value from the
+                ``conf`` dict and should return the value to be set. This
+                function should raise a ValueError if the ``conf`` value if not
+                valid.
+            :param default: value to use if ``key`` is not found in ``conf``.
+            :raises: ValueError if the value read from ``conf`` is invalid.
+            :returns: the configuration value.
+            """
+            try:
+                return validator(conf.get(key, default))
+            except ValueError as err:
+                raise ValueError('Error setting %s: %s' % (key, err))
+
+        self.shard_container_threshold = get_val(
+            'shard_container_threshold', config_positive_int_value, 1000000)
+        self.max_shrinking = get_val(
+            'max_shrinking', int, 1)
+        self.max_expanding = get_val(
+            'max_expanding', int, -1)
+        self.shard_scanner_batch_size = get_val(
+            'shard_scanner_batch_size', config_positive_int_value, 10)
+        self.cleave_batch_size = get_val(
+            'cleave_batch_size', config_positive_int_value, 2)
+        self.cleave_row_batch_size = get_val(
+            'cleave_row_batch_size', config_positive_int_value, 10000)
+        self.broker_timeout = get_val(
+            'broker_timeout', config_positive_int_value, 60)
+        self.recon_candidates_limit = get_val(
+            'recon_candidates_limit', int, 5)
+        self.recon_sharded_timeout = get_val(
+            'recon_sharded_timeout', int, 43200)
+        self.container_sharding_timeout = get_val(
+            'container_sharding_timeout', int, 172800)
+        self.conn_timeout = get_val(
+            'conn_timeout', float, 5)
+        self.auto_shard = get_val(
+            'auto_shard', config_true_value, False)
+        # deprecated percent options still loaded...
+        self.shrink_threshold = get_val(
+            'shard_shrink_point', self.percent_of_threshold, 10)
+        self.expansion_limit = get_val(
+            'shard_shrink_merge_point', self.percent_of_threshold, 75)
+        # ...but superseded by absolute options if present in conf
+        self.shrink_threshold = get_val(
+            'shrink_threshold', int, self.shrink_threshold)
+        self.expansion_limit = get_val(
+            'expansion_limit', int, self.expansion_limit)
+        self.rows_per_shard = get_val(
+            'rows_per_shard', config_positive_int_value,
+            max(self.shard_container_threshold // 2, 1))
+        self.minimum_shard_size = get_val(
+            'minimum_shard_size', config_positive_int_value,
+            max(self.rows_per_shard // 5, 1))
+
+    def percent_of_threshold(self, val):
+        return int(config_percent_value(val) * self.shard_container_threshold)

-
+    @classmethod
+    def validate_conf(cls, namespace):
+        ops = {'<': operator.lt,
+               '<=': operator.le}
+        checks = (('minimum_shard_size', '<=', 'rows_per_shard'),
+                  ('shrink_threshold', '<=', 'minimum_shard_size'),
+                  ('rows_per_shard', '<', 'shard_container_threshold'),
+                  ('expansion_limit', '<', 'shard_container_threshold'))
+        for key1, op, key2 in checks:
+            try:
+                val1 = getattr(namespace, key1)
+                val2 = getattr(namespace, key2)
+            except AttributeError:
+                # swift-manage-shard-ranges uses a subset of conf options for
+                # each command so only validate those actually in the namespace
+                continue
+            if not ops[op](val1, val2):
+                raise ValueError('%s (%d) must be %s %s (%d)'
+                                 % (key1, val1, op, key2, val2))
+
+
+DEFAULT_SHARDER_CONF = vars(ContainerSharderConf())
+
+
+class ContainerSharder(ContainerSharderConf, ContainerReplicator):
     """Shards containers."""
+    log_route = 'container-sharder'

     def __init__(self, conf, logger=None):
-        logger = logger or get_logger(conf, log_route=
-
-        self
-
-
-        def percent_value(key, default):
-            try:
-                value = conf.get(key, default)
-                return config_float_value(value, 0, 100) / 100.0
-            except ValueError as err:
-                raise ValueError("%s: %s" % (str(err), key))
-
-        self.shard_shrink_point = percent_value('shard_shrink_point',
-                                                DEFAULT_SHARD_SHRINK_POINT)
-        self.shrink_merge_point = percent_value('shard_shrink_merge_point',
-                                                DEFAULT_SHARD_MERGE_POINT)
-        self.shard_container_threshold = config_positive_int_value(
-            conf.get('shard_container_threshold',
-                     DEFAULT_SHARD_CONTAINER_THRESHOLD))
-        self.shrink_size = (self.shard_container_threshold *
-                            self.shard_shrink_point)
-        self.merge_size = (self.shard_container_threshold *
-                           self.shrink_merge_point)
-        self.split_size = self.shard_container_threshold // 2
-        self.scanner_batch_size = config_positive_int_value(
-            conf.get('shard_scanner_batch_size', 10))
-        self.cleave_batch_size = config_positive_int_value(
-            conf.get('cleave_batch_size', 2))
-        self.cleave_row_batch_size = config_positive_int_value(
-            conf.get('cleave_row_batch_size', 10000))
-        self.auto_shard = config_true_value(conf.get('auto_shard', False))
+        logger = logger or get_logger(conf, log_route=self.log_route)
+        ContainerReplicator.__init__(self, conf, logger=logger)
+        ContainerSharderConf.__init__(self, conf)
+        ContainerSharderConf.validate_conf(self)
+        self.shards_account_prefix = (AUTO_CREATE_ACCOUNT_PREFIX + 'shards_')
         self.sharding_candidates = []
-        self.
-            conf.get('recon_candidates_limit', 5))
-        self.broker_timeout = config_positive_int_value(
-            conf.get('broker_timeout', 60))
+        self.shrinking_candidates = []
         replica_count = self.ring.replica_count
         quorum = quorum_size(replica_count)
         self.shard_replication_quorum = config_auto_int_value(
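ContainerSharderConf pulls all of the sharder tunables out of the daemon class so they can be loaded, defaulted and cross-checked independently of a running ContainerSharder (the diff notes that swift-manage-shard-ranges reuses it for a subset of options). A small sketch with made-up values:

    from swift.container.sharder import ContainerSharderConf, DEFAULT_SHARDER_CONF

    conf = ContainerSharderConf({
        'shard_container_threshold': '500000',
        'rows_per_shard': '250000',
        'shrink_threshold': '25000',
    })
    # raises ValueError if, for example, rows_per_shard were not less than
    # shard_container_threshold or shrink_threshold exceeded minimum_shard_size
    ContainerSharderConf.validate_conf(conf)

    # the module-level defaults are exposed as a plain dict
    print(DEFAULT_SHARDER_CONF['shard_container_threshold'])  # 1000000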
@@ -395,7 +874,6 @@ class ContainerSharder(ContainerReplicator):
         self.existing_shard_replication_quorum = replica_count

         # internal client
-        self.conn_timeout = float(conf.get('conn_timeout', 5))
         request_tries = config_positive_int_value(
             conf.get('request_tries', 3))
         internal_client_conf_path = conf.get('internal_client_conf_path',
@@ -405,7 +883,9 @@
                 internal_client_conf_path,
                 'Swift Container Sharder',
                 request_tries,
-
+                use_replication_network=True,
+                global_conf={'log_name': '%s-ic' % conf.get(
+                    'log_name', self.log_route)})
         except (OSError, IOError) as err:
             if err.errno != errno.ENOENT and \
                     not str(err).endswith(' not found'):
@@ -413,7 +893,67 @@
             raise SystemExit(
                 'Unable to load internal client from config: %r (%s)' %
                 (internal_client_conf_path, err))
+        self.stats_interval = float(conf.get('stats_interval', '3600'))
         self.reported = 0
+        self.periodic_warnings_interval = float(
+            conf.get('periodic_warnings_interval',
+                     DEFAULT_PERIODIC_WARNINGS_INTERVAL))
+        self.periodic_warnings_start = time.time()
+        self.periodic_warnings = set()
+
+    def _get_broker_details(self, broker):
+        try:
+            db_file = broker.db_file
+        except Exception:  # noqa
+            db_file = ''
+        try:
+            path = broker.path
+        except Exception:  # noqa
+            path = ''
+        return db_file, path
+
+    def _format_log_msg(self, broker, msg, *args):
+        # make best effort to include broker properties...
+        db_file, path = self._get_broker_details(broker)
+        if args:
+            msg = msg % args
+        return '%s, path: %s, db: %s' % (msg, quote(path), db_file)
+
+    def _log(self, level, broker, msg, *args):
+        if not self.logger.isEnabledFor(level):
+            return
+
+        self.logger.log(level, self._format_log_msg(broker, msg, *args))
+
+    def debug(self, broker, msg, *args, **kwargs):
+        self._log(logging.DEBUG, broker, msg, *args, **kwargs)
+
+    def info(self, broker, msg, *args, **kwargs):
+        self._log(logging.INFO, broker, msg, *args, **kwargs)
+
+    def warning(self, broker, msg, *args, **kwargs):
+        self._log(logging.WARNING, broker, msg, *args, **kwargs)
+
+    def periodic_warning(self, broker, msg, *args, **kwargs):
+        now = time.time()
+        if now - self.periodic_warnings_start >= \
+                self.periodic_warnings_interval:
+            self.periodic_warnings.clear()
+            self.periodic_warnings_start = now
+
+        db_file, path = self._get_broker_details(broker)
+        key = (db_file, msg)
+        if key not in self.periodic_warnings:
+            self.periodic_warnings.add(key)
+            self._log(logging.WARNING, broker, msg, *args, **kwargs)
+
+    def error(self, broker, msg, *args, **kwargs):
+        self._log(logging.ERROR, broker, msg, *args, **kwargs)
+
+    def exception(self, broker, msg, *args, **kwargs):
+        if not self.logger.isEnabledFor(logging.ERROR):
+            return
+        self.logger.exception(self._format_log_msg(broker, msg, *args))

     def _zero_stats(self):
         """Zero out the stats."""
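The broker-aware logging helpers above replace direct self.logger calls so that every message carries the container path and DB file. A sketch of the call pattern as it would appear inside a ContainerSharder method (the message text and counts here are hypothetical; `self` and `broker` come from the surrounding method, e.g. while processing a broker):

    # logs "Starting to cleave (10 todo), path: a/c, db: /srv/.../hash.db"
    self.info(broker, 'Starting to cleave (%s todo)', 10)

    # periodic_warning() de-duplicates on (db_file, msg): repeating the same
    # warning for the same DB within periodic_warnings_interval (24h by
    # default) emits it only once
    for _ in range(3):
        self.periodic_warning(broker, 'Failed to get shard ranges from root')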
@@ -422,6 +962,7 @@ class ContainerSharder(ContainerReplicator):
         # stats are maintained under the 'sharding' key in self.stats
         self.stats['sharding'] = defaultdict(lambda: defaultdict(int))
         self.sharding_candidates = []
+        self.shrinking_candidates = []

     def _append_stat(self, category, key, value):
         if not self.stats['sharding'][category][key]:
@@ -442,11 +983,15 @@ class ContainerSharder(ContainerReplicator):
         else:
             self.stats['sharding'][category][key] = max(current, value)

-    def _increment_stat(self, category, key,
-        self.
-
-
-
+    def _increment_stat(self, category, key, statsd=False):
+        self._update_stat(category, key, step=1, statsd=statsd)
+
+    def _update_stat(self, category, key, step=1, statsd=False):
+        if step:
+            self.stats['sharding'][category][key] += step
+        if statsd:
+            statsd_key = '%s_%s' % (category, key)
+            self.logger.update_stats(statsd_key, step)

     def _make_stats_info(self, broker, node, own_shard_range):
         try:
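The generalised _update_stat above lets callers bump a recon counter by an arbitrary step and optionally emit a StatsD counter named '<category>_<key>' through the logger. Illustrative call-site fragments as they would appear inside ContainerSharder methods (the step value is hypothetical):

    self._increment_stat('visited', 'success', statsd=True)      # emits 'visited_success'
    self._update_stat('cleaved', 'attempted', step=3, statsd=True)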
@@ -465,40 +1010,90 @@ class ContainerSharder(ContainerReplicator):

     def _identify_sharding_candidate(self, broker, node):
         own_shard_range = broker.get_own_shard_range()
+        update_own_shard_range_stats(broker, own_shard_range)
         if is_sharding_candidate(
                 own_shard_range, self.shard_container_threshold):
             self.sharding_candidates.append(
                 self._make_stats_info(broker, node, own_shard_range))

-    def
-
-
+    def _identify_shrinking_candidate(self, broker, node):
+        sequences = find_compactible_shard_sequences(
+            broker, self.shrink_threshold, self.expansion_limit,
+            self.max_shrinking, self.max_expanding)
+        # compactible_ranges are all apart from final acceptor in each sequence
+        compactible_ranges = sum(len(seq) - 1 for seq in sequences)
+
+        if compactible_ranges:
+            own_shard_range = broker.get_own_shard_range()
+            update_own_shard_range_stats(broker, own_shard_range)
+            shrink_candidate = self._make_stats_info(
+                broker, node, own_shard_range)
+            # The number of ranges/donors that can be shrunk if the
+            # tool is used with the current max_shrinking, max_expanding
+            # settings.
+            shrink_candidate['compactible_ranges'] = compactible_ranges
+            self.shrinking_candidates.append(shrink_candidate)
+
+    def _transform_candidate_stats(self, category, candidates, sort_keys):
         category['found'] = len(candidates)
-        candidates.sort(key=
+        candidates.sort(key=itemgetter(*sort_keys), reverse=True)
         if self.recon_candidates_limit >= 0:
             category['top'] = candidates[:self.recon_candidates_limit]
         else:
             category['top'] = candidates

     def _record_sharding_progress(self, broker, node, error):
+        db_state = broker.get_db_state()
+        if db_state not in (UNSHARDED, SHARDING, SHARDED):
+            return
         own_shard_range = broker.get_own_shard_range()
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        if own_shard_range.state not in ShardRange.CLEAVING_STATES:
+            return
+
+        if db_state == SHARDED:
+            contexts = CleavingContext.load_all(broker)
+            if not contexts:
+                return
+            context_ts = max(float(ts) for c, ts in contexts)
+            if context_ts + self.recon_sharded_timeout \
+                    < float(Timestamp.now()):
+                # last context timestamp too old for the
+                # broker to be recorded
+                return
+
+        update_own_shard_range_stats(broker, own_shard_range)
+        info = self._make_stats_info(broker, node, own_shard_range)
+        info['state'] = own_shard_range.state_text
+        info['db_state'] = broker.get_db_state()
+        states = [ShardRange.FOUND, ShardRange.CREATED,
+                  ShardRange.CLEAVED, ShardRange.ACTIVE]
+        shard_ranges = broker.get_shard_ranges(states=states)
+        state_count = {}
+        for state in states:
+            state_count[ShardRange.STATES[state]] = 0
+        for shard_range in shard_ranges:
+            state_count[shard_range.state_text] += 1
+        info.update(state_count)
+        info['error'] = error and str(error)
+        self._append_stat('sharding_in_progress', 'all', info)
+
+        if broker.sharding_required() and (
+                own_shard_range.epoch is not None) and (
+                    float(own_shard_range.epoch) +
+                    self.container_sharding_timeout <
+                    time.time()):
+            # Note: There is no requirement that own_shard_range.epoch equals
+            # the time at which the own_shard_range was merged into the
+            # container DB, which predicates sharding starting. But s-m-s-r and
+            # auto-sharding do set epoch and then merge, so we use it to tell
+            # whether sharding has been taking too long or not.
+            self.warning(
+                broker, 'Cleaving has not completed in %.2f seconds since %s. '
+                'DB state: %s, own_shard_range state: %s, state count of '
+                'shard ranges: %s' %
+                (time.time() - float(own_shard_range.epoch),
+                 own_shard_range.epoch.isoformat, db_state,
+                 own_shard_range.state_text, str(state_count)))

     def _report_stats(self):
         # report accumulated stats since start of one sharder cycle
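`_record_sharding_progress` above tallies a container's shard ranges per state before reporting them via recon. A small sketch of that per-state tally, using plain dicts rather than ShardRange objects (the state strings below are assumptions for illustration):

```python
from collections import Counter

# illustrative state names; Swift's ShardRange uses numeric constants
# exposed via ShardRange.STATES and .state_text
TRACKED_STATES = ('found', 'created', 'cleaved', 'active')


def count_by_state(shard_ranges):
    """Return a dict mapping each tracked state to its occurrence count,
    including zero counts for states that are not present."""
    counts = Counter({state: 0 for state in TRACKED_STATES})
    counts.update(sr['state'] for sr in shard_ranges
                  if sr['state'] in TRACKED_STATES)
    return dict(counts)


# Example:
# count_by_state([{'state': 'cleaved'}, {'state': 'active'},
#                 {'state': 'active'}])
# -> {'found': 0, 'created': 0, 'cleaved': 1, 'active': 2}
```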
@@ -509,7 +1104,7 @@ class ContainerSharder(ContainerReplicator):
             ('created', default_stats),
             ('cleaved', default_stats + ('min_time', 'max_time',)),
             ('misplaced', default_stats + ('found', 'placed', 'unplaced')),
-            ('audit_root', default_stats),
+            ('audit_root', default_stats + ('has_overlap', 'num_overlap')),
             ('audit_shard', default_stats),
         )

@@ -522,7 +1117,16 @@ class ContainerSharder(ContainerReplicator):
             msg = ' '.join(['%s:%s' % (k, str(stats[k])) for k in keys])
             self.logger.info('Since %s %s - %s', last_report, category, msg)

-
+        # transform the sharding and shrinking candidate states
+        # first sharding
+        category = self.stats['sharding']['sharding_candidates']
+        self._transform_candidate_stats(category, self.sharding_candidates,
+                                        sort_keys=('object_count',))
+
+        # next shrinking
+        category = self.stats['sharding']['shrinking_candidates']
+        self._transform_candidate_stats(category, self.shrinking_candidates,
+                                        sort_keys=('compactible_ranges',))

         dump_recon_cache(
             {'sharding_stats': self.stats,
@@ -532,7 +1136,7 @@ class ContainerSharder(ContainerReplicator):
         self.reported = now

     def _periodic_report_stats(self):
-        if (time.time() - self.reported) >=
+        if (time.time() - self.reported) >= self.stats_interval:
             self._report_stats()

     def _check_node(self, node):
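`_transform_candidate_stats`, called from `_report_stats` above, sorts the collected candidates and trims them to a top-N list for the recon cache. A sketch of that shaping step (function name and dict layout are illustrative):

```python
from operator import itemgetter


def top_candidates(candidates, sort_keys, limit):
    """Sort candidate dicts descending by sort_keys and keep the top N.

    A negative limit means 'no limit', mirroring how a negative
    recon_candidates_limit is treated.
    """
    ordered = sorted(candidates, key=itemgetter(*sort_keys), reverse=True)
    return {'found': len(candidates),
            'top': ordered if limit < 0 else ordered[:limit]}


# Example:
# top_candidates([{'object_count': 5}, {'object_count': 9}],
#                sort_keys=('object_count',), limit=1)
# -> {'found': 2, 'top': [{'object_count': 9}]}
```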
@@ -560,65 +1164,67 @@ class ContainerSharder(ContainerReplicator):
         params = params or {}
         params.setdefault('format', 'json')
         headers = {'X-Backend-Record-Type': 'shard',
+                   'X-Backend-Record-Shard-Format': 'full',
                    'X-Backend-Override-Deleted': 'true',
                    'X-Backend-Include-Deleted': str(include_deleted)}
         if newest:
             headers['X-Newest'] = 'true'
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-                quote(broker.root_path), err)
-            return None
-
-        try:
-            data = json.loads(resp.body)
-            if not isinstance(data, list):
-                raise ValueError('not a list')
-            return [ShardRange.from_dict(shard_range)
-                    for shard_range in data]
-        except (ValueError, TypeError, KeyError) as err:
-            self.logger.error(
-                "Failed to get shard ranges from %s: invalid data: %r",
-                quote(broker.root_path), err)
+            resp = self.int_client.make_request(
+                'GET', path, headers, acceptable_statuses=(2,),
+                params=params)
+        except internal_client.UnexpectedResponse as err:
+            self.warning(broker, "Failed to get shard ranges from %s: %s",
+                         quote(broker.root_path), err)
+            return None
+        record_type = resp.headers.get('x-backend-record-type')
+        if record_type != 'shard':
+            err = 'unexpected record type %r' % record_type
+            self.error(broker, "Failed to get shard ranges from %s: %s",
+                       quote(broker.root_path), err)
             return None
-        finally:
-            self.logger.txn_id = None

-
+        try:
+            data = json.loads(resp.body)
+            if not isinstance(data, list):
+                raise ValueError('not a list')
+            return [ShardRange.from_dict(shard_range)
+                    for shard_range in data]
+        except (ValueError, TypeError, KeyError) as err:
+            self.error(broker,
+                       "Failed to get shard ranges from %s: invalid data: %r",
+                       quote(broker.root_path), err)
+            return None
+
+    def _put_container(self, broker, node, part, account, container, headers,
+                       body):
         try:
             direct_put_container(node, part, account, container,
                                  conn_timeout=self.conn_timeout,
                                  response_timeout=self.node_timeout,
                                  headers=headers, contents=body)
         except DirectClientException as err:
-            self.
-
-
+            self.warning(broker,
+                         'Failed to put shard ranges to %s %s/%s: %s',
+                         node_to_string(node, replication=True),
+                         quote(account), quote(container), err.http_status)
         except (Exception, Timeout) as err:
-            self.
-
-
+            self.exception(broker,
+                           'Failed to put shard ranges to %s %s/%s: %s',
+                           node_to_string(node, replication=True),
+                           quote(account), quote(container), err)
         else:
             return True
         return False

-    def _send_shard_ranges(self, account, container, shard_ranges,
+    def _send_shard_ranges(self, broker, account, container, shard_ranges,
                            headers=None):
         body = json.dumps([dict(sr, reported=0)
                            for sr in shard_ranges]).encode('ascii')
         part, nodes = self.ring.get_nodes(account, container)
         headers = headers or {}
         headers.update({'X-Backend-Record-Type': RECORD_TYPE_SHARD,
+                        USE_REPLICATION_NETWORK_HEADER: 'True',
                         'User-Agent': 'container-sharder %s' % os.getpid(),
                         'X-Timestamp': Timestamp.now().normal,
                         'Content-Length': len(body),
@@ -626,7 +1232,7 @@ class ContainerSharder(ContainerReplicator):

         pool = GreenAsyncPile(len(nodes))
         for node in nodes:
-            pool.spawn(self._put_container, node, part, account,
+            pool.spawn(self._put_container, broker, node, part, account,
                        container, headers, body)

         results = pool.waitall(None)
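`_fetch_shard_ranges` above now validates the backend response before trusting it: the `X-Backend-Record-Type` header must say `shard` and the body must be a JSON list. A hedged sketch of just that validate-and-parse step, working on plain dicts instead of Swift's internal client response object:

```python
import json


def parse_shard_range_listing(resp_body, resp_headers):
    """Validate a backend shard-range listing response.

    Sketch only: returns a list of plain dicts rather than ShardRange
    instances, and raises ValueError instead of logging and returning
    None as the sharder does.
    """
    record_type = resp_headers.get('x-backend-record-type')
    if record_type != 'shard':
        raise ValueError('unexpected record type %r' % record_type)
    data = json.loads(resp_body)
    if not isinstance(data, list):
        raise ValueError('not a list')
    # each item is expected to be a dict describing one shard range
    return data


# Example:
# parse_shard_range_listing('[{"name": ".shards_a/c-0"}]',
#                           {'x-backend-record-type': 'shard'})
# -> [{'name': '.shards_a/c-0'}]
```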
@@ -642,20 +1248,19 @@ class ContainerSharder(ContainerReplicator):
         :param shard_range: a :class:`~swift.common.utils.ShardRange`
         :param root_path: the path of the shard's root container
         :param policy_index: the storage policy index
-        :returns: a tuple of ``(part, broker, node_id)`` where
-            shard container's partition,
+        :returns: a tuple of ``(part, broker, node_id, put_timestamp)`` where
+            ``part`` is the shard container's partition,
+            ``broker`` is an instance of
             :class:`~swift.container.backend.ContainerBroker`,
-            ``node_id`` is the id of the selected node
+            ``node_id`` is the id of the selected node,
+            ``put_timestamp`` is the put_timestamp if the broker needed to
+            be initialized.
         """
         part = self.ring.get_part(shard_range.account, shard_range.container)
         node = self.find_local_handoff_for_part(part)
-        put_timestamp = Timestamp.now().internal
-        if not node:
-            raise DeviceUnavailable(
-                'No mounted devices found suitable for creating shard broker '
-                'for %s in partition %s' % (quote(shard_range.name), part))

-
+        put_timestamp = Timestamp.now().internal
+        shard_broker, initialized = ContainerBroker.create_broker(
             os.path.join(self.root, node['device']), part, shard_range.account,
             shard_range.container, epoch=shard_range.epoch,
             storage_policy_index=policy_index, put_timestamp=put_timestamp)
@@ -675,6 +1280,7 @@ class ContainerSharder(ContainerReplicator):
             'X-Container-Sysmeta-Sharding':
                 ('True', Timestamp.now().internal)})

+        put_timestamp = put_timestamp if initialized else None
         return part, shard_broker, node['id'], put_timestamp

     def _audit_root_container(self, broker):
@@ -684,121 +1290,196 @@ class ContainerSharder(ContainerReplicator):
         warnings = []
         own_shard_range = broker.get_own_shard_range()

-        if own_shard_range.state in
+        if own_shard_range.state in ShardRange.SHARDING_STATES:
             shard_ranges = [sr for sr in broker.get_shard_ranges()
                             if sr.state != ShardRange.SHRINKING]
-
-            if
+            paths_with_gaps = find_paths_with_gaps(shard_ranges)
+            if paths_with_gaps:
                 warnings.append(
                     'missing range(s): %s' %
-                    ' '.join(['%s-%s' % (lower, upper)
-                              for
+                    ' '.join(['%s-%s' % (gap.lower, gap.upper)
+                              for (_, gap, _) in paths_with_gaps]))

         for state in ShardRange.STATES:
             if state == ShardRange.SHRINKING:
                 # Shrinking is how we resolve overlaps; we've got to
                 # allow multiple shards in that state
                 continue
-            shard_ranges = broker.get_shard_ranges(states=state)
-            overlaps
-
+            shard_ranges = broker.get_shard_ranges(states=[state])
+            # Transient overlaps can occur during the period immediately after
+            # sharding if a root learns about new child shards before it learns
+            # that the parent has sharded. These overlaps are normally
+            # corrected as an up-to-date version of the parent shard range is
+            # replicated to the root. Parent-child overlaps are therefore
+            # ignored for a reclaim age after the child was created. After
+            # that, parent-child overlaps may indicate that there is
+            # permanently stale parent shard range data, perhaps from a node
+            # that has been offline, so these are reported.
+            overlaps = find_overlapping_ranges(
+                shard_ranges, exclude_parent_child=True,
+                time_period=self.reclaim_age)
+            if overlaps:
+                self._increment_stat('audit_root', 'has_overlap')
+                self._update_stat('audit_root', 'num_overlap',
+                                  step=len(overlaps))
+                all_overlaps = ', '.join(
+                    [' '.join(['%s-%s' % (sr.lower, sr.upper)
+                               for sr in overlapping_ranges])
+                     for overlapping_ranges in sorted(list(overlaps))])
                 warnings.append(
-                    'overlapping ranges in state %
-                    (ShardRange.STATES[state],
-
-
+                    'overlapping ranges in state %r: %s' %
+                    (ShardRange.STATES[state], all_overlaps))
+
+        # We've seen a case in production where the roots own_shard_range
+        # epoch is reset to None, and state set to ACTIVE (like re-defaulted)
+        # Epoch it important to sharding so we want to detect if this happens
+        # 1. So we can alert, and 2. to see how common it is.
+        if own_shard_range.epoch is None and broker.db_epoch:
+            warnings.append('own_shard_range reset to None should be %s'
+                            % broker.db_epoch)

         if warnings:
-            self.
-
-                broker.db_file, quote(broker.path), ', '.join(warnings))
+            self.warning(broker, 'Audit failed for root: %s',
+                         ', '.join(warnings))
             self._increment_stat('audit_root', 'failure', statsd=True)
             return False

         self._increment_stat('audit_root', 'success', statsd=True)
         return True

-    def
-
-
-
-
-
-                            self.shards_account_prefix)
-
-        own_shard_range = broker.get_own_shard_range(no_default=True)
-
-        shard_ranges = own_shard_range_from_root = None
-        if own_shard_range:
-            # Get the root view of the world, at least that part of the world
-            # that overlaps with this shard's namespace
-            shard_ranges = self._fetch_shard_ranges(
-                broker, newest=True,
-                params={'marker': str_to_wsgi(own_shard_range.lower_str),
-                        'end_marker': str_to_wsgi(own_shard_range.upper_str)},
-                include_deleted=True)
-            if shard_ranges:
-                for shard_range in shard_ranges:
-                    # look for this shard range in the list of shard ranges
-                    # received from root; the root may have different lower and
-                    # upper bounds for this shard (e.g. if this shard has been
-                    # expanded in the root to accept a shrinking shard) so we
-                    # only match on name.
-                    if shard_range.name == own_shard_range.name:
-                        own_shard_range_from_root = shard_range
-                        break
-                else:
-                    # this is not necessarily an error - some replicas of the
-                    # root may not yet know about this shard container
-                    warnings.append('root has no matching shard range')
-            elif not own_shard_range.deleted:
-                warnings.append('unable to get shard ranges from root')
-            # else, our shard range is deleted, so root may have reclaimed it
-        else:
-            errors.append('missing own shard range')
-
-        if warnings:
-            self.logger.warning(
-                'Audit warnings for shard %s (%s): %s',
-                broker.db_file, quote(broker.path), ', '.join(warnings))
-
-        if errors:
-            self.logger.warning(
-                'Audit failed for shard %s (%s) - skipping: %s',
-                broker.db_file, quote(broker.path), ', '.join(errors))
-            self._increment_stat('audit_shard', 'failure', statsd=True)
-            return False
+    def _merge_shard_ranges_from_root(self, broker, shard_ranges,
+                                      own_shard_range):
+        """
+        Merge appropriate items from the given ``shard_ranges`` into the
+        ``broker``. The selection of items that are merged will depend upon the
+        state of the shard.

-
-
-
-
-
-
-
-
-
-        if
-
-
-
-
-
-
-
-
-
-
-
-            #
-            #
-            #
-
-
-
-
-
+        :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+        :param shard_ranges: A list of instances of
+            :class:`~swift.common.utils.ShardRange` describing the shard ranges
+            fetched from the root container.
+        :param own_shard_range: A :class:`~swift.common.utils.ShardRange`
+            describing the shard's own shard range.
+        :return: a tuple of ``own_shard_range, own_shard_range_from_root``. The
+            returned``own_shard_range`` will have been updated if the matching
+            ``own_shard_range_from_root`` has newer data.
+            ``own_shard_range_from_root`` will be None if no such matching
+            shard range is found in ``shard_ranges``.
+        """
+        own_shard_range_from_root = None
+        children_shard_ranges = []
+        other_shard_ranges = []
+        for shard_range in shard_ranges:
+            # look for this shard range in the list of shard ranges received
+            # from root; the root may have different lower and upper bounds for
+            # this shard (e.g. if this shard has been expanded in the root to
+            # accept a shrinking shard) so we only match on name.
+            if shard_range.name == own_shard_range.name:
+                # If we find our own shard range in the root response, merge
+                # it and reload own shard range (note: own_range_from_root may
+                # not necessarily be 'newer' than the own shard range we
+                # already have, but merging will get us to the 'newest' state)
+                self.debug(broker, 'Updating own shard range from root')
+                own_shard_range_from_root = shard_range
+                broker.merge_shard_ranges(own_shard_range_from_root)
+                orig_own_shard_range = own_shard_range
+                own_shard_range = broker.get_own_shard_range()
+                if (orig_own_shard_range != own_shard_range or
+                        orig_own_shard_range.state != own_shard_range.state):
+                    self.info(broker,
+                              'Updated own shard range from %s to %s',
+                              orig_own_shard_range, own_shard_range)
+            elif shard_range.is_child_of(own_shard_range):
+                children_shard_ranges.append(shard_range)
+            else:
+                other_shard_ranges.append(shard_range)
+
+        if children_shard_ranges and not broker.is_sharded():
+            # Merging shard ranges from the root is only necessary until this
+            # DB is fully cleaved and reaches SHARDED DB state, after which it
+            # is useful for debugging for the set of sub-shards to which a
+            # shards has sharded to be frozen.
+            self.debug(broker, 'Updating %d children shard ranges from root',
+                       len(children_shard_ranges))
+            broker.merge_shard_ranges(children_shard_ranges)
+
+        if (other_shard_ranges
+                and own_shard_range.state in ShardRange.CLEAVING_STATES
+                and not broker.is_sharded()):
+            # Other shard ranges returned from the root may need to be merged
+            # for the purposes of sharding or shrinking this shard:
+            #
+            # Shrinking states: If the up-to-date state is shrinking, the
+            # shards fetched from root may contain shards into which this shard
+            # is to shrink itself. Shrinking is initiated by modifying multiple
+            # neighboring shard range states *in the root*, rather than
+            # modifying a shard directly. We therefore need to learn about
+            # *other* neighboring shard ranges from the root, possibly
+            # including the root itself. We need to include shrunk state too,
+            # because one replica of a shard may already have moved the
+            # own_shard_range state to shrunk while another replica may still
+            # be in the process of shrinking.
+            #
+            # Sharding states: Normally a shard will shard to its own children.
+            # However, in some circumstances a shard may need to shard to other
+            # non-children sub-shards. For example, a shard range repair may
+            # cause a child sub-shard to be deleted and its namespace covered
+            # by another 'acceptor' shard.
+            #
+            # Therefore, if the up-to-date own_shard_range state indicates that
+            # sharding or shrinking is in progress, then other shard ranges
+            # will be merged, with the following caveats: we never expect a
+            # shard to shard to any ancestor shard range including the root,
+            # but containers might ultimately *shrink* to root; we never want
+            # to cleave to a container that is itself sharding or shrinking;
+            # the merged shard ranges should not result in gaps or overlaps in
+            # the namespace of this shard.
+            #
+            # Note: the search for ancestors is guaranteed to find the parent
+            # and root *if they are present*, but if any ancestor is missing
+            # then there is a chance that older generations in the
+            # other_shard_ranges will not be filtered and could be merged. That
+            # is only a problem if they are somehow still in ACTIVE state, and
+            # no overlap is detected, so the ancestor is merged.
+            ancestor_names = [
+                sr.name for sr in own_shard_range.find_ancestors(shard_ranges)]
+            filtered_other_shard_ranges = [
+                sr for sr in other_shard_ranges
+                if (sr.name not in ancestor_names
+                    and (sr.state not in ShardRange.CLEAVING_STATES
+                         or sr.deleted))
+            ]
+            if own_shard_range.state in ShardRange.SHRINKING_STATES:
+                root_shard_range = own_shard_range.find_root(
+                    other_shard_ranges)
+                if (root_shard_range and
+                        root_shard_range.state == ShardRange.ACTIVE):
+                    filtered_other_shard_ranges.append(root_shard_range)
+            existing_shard_ranges = broker.get_shard_ranges()
+            combined_shard_ranges = combine_shard_ranges(
+                filtered_other_shard_ranges, existing_shard_ranges)
+            overlaps = find_overlapping_ranges(combined_shard_ranges)
+            paths_with_gaps = find_paths_with_gaps(
+                combined_shard_ranges, own_shard_range)
+            if not (overlaps or paths_with_gaps):
+                # only merge if shard ranges appear to be *good*
+                self.debug(broker,
+                           'Updating %s other shard range(s) from root',
+                           len(filtered_other_shard_ranges))
+                broker.merge_shard_ranges(filtered_other_shard_ranges)
+
+        return own_shard_range, own_shard_range_from_root
+
+    def _delete_shard_container(self, broker, own_shard_range):
+        """
+        Mark a shard container as deleted if it was sharded or shrunk more than
+        reclaim_age in the past. (The DB file will be removed by the replicator
+        after a further reclaim_age.)

+        :param broker: A :class:`~swift.container.backend.ContainerBroker`.
+        :param own_shard_range: A :class:`~swift.common.utils.ShardRange`
+            describing the shard's own shard range.
+        """
         delete_age = time.time() - self.reclaim_age
         deletable_states = (ShardRange.SHARDED, ShardRange.SHRUNK)
         if (own_shard_range.state in deletable_states and
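Before merging "other" shard ranges learned from the root, `_merge_shard_ranges_from_root` above filters out ancestors and any range that is itself mid-cleave unless it is deleted. A sketch of that filter over plain dicts; the state names and the `CLEAVING_STATES` set below are illustrative stand-ins for Swift's numeric ShardRange states:

```python
# illustrative stand-in for ShardRange.CLEAVING_STATES
CLEAVING_STATES = {'sharding', 'sharded', 'shrinking', 'shrunk'}


def filter_mergeable_ranges(other_ranges, ancestor_names):
    """Drop ancestor ranges and non-deleted ranges that are mid-cleave.

    Mirrors the filtering applied before broker.merge_shard_ranges();
    each range here is a dict with 'name', 'state' and 'deleted' keys.
    """
    return [sr for sr in other_ranges
            if sr['name'] not in ancestor_names
            and (sr['state'] not in CLEAVING_STATES or sr['deleted'])]


# Example: a non-ancestor active acceptor is kept, the root is dropped.
acceptor = {'name': '.shards_a/c-1', 'state': 'active', 'deleted': False}
root = {'name': 'a/c', 'state': 'active', 'deleted': False}
assert filter_mergeable_ranges([acceptor, root],
                               ancestor_names={'a/c'}) == [acceptor]
```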
@@ -806,21 +1487,79 @@ class ContainerSharder(ContainerReplicator):
                 own_shard_range.timestamp < delete_age and
                 broker.empty()):
             broker.delete_db(Timestamp.now().internal)
-            self.
-                broker.db_file, quote(broker.path))
+            self.debug(broker, 'Marked shard container as deleted')

-
-
+    def _do_audit_shard_container(self, broker):
+        warnings = []
+        if not broker.account.startswith(self.shards_account_prefix):
+            warnings.append('account not in shards namespace %r' %
+                            self.shards_account_prefix)
+
+        own_shard_range = broker.get_own_shard_range(no_default=True)
+
+        if not own_shard_range:
+            self.warning(broker, 'Audit failed for shard: missing own shard '
+                                 'range (skipping)')
+            return False, warnings
+
+        # Get the root view of the world, at least that part of the world
+        # that overlaps with this shard's namespace. The
+        # 'states=auditing' parameter will cause the root to include
+        # its own shard range in the response, which is necessary for the
+        # particular case when this shard should be shrinking to the root
+        # container; when not shrinking to root, but to another acceptor,
+        # the root range should be in sharded state and will not interfere
+        # with cleaving, listing or updating behaviour.
+        shard_ranges = self._fetch_shard_ranges(
+            broker, newest=True,
+            params={'marker': str_to_wsgi(own_shard_range.lower_str),
+                    'end_marker': str_to_wsgi(own_shard_range.upper_str),
+                    'states': 'auditing'},
+            include_deleted=True)
+        if shard_ranges:
+            own_shard_range, own_shard_range_from_root = \
+                self._merge_shard_ranges_from_root(
+                    broker, shard_ranges, own_shard_range)
+            if not own_shard_range_from_root:
+                # this is not necessarily an error - some replicas of the
+                # root may not yet know about this shard container, or the
+                # shard's own shard range could become deleted and
+                # reclaimed from the root under rare conditions
+                warnings.append('root has no matching shard range')
+        elif not own_shard_range.deleted:
+            warnings.append('unable to get shard ranges from root')
+        # else, our shard range is deleted, so root may have reclaimed it
+
+        self._delete_shard_container(broker, own_shard_range)
+
+        return True, warnings
+
+    def _audit_shard_container(self, broker):
+        self._increment_stat('audit_shard', 'attempted')
+        success, warnings = self._do_audit_shard_container(broker)
+        if warnings:
+            self.warning(broker, 'Audit warnings for shard: %s',
+                         ', '.join(warnings))
+        self._increment_stat(
+            'audit_shard', 'success' if success else 'failure', statsd=True)
+        return success

     def _audit_cleave_contexts(self, broker):
         now = Timestamp.now()
         for context, last_mod in CleavingContext.load_all(broker):
-
-
+            last_mod = Timestamp(last_mod)
+            is_done = context.done() and last_mod.timestamp + \
+                self.recon_sharded_timeout < now.timestamp
+            is_stale = last_mod.timestamp + self.reclaim_age < now.timestamp
+            if is_done or is_stale:
                 context.delete(broker)

     def _audit_container(self, broker):
         if broker.is_deleted():
+            if broker.is_old_enough_to_reclaim(time.time(), self.reclaim_age) \
+                    and not broker.is_empty_enough_to_reclaim():
+                self.periodic_warning(
+                    broker, 'Reclaimable db stuck waiting for shrinking')
             # if the container has been marked as deleted, all metadata will
             # have been erased so no point auditing. But we want it to pass, in
             # case any objects exist inside it.
@@ -830,18 +1569,32 @@ class ContainerSharder(ContainerReplicator):
             return self._audit_root_container(broker)
         return self._audit_shard_container(broker)

-    def yield_objects(self, broker, src_shard_range, since_row=None
+    def yield_objects(self, broker, src_shard_range, since_row=None,
+                      batch_size=None):
         """
-        Iterates through all
-        yielding them in lists of up to
+        Iterates through all object rows in ``src_shard_range`` in name order
+        yielding them in lists of up to ``batch_size`` in length. All batches
+        of rows that are not marked deleted are yielded before all batches of
+        rows that are marked deleted.

         :param broker: A :class:`~swift.container.backend.ContainerBroker`.
         :param src_shard_range: A :class:`~swift.common.utils.ShardRange`
             describing the source range.
-        :param since_row: include only
-            the given row id; by default all rows are included.
-        :
+        :param since_row: include only object rows whose ROWID is greater than
+            the given row id; by default all object rows are included.
+        :param batch_size: The maximum number of object rows to include in each
+            yielded batch; defaults to cleave_row_batch_size.
+        :return: a generator of tuples of (list of rows, broker info dict)
         """
+        if (src_shard_range.lower == ShardRange.MAX or
+                src_shard_range.upper == ShardRange.MIN):
+            # this is an unexpected condition but handled with an early return
+            # just in case, because:
+            # lower == ShardRange.MAX -> marker == ''
+            # which could result in rows being erroneously yielded.
+            return
+
+        batch_size = batch_size or self.cleave_row_batch_size
         for include_deleted in (False, True):
             marker = src_shard_range.lower_str
             while True:
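`yield_objects` above pages through object rows with a name marker, yielding all batches of live rows before any batches of deleted rows. A self-contained sketch of that traversal over an in-memory list (plain dicts stand in for broker rows; bounds are assumed to be ordinary comparable values rather than ShardRange MIN/MAX markers):

```python
def yield_row_batches(rows, lower, upper, batch_size=2):
    """Yield name-ordered rows in (lower, upper] in marker-paged batches,
    un-deleted rows first, then deleted rows."""
    ordered = sorted(rows, key=lambda r: r['name'])
    for include_deleted in (False, True):
        marker = lower
        while True:
            batch = [r for r in ordered
                     if r['deleted'] is include_deleted
                     and marker < r['name'] <= upper][:batch_size]
            if batch:
                yield batch, include_deleted
            if len(batch) < batch_size:
                break
            marker = batch[-1]['name']


# Example: live rows ('a', 'c') are yielded before the deleted row ('b').
rows = [{'name': 'a', 'deleted': False},
        {'name': 'b', 'deleted': True},
        {'name': 'c', 'deleted': False}]
batches = [[r['name'] for r in batch]
           for batch, _ in yield_row_batches(rows, '', 'z')]
assert batches == [['a', 'c'], ['b']]
```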
@@ -849,87 +1602,82 @@ class ContainerSharder(ContainerReplicator):
                 info['max_row'] = broker.get_max_row()
                 start = time.time()
                 objects = broker.get_objects(
-
+                    limit=batch_size,
                     marker=marker,
                     end_marker=src_shard_range.end_marker,
                     include_deleted=include_deleted,
                     since_row=since_row)
+                self.debug(broker, 'got %s rows (deleted=%s) in %ss',
+                           len(objects), include_deleted, time.time() - start)
                 if objects:
-                    self.logger.debug('got %s objects from %s in %ss',
-                                      len(objects), broker.db_file,
-                                      time.time() - start)
                     yield objects, info

-                if len(objects) <
+                if len(objects) < batch_size:
                     break
                 marker = objects[-1]['name']

     def yield_objects_to_shard_range(self, broker, src_shard_range,
                                      dest_shard_ranges):
         """
-        Iterates through all
-        destination shard ranges provided by the ``
-        Yields tuples of (object
-
-
+        Iterates through all object rows in ``src_shard_range`` to place them
+        in destination shard ranges provided by the ``dest_shard_ranges``
+        function. Yields tuples of ``(batch of object rows, destination shard
+        range in which those object rows belong, broker info)``.
+
+        If no destination shard range exists for a batch of object rows then
+        tuples are yielded of ``(batch of object rows, None, broker info)``.
+        This indicates to the caller that there are a non-zero number of object
+        rows for which no destination shard range was found.
+
+        Note that the same destination shard range may be referenced in more
+        than one yielded tuple.

         :param broker: A :class:`~swift.container.backend.ContainerBroker`.
         :param src_shard_range: A :class:`~swift.common.utils.ShardRange`
             describing the source range.
         :param dest_shard_ranges: A function which should return a list of
-            destination shard ranges in
-
-
+            destination shard ranges sorted in the order defined by
+            :meth:`~swift.common.utils.ShardRange.sort_key`.
+        :return: a generator of tuples of ``(object row list, shard range,
+            broker info dict)`` where ``shard_range`` may be ``None``.
         """
-
-
-
-
+        # calling dest_shard_ranges() may result in a request to fetch shard
+        # ranges, so first check that the broker actually has misplaced object
+        # rows in the source namespace
+        for _ in self.yield_objects(broker, src_shard_range, batch_size=1):
+            break
+        else:
+            return

-
-
-
-
-
-
-        if
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            # yield the objects in current dest_shard_range
-            yield (objs[last_index:next_index],
-                   dest_shard_range,
-                   info)
-            last_index = next_index
-            dest_shard_range = next_or_none(dest_shard_range_iter)
-            next_index += 1
-
-        if next_index != last_index:
-            # yield tail of current batch of objects
-            # NB there may be more objects for the current
-            # dest_shard_range in the next batch from yield_objects
-            yield (objs[last_index:next_index],
-                   None if unplaced else dest_shard_range,
-                   info)
+        dest_shard_range_iter = iter(dest_shard_ranges())
+        src_shard_range_marker = src_shard_range.lower
+        for dest_shard_range in dest_shard_range_iter:
+            if dest_shard_range.upper <= src_shard_range.lower:
+                continue
+
+            if dest_shard_range.lower > src_shard_range_marker:
+                # no destination for a sub-namespace of the source namespace
+                sub_src_range = src_shard_range.copy(
+                    lower=src_shard_range_marker, upper=dest_shard_range.lower)
+                for objs, info in self.yield_objects(broker, sub_src_range):
+                    yield objs, None, info
+
+            sub_src_range = src_shard_range.copy(
+                lower=max(dest_shard_range.lower, src_shard_range.lower),
+                upper=min(dest_shard_range.upper, src_shard_range.upper))
+            for objs, info in self.yield_objects(broker, sub_src_range):
+                yield objs, dest_shard_range, info
+
+            src_shard_range_marker = dest_shard_range.upper
+            if dest_shard_range.upper >= src_shard_range.upper:
+                # the entire source namespace has been traversed
+                break
+        else:
+            # dest_shard_ranges_iter was exhausted before reaching the end of
+            # the source namespace
+            sub_src_range = src_shard_range.copy(lower=src_shard_range_marker)
+            for objs, info in self.yield_objects(broker, sub_src_range):
+                yield objs, None, info

     def _post_replicate_hook(self, broker, info, responses):
         # override superclass behaviour
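`yield_objects_to_shard_range` above walks the sorted destination ranges, pairing each slice of the source namespace with its destination and using `None` where no destination covers a sub-namespace. A sketch of the same namespace partitioning over simple `(lower, upper]` tuples (names and tuple layout are illustrative):

```python
def partition_namespace(src_lower, src_upper, destinations):
    """Split (src_lower, src_upper] across sorted destination ranges.

    Returns (lower, upper, dest_name_or_None) slices; None marks a
    sub-namespace with no destination, mirroring the tuples yielded with
    a None shard range.
    """
    slices = []
    marker = src_lower
    for dest_lower, dest_upper, name in destinations:
        if dest_upper <= src_lower:
            continue  # destination lies entirely below the source range
        if dest_lower > marker:
            slices.append((marker, dest_lower, None))  # uncovered gap
        slices.append((max(dest_lower, src_lower),
                       min(dest_upper, src_upper), name))
        marker = dest_upper
        if dest_upper >= src_upper:
            return slices  # source namespace fully covered
    # destinations exhausted before the end of the source namespace
    slices.append((marker, src_upper, None))
    return slices


# Example: the 'm'..'p' gap has no destination.
assert partition_namespace('a', 'z', [('a', 'm', 'shard-1'),
                                      ('p', 'z', 'shard-2')]) == [
    ('a', 'm', 'shard-1'), ('m', 'p', None), ('p', 'z', 'shard-2')]
```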
@@ -939,11 +1687,15 @@ class ContainerSharder(ContainerReplicator):
                               dest_broker, node_id, info):
         success, responses = self._replicate_object(
             part, dest_broker.db_file, node_id)
+        replication_successes = responses.count(True)
         quorum = quorum_size(self.ring.replica_count)
-        if not success and
-            self.
-                'Failed to sufficiently replicate misplaced objects
-
+        if not success and replication_successes < quorum:
+            self.warning(
+                broker, 'Failed to sufficiently replicate misplaced objects '
+                'shard %s in state %s: %s successes, %s required '
+                '(not removing objects), shard db: %s',
+                dest_shard_range.name, dest_shard_range.state_text,
+                replication_successes, quorum, dest_broker.db_file)
             return False

         if broker.get_info()['id'] != info['id']:
@@ -961,9 +1713,9 @@ class ContainerSharder(ContainerReplicator):
             success = True

         if not success:
-            self.
-
-
+            self.warning(broker, 'Refused to remove misplaced objects for '
+                         'dest %s in state %s',
+                         dest_shard_range.name, dest_shard_range.state_text)
         return success

     def _move_objects(self, src_broker, src_shard_range, policy_index,
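The replication checks above count per-node successes against a quorum before allowing misplaced rows to be removed from the source. A tiny sketch of that check; the quorum arithmetic mirrors `quorum_size` as used in the diff but is restated here as an assumption:

```python
def sufficiently_replicated(responses, replica_count):
    """Return True if enough nodes accepted the replicated DB.

    responses is a list of per-node booleans, as returned alongside the
    overall success flag by _replicate_object.
    """
    quorum = replica_count // 2 + 1  # assumed quorum rule
    return responses.count(True) >= quorum


# Examples:
assert sufficiently_replicated([True, False, True], 3) is True
assert sufficiently_replicated([True, False, False], 3) is False
```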
@@ -981,16 +1733,19 @@ class ContainerSharder(ContainerReplicator):
                 continue

             if dest_shard_range.name == src_broker.path:
-                self.
-
+                self.debug(src_broker,
+                           'Skipping source as misplaced objects destination')
                 # in shrinking context, the misplaced objects might actually be
                 # correctly placed if the root has expanded this shard but this
                 # broker has not yet been updated
                 continue

             if dest_shard_range not in dest_brokers:
-                part, dest_broker, node_id,
-
+                part, dest_broker, node_id, put_timestamp = \
+                    self._get_shard_broker(
+                        dest_shard_range, src_broker.root_path, policy_index)
+                stat = 'db_exists' if put_timestamp is None else 'db_created'
+                self._increment_stat('misplaced', stat, statsd=True)
                 # save the broker info that was sampled prior to the *first*
                 # yielded objects for this destination
                 destination = {'part': part,
@@ -1004,20 +1759,20 @@ class ContainerSharder(ContainerReplicator):
             placed += len(objs)

         if unplaced:
-            self.
-
-                'in %s', unplaced, quote(src_broker.path))
+            self.warning(src_broker, 'Failed to find destination for at least '
+                         '%s misplaced objects', unplaced)

         # TODO: consider executing the replication jobs concurrently
         for dest_shard_range, dest_args in dest_brokers.items():
-            self.
-
+            self.debug(src_broker,
+                       'moving misplaced objects found in range %s',
+                       dest_shard_range)
             success &= self._replicate_and_delete(
                 src_broker, dest_shard_range, **dest_args)

-        self.
-        self.
-        return success, placed
+        self._update_stat('misplaced', 'placed', step=placed, statsd=True)
+        self._update_stat('misplaced', 'unplaced', step=unplaced, statsd=True)
+        return success, placed, unplaced

     def _make_shard_range_fetcher(self, broker, src_shard_range):
         # returns a function that will lazy load shard ranges on demand;
@@ -1058,12 +1813,12 @@ class ContainerSharder(ContainerReplicator):

     def _make_misplaced_object_bounds(self, broker):
         bounds = []
-
-        if
+        db_state = broker.get_db_state()
+        if db_state == SHARDED:
             # Anything in the object table is treated as a misplaced object.
             bounds.append(('', ''))

-        if not bounds and
+        if not bounds and db_state == SHARDING:
             # Objects outside of this container's own range are misplaced.
             # Objects in already cleaved shard ranges are also misplaced.
             cleave_context = CleavingContext.load(broker)
@@ -1091,8 +1846,7 @@ class ContainerSharder(ContainerReplicator):
         :return: True if all misplaced objects were sufficiently replicated to
             their correct shard containers, False otherwise
         """
-        self.
-            quote(broker.path), broker.db_file)
+        self.debug(broker, 'Looking for misplaced objects')
         self._increment_stat('misplaced', 'attempted')
         src_broker = src_broker or broker
         if src_bounds is None:
@@ -1100,22 +1854,27 @@ class ContainerSharder(ContainerReplicator):
         # (ab)use ShardRange instances to encapsulate source namespaces
         src_ranges = [ShardRange('dont/care', Timestamp.now(), lower, upper)
                       for lower, upper in src_bounds]
-        self.
+        self.debug(broker, 'misplaced object source bounds %s', src_bounds)
         policy_index = broker.storage_policy_index
         success = True
-
+        num_placed = num_unplaced = 0
         for src_shard_range in src_ranges:
-            part_success,
+            part_success, part_placed, part_unplaced = self._move_objects(
                 src_broker, src_shard_range, policy_index,
                 self._make_shard_range_fetcher(broker, src_shard_range))
             success &= part_success
-
+            num_placed += part_placed
+            num_unplaced += part_unplaced

-        if
+        if num_placed or num_unplaced:
+            # the found stat records the number of DBs in which any misplaced
+            # rows were found, not the total number of misplaced rows
             self._increment_stat('misplaced', 'found', statsd=True)
-            self.
-
-        self.
+            self.debug(broker, 'Placed %s misplaced objects (%s unplaced)',
+                       num_placed, num_unplaced)
+        self._increment_stat('misplaced', 'success' if success else 'failure',
+                             statsd=True)
+        self.debug(broker, 'Finished handling misplaced objects')
         return success

     def _find_shard_ranges(self, broker):
@@ -1131,27 +1890,26 @@ class ContainerSharder(ContainerReplicator):
         own_shard_range = broker.get_own_shard_range()
         shard_ranges = broker.get_shard_ranges()
         if shard_ranges and shard_ranges[-1].upper >= own_shard_range.upper:
-            self.
-                quote(broker.path))
+            self.debug(broker, 'Scan for shard ranges already completed')
             return 0

-        self.
-            quote(broker.path))
+        self.info(broker, 'Starting scan for shard ranges')
         self._increment_stat('scanned', 'attempted')

         start = time.time()
         shard_data, last_found = broker.find_shard_ranges(
-            self.
-            existing_ranges=shard_ranges
+            self.rows_per_shard, limit=self.shard_scanner_batch_size,
+            existing_ranges=shard_ranges,
+            minimum_shard_size=self.minimum_shard_size)
         elapsed = time.time() - start

         if not shard_data:
             if last_found:
-                self.
+                self.info(broker, "Already found all shard ranges")
                 self._increment_stat('scanned', 'success', statsd=True)
             else:
                 # we didn't find anything
-                self.
+                self.warning(broker, "No shard ranges found")
                 self._increment_stat('scanned', 'failure', statsd=True)
             return 0

@@ -1159,14 +1917,14 @@ class ContainerSharder(ContainerReplicator):
             broker, shard_data, self.shards_account_prefix)
         broker.merge_shard_ranges(shard_ranges)
         num_found = len(shard_ranges)
-        self.
-
-        self.
+        self.info(broker, "Completed scan for shard ranges: %d found",
+                  num_found)
+        self._update_stat('scanned', 'found', step=num_found)
         self._min_stat('scanned', 'min_time', round(elapsed / num_found, 3))
         self._max_stat('scanned', 'max_time', round(elapsed / num_found, 3))

         if last_found:
-            self.
+            self.info(broker, "Final shard range reached.")
             self._increment_stat('scanned', 'success', statsd=True)
         return num_found

@@ -1174,7 +1932,7 @@ class ContainerSharder(ContainerReplicator):
         # Create shard containers that are ready to receive redirected object
         # updates. Do this now, so that redirection can begin immediately
         # without waiting for cleaving to complete.
-        found_ranges = broker.get_shard_ranges(states=ShardRange.FOUND)
+        found_ranges = broker.get_shard_ranges(states=[ShardRange.FOUND])
         created_ranges = []
         for shard_range in found_ranges:
             self._increment_stat('created', 'attempted')
@@ -1193,16 +1951,15 @@ class ContainerSharder(ContainerReplicator):
             # may think they are in fact roots, but it cleans up well enough
             # once everyone's upgraded.
             success = self._send_shard_ranges(
-                shard_range.account, shard_range.container,
+                broker, shard_range.account, shard_range.container,
                 [shard_range], headers=headers)
             if success:
-                self.
-
+                self.debug(broker, 'PUT new shard range container for %s',
+                           shard_range)
                 self._increment_stat('created', 'success', statsd=True)
             else:
-                self.
-
-                    shard_range, quote(broker.path))
+                self.error(broker, 'PUT of new shard container %r failed',
+                           shard_range)
                 self._increment_stat('created', 'failure', statsd=True)
                 # break, not continue, because elsewhere it is assumed that
                 # finding and cleaving shard ranges progresses linearly, so we
@@ -1214,32 +1971,17 @@ class ContainerSharder(ContainerReplicator):
         if created_ranges:
             broker.merge_shard_ranges(created_ranges)
             if not broker.is_root_container():
-                self._send_shard_ranges(
-
-        self.
-
-            len(created_ranges))
+                self._send_shard_ranges(broker, broker.root_account,
+                                        broker.root_container, created_ranges)
+        self.info(broker, "Completed creating %d shard range containers",
+                  len(created_ranges))
         return len(created_ranges)

-    def
-
-
-
-                  quote(shard_range.name), shard_range)
-        self._increment_stat('cleaved', 'attempted')
+    def _cleave_shard_broker(self, broker, cleaving_context, shard_range,
+                             own_shard_range, shard_broker, put_timestamp,
+                             shard_part, node_id):
+        result = CLEAVE_SUCCESS
         start = time.time()
-        policy_index = broker.storage_policy_index
-        try:
-            shard_part, shard_broker, node_id, put_timestamp = \
-                self._get_shard_broker(shard_range, broker.root_path,
-                                       policy_index)
-        except DeviceUnavailable as duex:
-            self.logger.warning(str(duex))
-            self._increment_stat('cleaved', 'failure', statsd=True)
-            return CLEAVE_FAILED
-
-        own_shard_range = broker.get_own_shard_range()
-
         # only cleave from the retiring db - misplaced objects handler will
         # deal with any objects in the fresh db
         source_broker = broker.get_brokers()[0]
@@ -1258,21 +2000,15 @@ class ContainerSharder(ContainerReplicator):
                     since_row=sync_from_row):
                 shard_broker.merge_items(objects)
             if objects is None:
-                self.
-
+                self.info(broker, "Cleaving %r - zero objects found",
+                          shard_range)
                 if shard_broker.get_info()['put_timestamp'] == put_timestamp:
                     # This was just created; don't need to replicate this
                     # SR because there was nothing there. So cleanup and
                     # remove the shard_broker from its hand off location.
-                    self.delete_db(shard_broker)
-                    cleaving_context.range_done(shard_range.upper_str)
-                    if shard_range.upper >= own_shard_range.upper:
-                        # cleaving complete
-                        cleaving_context.cleaving_done = True
-                    cleaving_context.store(broker)
                     # Because nothing was here we wont count it in the shard
                     # batch count.
-
+                    result = CLEAVE_EMPTY
                 # Else, it wasn't newly created by us, and
                 # we don't know what's in it or why. Let it get
                 # replicated and counted in the batch count.
@@ -1288,20 +2024,25 @@ class ContainerSharder(ContainerReplicator):
                 [{'sync_point': source_max_row, 'remote_id': source_db_id}] +
                 source_broker.get_syncs())
         else:
-            self.
-
+            self.debug(broker, "Cleaving %r - shard db already in sync",
+                       shard_range)

         replication_quorum = self.existing_shard_replication_quorum
-        if
-
-
-
-
-
-
-
-
-
+        if own_shard_range.state in ShardRange.SHRINKING_STATES:
+            if shard_range.includes(own_shard_range):
+                # When shrinking to a single acceptor that completely encloses
+                # this shard's namespace, include deleted own (donor) shard
+                # range in the replicated db so that when acceptor next updates
+                # root it will atomically update its namespace *and* delete the
+                # donor. This reduces the chance of a temporary listing gap if
+                # this shard fails to update the root with its SHRUNK/deleted
+                # state. Don't do this when sharding a shard or shrinking to
+                # multiple acceptors because in those cases the donor namespace
+                # should not be deleted until *all* shards are cleaved.
+                if own_shard_range.update_state(ShardRange.SHRUNK):
+                    own_shard_range.set_deleted()
+                    broker.merge_shard_ranges(own_shard_range)
+                shard_broker.merge_shard_ranges(own_shard_range)
         elif shard_range.state == ShardRange.CREATED:
             # The shard range object stats may have changed since the shard
             # range was found, so update with stats of objects actually
@@ -1310,51 +2051,74 @@ class ContainerSharder(ContainerReplicator):
             info = shard_broker.get_info()
             shard_range.update_meta(
                 info['object_count'], info['bytes_used'])
+            # Update state to CLEAVED; only do this when sharding, not when
+            # shrinking
             shard_range.update_state(ShardRange.CLEAVED)
             shard_broker.merge_shard_ranges(shard_range)
             replication_quorum = self.shard_replication_quorum

-
-
-
-
-
-
+        if result == CLEAVE_EMPTY:
+            self.delete_db(shard_broker)
+        else:  # result == CLEAVE_SUCCESS:
+            self.info(broker, 'Replicating new shard container %s for %s',
+                      quote(shard_broker.path), own_shard_range)
+
+            success, responses = self._replicate_object(
+                shard_part, shard_broker.db_file, node_id)
+
+            replication_successes = responses.count(True)
+            if (not success and (not responses or
+                                 replication_successes < replication_quorum)):
+                # insufficient replication or replication not even attempted;
+                # break because we don't want to progress the cleave cursor
+                # until each shard range has been successfully cleaved
+                self.warning(
+                    broker, 'Failed to sufficiently replicate cleaved shard '
+                    '%s in state %s: %s successes, %s required, '
+                    'shard db: %s',
+                    shard_broker.path, shard_range.state_text,
+                    replication_successes, replication_quorum,
+                    shard_broker.db_file)
+                self._increment_stat('cleaved', 'failure', statsd=True)
+                result = CLEAVE_FAILED
+            else:
+                elapsed = round(time.time() - start, 3)
+                self._min_stat('cleaved', 'min_time', elapsed)
+                self._max_stat('cleaved', 'max_time', elapsed)
+                self.info(broker, 'Cleaved %s in %gs', shard_range,
+                          elapsed)
+                self._increment_stat('cleaved', 'success', statsd=True)
+
+        if result in (CLEAVE_SUCCESS, CLEAVE_EMPTY):
+            broker.merge_shard_ranges(shard_range)
+            cleaving_context.range_done(shard_range.upper_str)
+            if shard_range.upper >= own_shard_range.upper:
+                # cleaving complete
+                cleaving_context.cleaving_done = True
+            cleaving_context.store(broker)
+        return result

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        self._max_stat('cleaved', 'max_time', elapsed)
-        broker.merge_shard_ranges(shard_range)
-        cleaving_context.range_done(shard_range.upper_str)
-        if shard_range.upper >= own_shard_range.upper:
-            # cleaving complete
-            cleaving_context.cleaving_done = True
-        cleaving_context.store(broker)
-        self.logger.info(
-            'Cleaved %s for shard range %s in %gs.',
-            quote(broker.path), shard_range, elapsed)
-        self._increment_stat('cleaved', 'success', statsd=True)
-        return CLEAVE_SUCCESS
+    def _cleave_shard_range(self, broker, cleaving_context, shard_range,
+                            own_shard_range):
+        self.info(broker, "Cleaving from row %s into %s for %r",
+                  cleaving_context.last_cleave_to_row,
+                  quote(shard_range.name), shard_range)
+        self._increment_stat('cleaved', 'attempted')
+        policy_index = broker.storage_policy_index
+        shard_part, shard_broker, node_id, put_timestamp = \
+            self._get_shard_broker(shard_range, broker.root_path,
+                                   policy_index)
+        stat = 'db_exists' if put_timestamp is None else 'db_created'
+        self._increment_stat('cleaved', stat, statsd=True)
+        return self._cleave_shard_broker(
+            broker, cleaving_context, shard_range, own_shard_range,
+            shard_broker, put_timestamp, shard_part, node_id)

     def _cleave(self, broker):
         # Returns True if misplaced objects have been moved and the entire
         # container namespace has been successfully cleaved, False otherwise
         if broker.is_sharded():
-            self.
-                quote(broker.path))
+            self.debug(broker, 'Passing over already sharded container')
             return True

         cleaving_context = CleavingContext.load(broker)
@@ -1362,9 +2126,8 @@ class ContainerSharder(ContainerReplicator):
|
|
1362
2126
|
# ensure any misplaced objects in the source broker are moved; note
|
1363
2127
|
# that this invocation of _move_misplaced_objects is targetted at
|
1364
2128
|
# the *retiring* db.
|
1365
|
-
self.
|
1366
|
-
|
1367
|
-
quote(broker.path))
|
2129
|
+
self.debug(broker,
|
2130
|
+
'Moving any misplaced objects from sharding container')
|
1368
2131
|
bounds = self._make_default_misplaced_object_bounds(broker)
|
1369
2132
|
cleaving_context.misplaced_done = self._move_misplaced_objects(
|
1370
2133
|
broker, src_broker=broker.get_brokers()[0],
|
@@ -1372,60 +2135,78 @@ class ContainerSharder(ContainerReplicator):
1372 2135 |           cleaving_context.store(broker)
1373 2136 |
1374 2137 |           if cleaving_context.cleaving_done:
1375      | -             self.
1376      | -                 quote(broker.path))
     2138 | +             self.debug(broker, 'Cleaving already complete for container')
1377 2139 |               return cleaving_context.misplaced_done
1378 2140 |
1379      | -
     2141 | +         shard_ranges = broker.get_shard_ranges(marker=cleaving_context.marker)
     2142 | +         # Ignore shrinking shard ranges: we never want to cleave objects to a
     2143 | +         # shrinking shard. Shrinking shard ranges are to be expected in a root;
     2144 | +         # shrinking shard ranges (other than own shard range) are not normally
     2145 | +         # expected in a shard but can occur if there is an overlapping shard
     2146 | +         # range that has been discovered from the root.
     2147 | +         ranges_todo = [sr for sr in shard_ranges
     2148 | +                        if sr.state != ShardRange.SHRINKING]
1380 2149 |           if cleaving_context.cursor:
1381      | -             # always update ranges_todo in case
1382      | -             #
     2150 | +             # always update ranges_todo in case shard ranges have changed since
     2151 | +             # last visit
1383 2152 |               cleaving_context.ranges_todo = len(ranges_todo)
1384      | -             self.
1385      | -
1386      | -
1387      | -                 quote(broker.path))
     2153 | +             self.debug(broker, 'Continuing to cleave (%s done, %s todo)',
     2154 | +                        cleaving_context.ranges_done,
     2155 | +                        cleaving_context.ranges_todo)
1388 2156 |           else:
1389 2157 |               cleaving_context.start()
     2158 | +             own_shard_range = broker.get_own_shard_range()
     2159 | +             cleaving_context.cursor = own_shard_range.lower_str
1390 2160 |               cleaving_context.ranges_todo = len(ranges_todo)
1391      | -             self.
1392      | -
     2161 | +             self.info(broker, 'Starting to cleave (%s todo)',
     2162 | +                       cleaving_context.ranges_todo)
     2163 | +
     2164 | +         own_shard_range = broker.get_own_shard_range(no_default=True)
     2165 | +         if own_shard_range is None:
     2166 | +             # A default should never be SHRINKING or SHRUNK but because we
     2167 | +             # may write own_shard_range back to broker, let's make sure
     2168 | +             # it can't be defaulted.
     2169 | +             self.warning(broker, 'Failed to get own_shard_range')
     2170 | +             ranges_todo = []  # skip cleaving
1393 2171 |
1394 2172 |           ranges_done = []
1395 2173 |           for shard_range in ranges_todo:
1396      | -             if
1397      | -                 #
1398      | -                 #
1399      | -                 #
1400      | -                 #
1401      | -                 # occur if there is an overlapping shard range that has been
1402      | -                 # discovered from the root.
1403      | -                 cleaving_context.range_done(None)  # don't move the cursor
1404      | -                 continue
1405      | -             elif shard_range.state in (ShardRange.CREATED,
1406      | -                                        ShardRange.CLEAVED,
1407      | -                                        ShardRange.ACTIVE):
1408      | -                 cleave_result = self._cleave_shard_range(
1409      | -                     broker, cleaving_context, shard_range)
1410      | -                 if cleave_result == CLEAVE_SUCCESS:
1411      | -                     ranges_done.append(shard_range)
1412      | -                     if len(ranges_done) == self.cleave_batch_size:
1413      | -                         break
1414      | -                 elif cleave_result == CLEAVE_FAILED:
1415      | -                     break
1416      | -                 # else, no errors, but no rows found either. keep going,
1417      | -                 # and don't count it against our batch size
1418      | -             else:
1419      | -                 self.logger.info('Stopped cleave at unready %s', shard_range)
     2174 | +             if cleaving_context.cleaving_done:
     2175 | +                 # note: there may still be ranges_todo, for example: if this
     2176 | +                 # shard is shrinking and has merged a root shard range in
     2177 | +                 # sharded state along with an active acceptor shard range, but
     2178 | +                 # the root range is irrelevant
1420 2179 |                   break
1421 2180 |
1422      | -
1423      | -
1424      | -
1425      | -             cleaving_context.
1426      | -
1427      | -
1428      | -
     2181 | +             if len(ranges_done) == self.cleave_batch_size:
     2182 | +                 break
     2183 | +
     2184 | +             if shard_range.lower > cleaving_context.cursor:
     2185 | +                 self.info(broker, 'Stopped cleave at gap: %r - %r' %
     2186 | +                           (cleaving_context.cursor, shard_range.lower))
     2187 | +                 break
     2188 | +
     2189 | +             if shard_range.state not in (ShardRange.CREATED,
     2190 | +                                          ShardRange.CLEAVED,
     2191 | +                                          ShardRange.ACTIVE):
     2192 | +                 self.info(broker, 'Stopped cleave at unready %s', shard_range)
     2193 | +                 break
     2194 | +
     2195 | +             cleave_result = self._cleave_shard_range(
     2196 | +                 broker, cleaving_context, shard_range, own_shard_range)
     2197 | +
     2198 | +             if cleave_result == CLEAVE_SUCCESS:
     2199 | +                 ranges_done.append(shard_range)
     2200 | +             elif cleave_result == CLEAVE_FAILED:
     2201 | +                 break
     2202 | +             # else: CLEAVE_EMPTY: no errors, but no rows found either. keep
     2203 | +             # going, and don't count it against our batch size
     2204 | +
     2205 | +         # _cleave_shard_range always store()s the context on success; *also* do
     2206 | +         # that here in case we hit a failure right off the bat or ended loop
     2207 | +         # with skipped ranges
     2208 | +         cleaving_context.store(broker)
     2209 | +         self.debug(broker, 'Cleaved %s shard ranges', len(ranges_done))
1429 2210 |           return (cleaving_context.misplaced_done and
1430 2211 |                   cleaving_context.cleaving_done)
1431 2212 |
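Note on the rewritten cleave loop above: shrinking ranges are now filtered out up front and the remaining ranges are walked in namespace order, stopping at the batch limit, at a namespace gap (shard_range.lower > cleaving_context.cursor), or at the first unready range. A minimal sketch of the gap check, using simplified stand-in types rather than swift's actual ShardRange/CleavingContext API:

```python
# Illustrative only: a simplified model of the gap check in the new _cleave
# loop. Real swift code walks ShardRange objects and persists a
# CleavingContext; the names below are hypothetical.
from collections import namedtuple

Range = namedtuple('Range', ['lower', 'upper'])


def cleave_in_order(ranges, cursor='a'):
    """Cleave ranges that are contiguous from ``cursor``; stop at a gap."""
    done = []
    for r in sorted(ranges, key=lambda r: r.lower):
        if r.lower > cursor:
            # gap in the namespace: defer until the missing range appears
            break
        done.append(r)
        cursor = max(cursor, r.upper)
    return done


# The range covering 'm'-'t' is missing, so cleaving stops after 'g'-'m'.
print(cleave_in_order([Range('a', 'g'), Range('g', 'm'), Range('t', 'z')]))
# -> [Range(lower='a', upper='g'), Range(lower='g', upper='m')]
```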
@@ -1435,18 +2216,23 @@ class ContainerSharder(ContainerReplicator):
1435 2216 |               # Move all CLEAVED shards to ACTIVE state and if a shard then
1436 2217 |               # delete own shard range; these changes will be simultaneously
1437 2218 |               # reported in the next update to the root container.
1438      | -
1439      | -
1440      | -
1441      | -
1442      | -
1443      | -
1444      | -
1445      | -                 next_state = ShardRange.SHRUNK
1446      | -             else:
1447      | -                 next_state = ShardRange.SHARDED
1448      | -             own_shard_range.update_state(next_state)
     2219 | +             own_shard_range = broker.get_own_shard_range(no_default=True)
     2220 | +             if own_shard_range is None:
     2221 | +                 # This is more of a belts and braces, not sure we could even
     2222 | +                 # get this far with without an own_shard_range. But because
     2223 | +                 # we will be writing own_shard_range back, we need to make sure
     2224 | +                 self.warning(broker, 'Failed to get own_shard_range')
     2225 | +                 return False
1449 2226 |               own_shard_range.update_meta(0, 0)
     2227 | +             if own_shard_range.state in ShardRange.SHRINKING_STATES:
     2228 | +                 own_shard_range.update_state(ShardRange.SHRUNK)
     2229 | +                 modified_shard_ranges = []
     2230 | +             else:
     2231 | +                 own_shard_range.update_state(ShardRange.SHARDED)
     2232 | +                 modified_shard_ranges = broker.get_shard_ranges(
     2233 | +                     states=[ShardRange.CLEAVED])
     2234 | +             for sr in modified_shard_ranges:
     2235 | +                 sr.update_state(ShardRange.ACTIVE)
1450 2236 |               if (not broker.is_root_container() and not
1451 2237 |                       own_shard_range.deleted):
1452 2238 |                   own_shard_range = own_shard_range.copy(
@@ -1454,16 +2240,12 @@ class ContainerSharder(ContainerReplicator):
1454 2240 |                   modified_shard_ranges.append(own_shard_range)
1455 2241 |               broker.merge_shard_ranges(modified_shard_ranges)
1456 2242 |               if broker.set_sharded_state():
1457      | -                 cleaving_context.delete(broker)
1458 2243 |                   return True
1459 2244 |               else:
1460      | -                 self.
1461      | -                     'Failed to remove retiring db file for %s',
1462      | -                     quote(broker.path))
     2245 | +                 self.warning(broker, 'Failed to remove retiring db file')
1463 2246 |           else:
1464      | -             self.
1465      | -
1466      | -                 broker.db_files[0], dict(cleaving_context))
     2247 | +             self.warning(broker, 'Repeat cleaving required, context: %s',
     2248 | +                          dict(cleaving_context))
1467 2249 |               cleaving_context.reset()
1468 2250 |               cleaving_context.store(broker)
1469 2251 |
@@ -1473,106 +2255,138 @@ class ContainerSharder(ContainerReplicator):
1473 2255 |           candidates = find_sharding_candidates(
1474 2256 |               broker, self.shard_container_threshold, shard_ranges)
1475 2257 |           if candidates:
1476      | -             self.
1477      | -
     2258 | +             self.debug(broker, 'Identified %s sharding candidates',
     2259 | +                        len(candidates))
1478 2260 |               broker.merge_shard_ranges(candidates)
1479 2261 |
1480 2262 |       def _find_and_enable_shrinking_candidates(self, broker):
1481 2263 |           if not broker.is_sharded():
1482      | -             self.
1483      | -                 quote(broker.path))
     2264 | +             self.warning(broker, 'Cannot shrink a not yet sharded container')
1484 2265 |               return
1485 2266 |
1486      | -
1487      | -             broker, self.
1488      | -
     2267 | +         compactible_sequences = find_compactible_shard_sequences(
     2268 | +             broker, self.shrink_threshold, self.expansion_limit,
     2269 | +             self.max_shrinking, self.max_expanding, include_shrinking=True)
     2270 | +         self.debug(broker, 'Found %s compactible sequences of length(s) %s' %
     2271 | +                    (len(compactible_sequences),
     2272 | +                     [len(s) for s in compactible_sequences]))
     2273 | +         process_compactible_shard_sequences(broker, compactible_sequences)
1489 2274 |           own_shard_range = broker.get_own_shard_range()
1490      | -         for
1491      | -
1492      | -
1493      | -
     2275 | +         for sequence in compactible_sequences:
     2276 | +             acceptor = sequence[-1]
     2277 | +             donors = ShardRangeList(sequence[:-1])
     2278 | +             self.debug(broker,
     2279 | +                        'shrinking %d objects from %d shard ranges into %s' %
     2280 | +                        (donors.object_count, len(donors), acceptor))
1494 2281 |               if acceptor.name != own_shard_range.name:
1495      | -                 self._send_shard_ranges(
1496      | -
1497      | -                 acceptor.increment_meta(
1498      | -             else:
1499      | -                 # no need to change namespace or stats
1500      | -                 acceptor.update_state(ShardRange.ACTIVE,
1501      | -                                       state_timestamp=Timestamp.now())
     2282 | +                 self._send_shard_ranges(broker, acceptor.account,
     2283 | +                                         acceptor.container, [acceptor])
     2284 | +                 acceptor.increment_meta(donors.object_count, donors.bytes_used)
1502 2285 |               # Now send a copy of the expanded acceptor, with an updated
1503      | -             # timestamp, to
     2286 | +             # timestamp, to each donor container. This forces each donor to
1504 2287 |               # asynchronously cleave its entire contents to the acceptor and
1505 2288 |               # delete itself. The donor will pass its own deleted shard range to
1506 2289 |               # the acceptor when cleaving. Subsequent updates from the donor or
1507 2290 |               # the acceptor will then update the root to have the deleted donor
1508 2291 |               # shard range.
1509      | -
1510      | -
     2292 | +             for donor in donors:
     2293 | +                 self._send_shard_ranges(broker, donor.account,
     2294 | +                                         donor.container, [donor, acceptor])
1511 2295 |
1512 2296 |       def _update_root_container(self, broker):
1513 2297 |           own_shard_range = broker.get_own_shard_range(no_default=True)
1514      | -         if not own_shard_range
     2298 | +         if not own_shard_range:
     2299 | +             return
     2300 | +
     2301 | +         # Don't update the osr stats including tombstones unless its CLEAVED+
     2302 | +         if own_shard_range.state in SHARD_UPDATE_STAT_STATES:
     2303 | +             # do a reclaim *now* in order to get best estimate of tombstone
     2304 | +             # count that is consistent with the current object_count
     2305 | +             reclaimer = self._reclaim(broker)
     2306 | +             tombstones = reclaimer.get_tombstone_count()
     2307 | +             self.debug(broker, 'tombstones = %d', tombstones)
     2308 | +             # shrinking candidates are found in the root DB so that's the only
     2309 | +             # place we need up to date tombstone stats.
     2310 | +             own_shard_range.update_tombstones(tombstones)
     2311 | +             update_own_shard_range_stats(broker, own_shard_range)
     2312 | +
     2313 | +             if not own_shard_range.reported:
     2314 | +                 broker.merge_shard_ranges(own_shard_range)
     2315 | +
     2316 | +         # we can't use `state not in SHARD_UPDATE_STAT_STATES` to return
     2317 | +         # because there are cases we still want to update root even if the
     2318 | +         # stats are wrong. Such as it's a new shard or something else has
     2319 | +         # decided to remove the latch to update root.
     2320 | +         if own_shard_range.reported:
1515 2321 |               return
1516 2322 |
1517      | -         # persist the reported shard metadata
1518      | -         broker.merge_shard_ranges(own_shard_range)
1519 2323 |           # now get a consistent list of own and other shard ranges
1520 2324 |           shard_ranges = broker.get_shard_ranges(
1521 2325 |               include_own=True,
1522 2326 |               include_deleted=True)
1523 2327 |           # send everything
1524      | -         if self._send_shard_ranges(
1525      | -
     2328 | +         if self._send_shard_ranges(broker, broker.root_account,
     2329 | +                                    broker.root_container, shard_ranges,
     2330 | +                                    {'Referer': quote(broker.path)}):
1526 2331 |               # on success, mark ourselves as reported so we don't keep
1527 2332 |               # hammering the root
1528 2333 |               own_shard_range.reported = True
1529 2334 |               broker.merge_shard_ranges(own_shard_range)
     2335 | +             self.debug(broker, 'updated root objs=%d, tombstones=%s',
     2336 | +                        own_shard_range.object_count,
     2337 | +                        own_shard_range.tombstones)
1530 2338 |
1531 2339 |       def _process_broker(self, broker, node, part):
1532 2340 |           broker.get_info()  # make sure account/container are populated
1533      | -
1534      | -
1535      | -
     2341 | +         db_state = broker.get_db_state()
     2342 | +         is_deleted = broker.is_deleted()
     2343 | +         self.debug(broker, 'Starting processing, state %s%s', db_state,
     2344 | +                    ' (deleted)' if is_deleted else '')
1536 2345 |
1537 2346 |           if not self._audit_container(broker):
1538 2347 |               return
1539 2348 |
1540 2349 |           # now look and deal with misplaced objects.
     2350 | +         move_start_ts = time.time()
1541 2351 |           self._move_misplaced_objects(broker)
     2352 | +         self.logger.timing_since(
     2353 | +             'sharder.sharding.move_misplaced', move_start_ts)
1542 2354 |
1543      | -
1544      | -             # This container is deleted so we can skip it. We still want
1545      | -             # deleted containers to go via misplaced items because they may
1546      | -             # have new objects sitting in them that may need to move.
1547      | -             return
     2355 | +         is_leader = node['index'] == 0 and self.auto_shard and not is_deleted
1548 2356 |
1549      | -
1550      | -         if state in (UNSHARDED, COLLAPSED):
     2357 | +         if db_state in (UNSHARDED, COLLAPSED):
1551 2358 |               if is_leader and broker.is_root_container():
1552 2359 |                   # bootstrap sharding of root container
     2360 | +                 own_shard_range = broker.get_own_shard_range()
     2361 | +                 update_own_shard_range_stats(broker, own_shard_range)
1553 2362 |                   self._find_and_enable_sharding_candidates(
1554      | -                     broker, shard_ranges=[
     2363 | +                     broker, shard_ranges=[own_shard_range])
1555 2364 |
1556 2365 |               own_shard_range = broker.get_own_shard_range()
1557      | -             if own_shard_range.state in
1558      | -
1559      | -                                         ShardRange.SHARDED,
1560      | -                                         ShardRange.SHRUNK):
1561      | -                 if broker.get_shard_ranges():
     2366 | +             if own_shard_range.state in ShardRange.CLEAVING_STATES:
     2367 | +                 if broker.has_other_shard_ranges():
1562 2368 |                       # container has been given shard ranges rather than
1563      | -                     # found them e.g. via replication or a shrink event
     2369 | +                     # found them e.g. via replication or a shrink event,
     2370 | +                     # or manually triggered cleaving.
     2371 | +                     db_start_ts = time.time()
1564 2372 |                       if broker.set_sharding_state():
1565      | -
     2373 | +                         db_state = SHARDING
     2374 | +                         self.info(broker, 'Kick off container cleaving, '
     2375 | +                                   'own shard range in state %r',
     2376 | +                                   own_shard_range.state_text)
     2377 | +                         self.logger.timing_since(
     2378 | +                             'sharder.sharding.set_state', db_start_ts)
1566 2379 |               elif is_leader:
1567 2380 |                   if broker.set_sharding_state():
1568      | -
     2381 | +                     db_state = SHARDING
1569 2382 |               else:
1570      | -                 self.
1571      | -
1572      | -
1573      | -
     2383 | +                 self.debug(broker,
     2384 | +                            'Own shard range in state %r but no shard '
     2385 | +                            'ranges and not leader; remaining unsharded',
     2386 | +                            own_shard_range.state_text)
1574 2387 |
1575      | -         if
     2388 | +         if db_state == SHARDING:
     2389 | +             cleave_start_ts = time.time()
1576 2390 |               if is_leader:
1577 2391 |                   num_found = self._find_shard_ranges(broker)
1578 2392 |               else:
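The _update_root_container changes above hang off a 'reported' latch: refreshed stats (including tombstones) are merged locally and a successful send to the root is what sets the latch, so later passes become no-ops until something changes. A rough sketch of that pattern, with illustrative names rather than swift's broker/ShardRange API:

```python
# Sketch of the report-once latch; not swift's implementation, just the shape.
class OwnShardRange(object):
    def __init__(self):
        self.object_count = 0
        self.tombstones = -1
        self.reported = False

    def update_stats(self, object_count, tombstones):
        if (object_count, tombstones) != (self.object_count, self.tombstones):
            self.object_count = object_count
            self.tombstones = tombstones
            self.reported = False  # fresh stats invalidate the latch


def maybe_update_root(osr, send_to_root):
    """Send stats to the root unless they were already reported."""
    if osr.reported:
        return False
    if send_to_root(osr):
        osr.reported = True  # don't keep hammering the root
    return osr.reported


osr = OwnShardRange()
osr.update_stats(object_count=100, tombstones=3)
assert maybe_update_root(osr, lambda sr: True) is True
assert maybe_update_root(osr, lambda sr: True) is False  # already reported
```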
@@ -1587,38 +2401,53 @@ class ContainerSharder(ContainerReplicator):
1587 2401 |
1588 2402 |               # always try to cleave any pending shard ranges
1589 2403 |               cleave_complete = self._cleave(broker)
     2404 | +             self.logger.timing_since(
     2405 | +                 'sharder.sharding.cleave', cleave_start_ts)
1590 2406 |
1591 2407 |               if cleave_complete:
1592      | -                 self.logger.info('Completed cleaving of %s',
1593      | -                                  quote(broker.path))
1594 2408 |                   if self._complete_sharding(broker):
1595      | -
     2409 | +                     db_state = SHARDED
1596 2410 |                       self._increment_stat('visited', 'completed', statsd=True)
     2411 | +                     self.info(broker, 'Completed cleaving, DB set to sharded '
     2412 | +                               'state')
     2413 | +                     self.logger.timing_since(
     2414 | +                         'sharder.sharding.completed',
     2415 | +                         float(broker.get_own_shard_range().epoch))
1597 2416 |                   else:
1598      | -                     self.
1599      | -
     2417 | +                     self.info(broker, 'Completed cleaving, DB remaining in '
     2418 | +                               'sharding state')
     2419 | +
     2420 | +         if not broker.is_deleted():
     2421 | +             if db_state == SHARDED and broker.is_root_container():
     2422 | +                 # look for shrink stats
     2423 | +                 send_start_ts = time.time()
     2424 | +                 self._identify_shrinking_candidate(broker, node)
     2425 | +                 if is_leader:
     2426 | +                     self._find_and_enable_shrinking_candidates(broker)
     2427 | +                     self._find_and_enable_sharding_candidates(broker)
     2428 | +                     for shard_range in broker.get_shard_ranges(
     2429 | +                             states=[ShardRange.SHARDING]):
     2430 | +                         self._send_shard_ranges(broker, shard_range.account,
     2431 | +                                                 shard_range.container,
     2432 | +                                                 [shard_range])
     2433 | +                 self.logger.timing_since(
     2434 | +                     'sharder.sharding.send_sr', send_start_ts)
1600 2435 |
1601      | -
1602      | -
1603      | -
1604      | -
1605      | -
1606      | -
1607      | -
1608      | -
1609      | -
1610      | -
1611      | -
1612      | -
1613      | -
1614      | -
1615      | -
1616      | -             # shards move to ACTIVE state and the sharded shard
1617      | -             # simultaneously become deleted.
1618      | -             self._update_root_container(broker)
1619      | -
1620      | -         self.logger.debug('Finished processing %s state %s',
1621      | -                           quote(broker.path), broker.get_db_state())
     2436 | +             if not broker.is_root_container():
     2437 | +                 # Update the root container with this container's shard range
     2438 | +                 # info; do this even when sharded in case previous attempts
     2439 | +                 # failed; don't do this if there is no own shard range. When
     2440 | +                 # sharding a shard, this is when the root will see the new
     2441 | +                 # shards move to ACTIVE state and the sharded shard
     2442 | +                 # simultaneously become deleted.
     2443 | +                 update_start_ts = time.time()
     2444 | +                 self._update_root_container(broker)
     2445 | +                 self.logger.timing_since(
     2446 | +                     'sharder.sharding.update_root', update_start_ts)
     2447 | +
     2448 | +         self.debug(broker,
     2449 | +                    'Finished processing, state %s%s',
     2450 | +                    broker.get_db_state(), ' (deleted)' if is_deleted else '')
1622 2451 |
1623 2452 |       def _one_shard_cycle(self, devices_to_shard, partitions_to_shard):
1624 2453 |           """
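The timing_since calls added in _process_broker above give each sharding phase (move_misplaced, set_state, cleave, completed, send_sr, update_root) its own timer metric. A minimal sketch of the pattern, with a stand-in emit function instead of swift's statsd-backed logger:

```python
# Illustrative only: the shape of the per-phase timing added above. 'emit'
# stands in for a statsd-style timing call; swift uses its logger's
# timing_since() with metric names like 'sharder.sharding.cleave'.
import time


def timed_phase(metric, func, emit=print):
    start = time.time()
    try:
        return func()
    finally:
        # statsd timers are conventionally reported in milliseconds
        emit('%s: %.1fms' % (metric, (time.time() - start) * 1000))


timed_phase('sharder.sharding.cleave', lambda: time.sleep(0.01))
```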
@@ -1642,9 +2471,9 @@ class ContainerSharder(ContainerReplicator):
1642 2471 |               self.logger.info('(Override partitions: %s)',
1643 2472 |                                ', '.join(str(p) for p in partitions_to_shard))
1644 2473 |           self._zero_stats()
1645      | -         self._local_device_ids =
     2474 | +         self._local_device_ids = {}
1646 2475 |           dirs = []
1647      | -         self.ips = whataremyips(
     2476 | +         self.ips = whataremyips(self.bind_ip)
1648 2477 |           for node in self.ring.devs:
1649 2478 |               device_path = self._check_node(node)
1650 2479 |               if not device_path:
@@ -1653,7 +2482,7 @@ class ContainerSharder(ContainerReplicator):
1653 2482 |               if os.path.isdir(datadir):
1654 2483 |                   # Populate self._local_device_ids so we can find devices for
1655 2484 |                   # shard containers later
1656      | -                 self._local_device_ids
     2485 | +                 self._local_device_ids[node['id']] = node
1657 2486 |                   if node['device'] not in devices_to_shard:
1658 2487 |                       continue
1659 2488 |                   part_filt = self._partition_dir_filter(
@@ -1661,7 +2490,7 @@ class ContainerSharder(ContainerReplicator):
1661 2490 |                       partitions_to_shard)
1662 2491 |                   dirs.append((datadir, node, part_filt))
1663 2492 |           if not dirs:
1664      | -             self.logger.
     2493 | +             self.logger.info('Found no containers directories')
1665 2494 |           for part, path, node in self.roundrobin_datadirs(dirs):
1666 2495 |               # NB: get_part_nodes always provides an 'index' key;
1667 2496 |               # this will be used in leader selection
@@ -1686,42 +2515,47 @@ class ContainerSharder(ContainerReplicator):
1686 2515 |                       self._increment_stat('visited', 'skipped')
1687 2516 |               except (Exception, Timeout) as err:
1688 2517 |                   self._increment_stat('visited', 'failure', statsd=True)
1689      | -                 self.
1690      | -
     2518 | +                 self.exception(broker, 'Unhandled exception while processing: '
     2519 | +                                '%s', err)
1691 2520 |                   error = err
1692 2521 |               try:
1693 2522 |                   self._record_sharding_progress(broker, node, error)
1694 2523 |               except (Exception, Timeout) as error:
1695      | -                 self.
1696      | -
1697      | -                     path, error)
     2524 | +                 self.exception(broker, 'Unhandled exception while dumping '
     2525 | +                                'progress: %s', error)
1698 2526 |               self._periodic_report_stats()
1699 2527 |
1700 2528 |           self._report_stats()
1701 2529 |
     2530 | +     @contextmanager
1702 2531 |       def _set_auto_shard_from_command_line(self, **kwargs):
     2532 | +         conf_auto_shard = self.auto_shard
1703 2533 |           auto_shard = kwargs.get('auto_shard', None)
1704 2534 |           if auto_shard is not None:
1705 2535 |               self.auto_shard = config_true_value(auto_shard)
     2536 | +         try:
     2537 | +             yield
     2538 | +         finally:
     2539 | +             self.auto_shard = conf_auto_shard
1706 2540 |
1707 2541 |       def run_forever(self, *args, **kwargs):
1708 2542 |           """Run the container sharder until stopped."""
1709      | -         self._set_auto_shard_from_command_line(**kwargs)
1710      | -
1711      | -
1712      | -
1713      | -
1714      | -
1715      | -
1716      | -
1717      | -
1718      | -
1719      | -
1720      | -
1721      | -
1722      | -
1723      | -
1724      | -
     2543 | +         with self._set_auto_shard_from_command_line(**kwargs):
     2544 | +             self.reported = time.time()
     2545 | +             time.sleep(random() * self.interval)
     2546 | +             while True:
     2547 | +                 begin = time.time()
     2548 | +                 try:
     2549 | +                     self._one_shard_cycle(devices_to_shard=Everything(),
     2550 | +                                           partitions_to_shard=Everything())
     2551 | +                 except (Exception, Timeout):
     2552 | +                     self.logger.increment('errors')
     2553 | +                     self.logger.exception('Exception in sharder')
     2554 | +                 elapsed = time.time() - begin
     2555 | +                 self.logger.info(
     2556 | +                     'Container sharder cycle completed: %.02fs', elapsed)
     2557 | +                 if elapsed < self.interval:
     2558 | +                     time.sleep(self.interval - elapsed)
1725 2559 |
1726 2560 |       def run_once(self, *args, **kwargs):
1727 2561 |           """Run the container sharder once."""
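Turning _set_auto_shard_from_command_line into a context manager (above) means a command-line auto_shard override no longer leaks past the cycle it was requested for: the configured value is restored when the with-block exits. A minimal sketch of the same override-and-restore pattern, with a hypothetical Sharder class rather than swift's daemon:

```python
# Sketch of the pattern now used by run_once/run_forever; illustrative only.
from contextlib import contextmanager


class Sharder(object):
    """Stand-in for the daemon: only the auto_shard attribute matters here."""

    def __init__(self, auto_shard=True):
        self.auto_shard = auto_shard  # value loaded from the config file

    @contextmanager
    def _override_auto_shard(self, **kwargs):
        saved = self.auto_shard
        override = kwargs.get('auto_shard')
        if override is not None:
            self.auto_shard = override  # apply the command-line override
        try:
            yield
        finally:
            self.auto_shard = saved  # always restore the configured value


d = Sharder()
with d._override_auto_shard(auto_shard=False):
    assert d.auto_shard is False  # override active only inside the block
assert d.auto_shard is True       # configured value restored afterwards
```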
@@ -1729,10 +2563,32 @@ class ContainerSharder(ContainerReplicator):
1729 2563 |           override_options = parse_override_options(once=True, **kwargs)
1730 2564 |           devices_to_shard = override_options.devices or Everything()
1731 2565 |           partitions_to_shard = override_options.partitions or Everything()
1732      | -         self._set_auto_shard_from_command_line(**kwargs)
1733      | -
1734      | -
1735      | -
1736      | -
1737      | -
1738      | -
     2566 | +         with self._set_auto_shard_from_command_line(**kwargs):
     2567 | +             begin = self.reported = time.time()
     2568 | +             self._one_shard_cycle(devices_to_shard=devices_to_shard,
     2569 | +                                   partitions_to_shard=partitions_to_shard)
     2570 | +             elapsed = time.time() - begin
     2571 | +             self.logger.info(
     2572 | +                 'Container sharder "once" mode completed: %.02fs', elapsed)
     2573 | +
     2574 | +
     2575 | + def main():
     2576 | +     parser = OptionParser("%prog CONFIG [options]")
     2577 | +     parser.add_option('-d', '--devices',
     2578 | +                       help='Shard containers only on given devices. '
     2579 | +                            'Comma-separated list. '
     2580 | +                            'Only has effect if --once is used.')
     2581 | +     parser.add_option('-p', '--partitions',
     2582 | +                       help='Shard containers only in given partitions. '
     2583 | +                            'Comma-separated list. '
     2584 | +                            'Only has effect if --once is used.')
     2585 | +     parser.add_option('--no-auto-shard', action='store_false',
     2586 | +                       dest='auto_shard', default=None,
     2587 | +                       help='Disable auto-sharding. Overrides the auto_shard '
     2588 | +                            'value in the config file.')
     2589 | +     conf_file, options = parse_options(parser=parser, once=True)
     2590 | +     run_daemon(ContainerSharder, conf_file, **options)
     2591 | +
     2592 | +
     2593 | + if __name__ == '__main__':
     2594 | +     main()