swift 2.23.3-py3-none-any.whl → 2.35.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- swift/__init__.py +29 -50
- swift/account/auditor.py +21 -118
- swift/account/backend.py +33 -28
- swift/account/reaper.py +37 -28
- swift/account/replicator.py +22 -0
- swift/account/server.py +60 -26
- swift/account/utils.py +28 -11
- swift-2.23.3.data/scripts/swift-account-audit → swift/cli/account_audit.py +23 -13
- swift-2.23.3.data/scripts/swift-config → swift/cli/config.py +2 -2
- swift/cli/container_deleter.py +5 -11
- swift-2.23.3.data/scripts/swift-dispersion-populate → swift/cli/dispersion_populate.py +8 -7
- swift/cli/dispersion_report.py +10 -9
- swift-2.23.3.data/scripts/swift-drive-audit → swift/cli/drive_audit.py +63 -21
- swift/cli/form_signature.py +3 -7
- swift-2.23.3.data/scripts/swift-get-nodes → swift/cli/get_nodes.py +8 -2
- swift/cli/info.py +154 -14
- swift/cli/manage_shard_ranges.py +705 -37
- swift-2.23.3.data/scripts/swift-oldies → swift/cli/oldies.py +25 -14
- swift-2.23.3.data/scripts/swift-orphans → swift/cli/orphans.py +7 -3
- swift/cli/recon.py +196 -67
- swift-2.23.3.data/scripts/swift-recon-cron → swift/cli/recon_cron.py +17 -20
- swift-2.23.3.data/scripts/swift-reconciler-enqueue → swift/cli/reconciler_enqueue.py +2 -3
- swift/cli/relinker.py +807 -126
- swift/cli/reload.py +135 -0
- swift/cli/ringbuilder.py +217 -20
- swift/cli/ringcomposer.py +0 -1
- swift/cli/shard-info.py +4 -3
- swift/common/base_storage_server.py +9 -20
- swift/common/bufferedhttp.py +48 -74
- swift/common/constraints.py +20 -15
- swift/common/container_sync_realms.py +9 -11
- swift/common/daemon.py +25 -8
- swift/common/db.py +195 -128
- swift/common/db_auditor.py +168 -0
- swift/common/db_replicator.py +95 -55
- swift/common/digest.py +141 -0
- swift/common/direct_client.py +144 -33
- swift/common/error_limiter.py +93 -0
- swift/common/exceptions.py +25 -1
- swift/common/header_key_dict.py +2 -9
- swift/common/http_protocol.py +373 -0
- swift/common/internal_client.py +129 -59
- swift/common/linkat.py +3 -4
- swift/common/manager.py +284 -67
- swift/common/memcached.py +390 -145
- swift/common/middleware/__init__.py +4 -0
- swift/common/middleware/account_quotas.py +211 -46
- swift/common/middleware/acl.py +3 -8
- swift/common/middleware/backend_ratelimit.py +230 -0
- swift/common/middleware/bulk.py +22 -34
- swift/common/middleware/catch_errors.py +1 -3
- swift/common/middleware/cname_lookup.py +6 -11
- swift/common/middleware/container_quotas.py +1 -1
- swift/common/middleware/container_sync.py +39 -17
- swift/common/middleware/copy.py +12 -0
- swift/common/middleware/crossdomain.py +22 -9
- swift/common/middleware/crypto/__init__.py +2 -1
- swift/common/middleware/crypto/crypto_utils.py +11 -15
- swift/common/middleware/crypto/decrypter.py +28 -11
- swift/common/middleware/crypto/encrypter.py +12 -17
- swift/common/middleware/crypto/keymaster.py +8 -15
- swift/common/middleware/crypto/kms_keymaster.py +2 -1
- swift/common/middleware/dlo.py +15 -11
- swift/common/middleware/domain_remap.py +5 -4
- swift/common/middleware/etag_quoter.py +128 -0
- swift/common/middleware/formpost.py +73 -70
- swift/common/middleware/gatekeeper.py +8 -1
- swift/common/middleware/keystoneauth.py +33 -3
- swift/common/middleware/list_endpoints.py +4 -4
- swift/common/middleware/listing_formats.py +85 -49
- swift/common/middleware/memcache.py +4 -95
- swift/common/middleware/name_check.py +3 -2
- swift/common/middleware/proxy_logging.py +160 -92
- swift/common/middleware/ratelimit.py +17 -10
- swift/common/middleware/read_only.py +6 -4
- swift/common/middleware/recon.py +59 -22
- swift/common/middleware/s3api/acl_handlers.py +25 -3
- swift/common/middleware/s3api/acl_utils.py +6 -1
- swift/common/middleware/s3api/controllers/__init__.py +6 -0
- swift/common/middleware/s3api/controllers/acl.py +3 -2
- swift/common/middleware/s3api/controllers/bucket.py +242 -137
- swift/common/middleware/s3api/controllers/logging.py +2 -2
- swift/common/middleware/s3api/controllers/multi_delete.py +43 -20
- swift/common/middleware/s3api/controllers/multi_upload.py +219 -133
- swift/common/middleware/s3api/controllers/obj.py +112 -8
- swift/common/middleware/s3api/controllers/object_lock.py +44 -0
- swift/common/middleware/s3api/controllers/s3_acl.py +2 -2
- swift/common/middleware/s3api/controllers/tagging.py +57 -0
- swift/common/middleware/s3api/controllers/versioning.py +36 -7
- swift/common/middleware/s3api/etree.py +22 -9
- swift/common/middleware/s3api/exception.py +0 -4
- swift/common/middleware/s3api/s3api.py +113 -41
- swift/common/middleware/s3api/s3request.py +384 -218
- swift/common/middleware/s3api/s3response.py +126 -23
- swift/common/middleware/s3api/s3token.py +16 -17
- swift/common/middleware/s3api/schema/delete.rng +1 -1
- swift/common/middleware/s3api/subresource.py +7 -10
- swift/common/middleware/s3api/utils.py +27 -10
- swift/common/middleware/slo.py +665 -358
- swift/common/middleware/staticweb.py +64 -37
- swift/common/middleware/symlink.py +51 -18
- swift/common/middleware/tempauth.py +76 -58
- swift/common/middleware/tempurl.py +191 -173
- swift/common/middleware/versioned_writes/__init__.py +51 -0
- swift/common/middleware/{versioned_writes.py → versioned_writes/legacy.py} +27 -26
- swift/common/middleware/versioned_writes/object_versioning.py +1482 -0
- swift/common/middleware/x_profile/exceptions.py +1 -4
- swift/common/middleware/x_profile/html_viewer.py +18 -19
- swift/common/middleware/x_profile/profile_model.py +1 -2
- swift/common/middleware/xprofile.py +10 -10
- swift-2.23.3.data/scripts/swift-container-server → swift/common/recon.py +13 -8
- swift/common/registry.py +147 -0
- swift/common/request_helpers.py +324 -57
- swift/common/ring/builder.py +67 -25
- swift/common/ring/composite_builder.py +1 -1
- swift/common/ring/ring.py +177 -51
- swift/common/ring/utils.py +1 -1
- swift/common/splice.py +10 -6
- swift/common/statsd_client.py +205 -0
- swift/common/storage_policy.py +49 -44
- swift/common/swob.py +86 -102
- swift/common/{utils.py → utils/__init__.py} +2163 -2772
- swift/common/utils/base.py +131 -0
- swift/common/utils/config.py +433 -0
- swift/common/utils/ipaddrs.py +256 -0
- swift/common/utils/libc.py +345 -0
- swift/common/utils/logs.py +859 -0
- swift/common/utils/timestamp.py +412 -0
- swift/common/wsgi.py +553 -535
- swift/container/auditor.py +14 -100
- swift/container/backend.py +490 -231
- swift/container/reconciler.py +126 -37
- swift/container/replicator.py +96 -22
- swift/container/server.py +358 -165
- swift/container/sharder.py +1540 -684
- swift/container/sync.py +94 -88
- swift/container/updater.py +53 -32
- swift/obj/auditor.py +153 -35
- swift/obj/diskfile.py +466 -217
- swift/obj/expirer.py +406 -124
- swift/obj/mem_diskfile.py +7 -4
- swift/obj/mem_server.py +1 -0
- swift/obj/reconstructor.py +523 -262
- swift/obj/replicator.py +249 -188
- swift/obj/server.py +207 -122
- swift/obj/ssync_receiver.py +145 -85
- swift/obj/ssync_sender.py +113 -54
- swift/obj/updater.py +652 -139
- swift/obj/watchers/__init__.py +0 -0
- swift/obj/watchers/dark_data.py +213 -0
- swift/proxy/controllers/account.py +11 -11
- swift/proxy/controllers/base.py +848 -604
- swift/proxy/controllers/container.py +433 -92
- swift/proxy/controllers/info.py +3 -2
- swift/proxy/controllers/obj.py +1000 -489
- swift/proxy/server.py +185 -112
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/AUTHORS +58 -11
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/METADATA +51 -56
- swift-2.35.0.dist-info/RECORD +201 -0
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/WHEEL +1 -1
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/entry_points.txt +43 -0
- swift-2.35.0.dist-info/pbr.json +1 -0
- swift/locale/de/LC_MESSAGES/swift.po +0 -1216
- swift/locale/en_GB/LC_MESSAGES/swift.po +0 -1207
- swift/locale/es/LC_MESSAGES/swift.po +0 -1085
- swift/locale/fr/LC_MESSAGES/swift.po +0 -909
- swift/locale/it/LC_MESSAGES/swift.po +0 -894
- swift/locale/ja/LC_MESSAGES/swift.po +0 -965
- swift/locale/ko_KR/LC_MESSAGES/swift.po +0 -964
- swift/locale/pt_BR/LC_MESSAGES/swift.po +0 -881
- swift/locale/ru/LC_MESSAGES/swift.po +0 -891
- swift/locale/tr_TR/LC_MESSAGES/swift.po +0 -832
- swift/locale/zh_CN/LC_MESSAGES/swift.po +0 -833
- swift/locale/zh_TW/LC_MESSAGES/swift.po +0 -838
- swift-2.23.3.data/scripts/swift-account-auditor +0 -23
- swift-2.23.3.data/scripts/swift-account-info +0 -51
- swift-2.23.3.data/scripts/swift-account-reaper +0 -23
- swift-2.23.3.data/scripts/swift-account-replicator +0 -34
- swift-2.23.3.data/scripts/swift-account-server +0 -23
- swift-2.23.3.data/scripts/swift-container-auditor +0 -23
- swift-2.23.3.data/scripts/swift-container-info +0 -55
- swift-2.23.3.data/scripts/swift-container-reconciler +0 -21
- swift-2.23.3.data/scripts/swift-container-replicator +0 -34
- swift-2.23.3.data/scripts/swift-container-sharder +0 -37
- swift-2.23.3.data/scripts/swift-container-sync +0 -23
- swift-2.23.3.data/scripts/swift-container-updater +0 -23
- swift-2.23.3.data/scripts/swift-dispersion-report +0 -24
- swift-2.23.3.data/scripts/swift-form-signature +0 -20
- swift-2.23.3.data/scripts/swift-init +0 -119
- swift-2.23.3.data/scripts/swift-object-auditor +0 -29
- swift-2.23.3.data/scripts/swift-object-expirer +0 -33
- swift-2.23.3.data/scripts/swift-object-info +0 -60
- swift-2.23.3.data/scripts/swift-object-reconstructor +0 -33
- swift-2.23.3.data/scripts/swift-object-relinker +0 -41
- swift-2.23.3.data/scripts/swift-object-replicator +0 -37
- swift-2.23.3.data/scripts/swift-object-server +0 -27
- swift-2.23.3.data/scripts/swift-object-updater +0 -23
- swift-2.23.3.data/scripts/swift-proxy-server +0 -23
- swift-2.23.3.data/scripts/swift-recon +0 -24
- swift-2.23.3.data/scripts/swift-ring-builder +0 -24
- swift-2.23.3.data/scripts/swift-ring-builder-analyzer +0 -22
- swift-2.23.3.data/scripts/swift-ring-composer +0 -22
- swift-2.23.3.dist-info/RECORD +0 -220
- swift-2.23.3.dist-info/pbr.json +0 -1
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/LICENSE +0 -0
- {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/top_level.txt +0 -0
swift/obj/reconstructor.py
CHANGED
@@ -12,42 +12,44 @@
 # implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
+import itertools
 import json
 import errno
+from optparse import OptionParser
 import os
 from os.path import join
 import random
 import time
 from collections import defaultdict
-import six
-import six.moves.cPickle as pickle
+import pickle  # nosec: B403
 import shutil

 from eventlet import (GreenPile, GreenPool, Timeout, sleep, tpool, spawn)
 from eventlet.support.greenlets import GreenletExit

-from swift import gettext_ as _
 from swift.common.utils import (
     whataremyips, unlink_older_than, compute_eta, get_logger,
-    dump_recon_cache, mkdirs, config_true_value,
-    GreenAsyncPile, Timestamp, remove_file,
+    dump_recon_cache, mkdirs, config_true_value, parse_options,
+    GreenAsyncPile, Timestamp, remove_file, node_to_string,
     load_recon_cache, parse_override_options, distribute_evenly,
-    remove_directory)
+    remove_directory, config_request_node_count_value,
+    non_negative_int, get_prefixed_logger)
 from swift.common.header_key_dict import HeaderKeyDict
 from swift.common.bufferedhttp import http_connect
-from swift.common.daemon import Daemon
+from swift.common.daemon import Daemon, run_daemon
+from swift.common.recon import RECON_OBJECT_FILE, DEFAULT_RECON_CACHE_PATH
 from swift.common.ring.utils import is_local_device
 from swift.obj.ssync_sender import Sender as ssync_sender
 from swift.common.http import HTTP_OK, HTTP_NOT_FOUND, \
     HTTP_INSUFFICIENT_STORAGE
 from swift.obj.diskfile import DiskFileRouter, get_data_dir, \
-    get_tmp_dir
+    get_tmp_dir, DEFAULT_RECLAIM_AGE
 from swift.common.storage_policy import POLICIES, EC_POLICY
 from swift.common.exceptions import ConnectionTimeout, DiskFileError, \
-    SuffixSyncError
+    SuffixSyncError, PartitionLockTimeout, DiskFileNotExist

 SYNC, REVERT = ('sync_only', 'sync_revert')
+UNKNOWN_RESPONSE_STATUS = 0  # used as response status for timeouts, exceptions


 def _get_partners(node_index, part_nodes):
@@ -81,17 +83,29 @@ def _full_path(node, part, relative_path, policy):
                    :class:`~swift.common.storage_policy.BaseStoragePolicy`
     :return: string representation of absolute path on node plus policy index
     """
-    if not isinstance(relative_path, six.text_type):
+    if not isinstance(relative_path, str):
         relative_path = relative_path.decode('utf8')
-    return '%(replication_ip)s:%(replication_port)s' \
-        '/%(device)s/%(part)s%(path)s ' \
-        'policy#%(policy)d' % {
-            'replication_ip': node['replication_ip'],
-            'replication_port': node['replication_port'],
-            'device': node['device'],
-            'part': part, 'path': relative_path,
-            'policy': policy,
-        }
+    return '%(node)s/%(part)s%(path)s policy#%(policy)d' % {
+        'node': node_to_string(node, replication=True),
+        'part': part, 'path': relative_path,
+        'policy': policy,
+    }
+
+
+class ResponseBucket(object):
+    """
+    Encapsulates fragment GET response data related to a single timestamp.
+    """
+    def __init__(self):
+        # count of all responses associated with this Bucket
+        self.num_responses = 0
+        # map {frag_index: response} for subset of responses that could be used
+        # to rebuild the missing fragment
+        self.useful_responses = {}
+        # set if a durable timestamp was seen in responses
+        self.durable = False
+        # etag of the first response associated with the Bucket
+        self.etag = None


 class RebuildingECDiskFileStream(object):
@@ -143,15 +157,15 @@ class ObjectReconstructor(Daemon):
     def __init__(self, conf, logger=None):
         """
         :param conf: configuration object obtained from ConfigParser
-        :param logger: logging object
+        :param logger: an instance of ``SwiftLogAdapter``.
         """
         self.conf = conf
-        self.logger = logger or get_logger(
-            conf, log_route='object-reconstructor')
+        self.logger = \
+            logger or get_logger(conf, log_route='object-reconstructor')
         self.devices_dir = conf.get('devices', '/srv/node')
         self.mount_check = config_true_value(conf.get('mount_check', 'true'))
         self.swift_dir = conf.get('swift_dir', '/etc/swift')
-        self.bind_ip = conf.get('bind_ip', '0.0.0.0')
+        self.ring_ip = conf.get('ring_ip', conf.get('bind_ip', '0.0.0.0'))
         self.servers_per_port = int(conf.get('servers_per_port', '0') or 0)
         self.port = None if self.servers_per_port else \
             int(conf.get('bind_port', 6200))
@@ -162,23 +176,30 @@ class ObjectReconstructor(Daemon):
         self.reconstructor_workers = int(conf.get('reconstructor_workers', 0))
         self.policies = [policy for policy in POLICIES
                          if policy.policy_type == EC_POLICY]
-        self.stats_interval = int(conf.get('stats_interval', '300'))
-        self.ring_check_interval = int(conf.get('ring_check_interval', 15))
+        self.stats_interval = float(conf.get('stats_interval', '300'))
+        self.ring_check_interval = float(conf.get('ring_check_interval', 15))
         self.next_check = time.time() + self.ring_check_interval
         self.partition_times = []
-        self.interval = int(conf.get('interval') or
-                            conf.get('run_pause') or 30)
-        if 'run_pause' in conf and 'interval' not in conf:
-        [... 5 removed lines (the old run_pause deprecation warning) not captured in this rendering ...]
+        self.interval = float(conf.get('interval') or
+                              conf.get('run_pause') or 30)
+        if 'run_pause' in conf:
+            if 'interval' in conf:
+                self.logger.warning(
+                    'Option object-reconstructor/run_pause is deprecated and '
+                    'object-reconstructor/interval is already configured. '
+                    'You can safely remove run_pause; it is now ignored and '
+                    'will be removed in a future version.')
+            else:
+                self.logger.warning(
+                    'Option object-reconstructor/run_pause is deprecated '
+                    'and will be removed in a future version. '
+                    'Update your configuration to use option '
+                    'object-reconstructor/interval.')
         self.http_timeout = int(conf.get('http_timeout', 60))
         self.lockup_timeout = int(conf.get('lockup_timeout', 1800))
         self.recon_cache_path = conf.get('recon_cache_path',
-                                         '/var/cache/swift')
-        self.rcache = os.path.join(self.recon_cache_path, "object.recon")
+                                         DEFAULT_RECON_CACHE_PATH)
+        self.rcache = os.path.join(self.recon_cache_path, RECON_OBJECT_FILE)
         self._next_rcache_update = time.time() + self.stats_interval
         # defaults subject to change after beta
         self.conn_timeout = float(conf.get('conn_timeout', 0.5))
@@ -208,8 +229,28 @@ class ObjectReconstructor(Daemon):
             'of handoffs_only.')
         self.rebuild_handoff_node_count = int(conf.get(
             'rebuild_handoff_node_count', 2))
+        self.quarantine_threshold = non_negative_int(
+            conf.get('quarantine_threshold', 0))
+        self.quarantine_age = int(
+            conf.get('quarantine_age',
+                     conf.get('reclaim_age', DEFAULT_RECLAIM_AGE)))
+        self.request_node_count = config_request_node_count_value(
+            conf.get('request_node_count', '2 * replicas'))
+        self.max_objects_per_revert = non_negative_int(
+            conf.get('max_objects_per_revert', 0))
+        # When upgrading from liberasurecode<=1.5.0, you may want to continue
+        # writing legacy CRCs until all nodes are upgraded and capabale of
+        # reading fragments with zlib CRCs.
+        # See https://bugs.launchpad.net/liberasurecode/+bug/1886088 for more
+        # information.
+        if 'write_legacy_ec_crc' in conf:
+            os.environ['LIBERASURECODE_WRITE_LEGACY_CRC'] = \
+                '1' if config_true_value(conf['write_legacy_ec_crc']) else '0'
+        # else, assume operators know what they're doing and leave env alone
+
         self._df_router = DiskFileRouter(conf, self.logger)
         self.all_local_devices = self.get_local_devices()
+        self.rings_mtime = None

     def get_worker_args(self, once=False, **kwargs):
         """
@@ -263,6 +304,11 @@ class ObjectReconstructor(Daemon):
         if now > self._next_rcache_update:
             self._next_rcache_update = now + self.stats_interval
             self.aggregate_recon_update()
+        rings_mtime = [os.path.getmtime(self.load_object_ring(
+            policy).serialized_path) for policy in self.policies]
+        if self.rings_mtime == rings_mtime:
+            return True
+        self.rings_mtime = rings_mtime
         return self.get_local_devices() == self.all_local_devices

     def aggregate_recon_update(self):
@@ -327,56 +373,169 @@ class ObjectReconstructor(Daemon):
             return False
         return True

-    def _get_response(self, node, part, path, headers, full_path):
+    def _get_response(self, node, policy, partition, path, headers):
         """
         Helper method for reconstruction that GETs a single EC fragment
         archive

         :param node: the node to GET from
-        :param part: the partition
+        :param policy: the job policy
+        :param partition: the partition
         :param path: path of the desired EC archive relative to partition dir
         :param headers: the headers to send
-        :param full_path: full path to desired EC archive
         :returns: response
         """
+        full_path = _full_path(node, partition, path, policy)
         resp = None
         try:
             with ConnectionTimeout(self.conn_timeout):
-                conn = http_connect(node['ip'], node['port'], node['device'],
-                                    part, 'GET', path, headers=headers)
+                conn = http_connect(
+                    node['replication_ip'], node['replication_port'],
+                    node['device'], partition, 'GET', path, headers=headers)
             with Timeout(self.node_timeout):
                 resp = conn.getresponse()
                 resp.full_path = full_path
-                if resp.status not in [HTTP_OK, HTTP_NOT_FOUND]:
-                    self.logger.warning(
-                        _("Invalid response %(resp)s from %(full_path)s"),
-                        {'resp': resp.status, 'full_path': full_path})
-                    resp = None
-                elif resp.status == HTTP_NOT_FOUND:
-                    resp = None
+                resp.node = node
         except (Exception, Timeout):
             self.logger.exception(
-                _("Trying to GET %(full_path)s"), {
+                "Trying to GET %(full_path)s", {
                     'full_path': full_path})
         return resp

-    def reconstruct_fa(self, job, node, datafile_metadata):
+    def _handle_fragment_response(self, node, policy, partition, fi_to_rebuild,
+                                  path, buckets, error_responses, resp):
         """
-        Reconstructs a fragment archive - this method is called from ssync
-        after a remote node responds that is missing this object - the local
-        diskfile is opened to provide metadata - but to reconstruct the
-        missing fragment archive we must connect to multiple object servers.
+        Place ok responses into a per-timestamp bucket. Append bad responses to
+        a list per-status-code in error_responses.

-        :param job: job from ssync
-        :param node: node to which we're rebuilding
-        :param datafile_metadata: the datafile metadata to attach to
-                                  the rebuilt fragment archive
-        :returns: a DiskFile like class for use by ssync
-        :raises DiskFileError: if the fragment archive cannot be reconstructed
+        :return: the per-timestamp bucket if the response is ok, otherwise
+            None.
         """
-        [... 3 removed lines not captured in this rendering ...]
+        if not resp:
+            error_responses[UNKNOWN_RESPONSE_STATUS].append(resp)
+            return None
+
+        if resp.status not in [HTTP_OK, HTTP_NOT_FOUND]:
+            self.logger.warning(
+                "Invalid response %(resp)s from %(full_path)s",
+                {'resp': resp.status, 'full_path': resp.full_path})
+        if resp.status != HTTP_OK:
+            error_responses[resp.status].append(resp)
+            return None
+
+        resp.headers = HeaderKeyDict(resp.getheaders())
+        frag_index = resp.headers.get('X-Object-Sysmeta-Ec-Frag-Index')
+        try:
+            resp_frag_index = int(frag_index)
+        except (TypeError, ValueError):
+            # The successful response should include valid X-Object-
+            # Sysmeta-Ec-Frag-Index but for safety, catching the case either
+            # missing X-Object-Sysmeta-Ec-Frag-Index or invalid frag index to
+            # reconstruct and dump warning log for that
+            self.logger.warning(
+                'Invalid resp from %s '
+                '(invalid X-Object-Sysmeta-Ec-Frag-Index: %r)',
+                resp.full_path, frag_index)
+            error_responses[UNKNOWN_RESPONSE_STATUS].append(resp)
+            return None
+
+        timestamp = resp.headers.get('X-Backend-Data-Timestamp',
+                                     resp.headers.get('X-Backend-Timestamp'))
+        if not timestamp:
+            self.logger.warning(
+                'Invalid resp from %s, frag index %s (missing '
+                'X-Backend-Data-Timestamp and X-Backend-Timestamp)',
+                resp.full_path, resp_frag_index)
+            error_responses[UNKNOWN_RESPONSE_STATUS].append(resp)
+            return None
+        timestamp = Timestamp(timestamp)
+
+        etag = resp.headers.get('X-Object-Sysmeta-Ec-Etag')
+        if not etag:
+            self.logger.warning(
+                'Invalid resp from %s, frag index %s (missing Etag)',
+                resp.full_path, resp_frag_index)
+            error_responses[UNKNOWN_RESPONSE_STATUS].append(resp)
+            return None
+
+        bucket = buckets[timestamp]
+        bucket.num_responses += 1
+        if bucket.etag is None:
+            bucket.etag = etag
+        elif bucket.etag != etag:
+            self.logger.error('Mixed Etag (%s, %s) for %s frag#%s',
+                              etag, bucket.etag,
+                              _full_path(node, partition, path, policy),
+                              fi_to_rebuild)
+            return None
+
+        durable_timestamp = resp.headers.get('X-Backend-Durable-Timestamp')
+        if durable_timestamp:
+            buckets[Timestamp(durable_timestamp)].durable = True
+
+        if resp_frag_index == fi_to_rebuild:
+            # TODO: With duplicated EC frags it's not unreasonable to find the
+            # very fragment we're trying to rebuild exists on another primary
+            # node. In this case we should stream it directly from the remote
+            # node to our target instead of rebuild. But instead we ignore it.
+            self.logger.debug(
+                'Found existing frag #%s at %s while rebuilding to %s',
+                fi_to_rebuild, resp.full_path,
+                _full_path(node, partition, path, policy))
+        elif resp_frag_index not in bucket.useful_responses:
+            bucket.useful_responses[resp_frag_index] = resp
+        # else: duplicate frag_index isn't useful for rebuilding
+
+        return bucket
+
+    def _is_quarantine_candidate(self, policy, buckets, error_responses, df):
+        # This condition is deliberately strict because it determines if
+        # more requests will be issued and ultimately if the fragment
+        # will be quarantined.
+        if list(error_responses.keys()) != [404]:
+            # only quarantine if all other responses are 404 so we are
+            # confident there are no other frags on queried nodes
+            return False
+
+        local_timestamp = Timestamp(df.get_datafile_metadata()['X-Timestamp'])
+        if list(buckets.keys()) != [local_timestamp]:
+            # don't quarantine if there's insufficient other timestamp
+            # frags, or no response for the local frag timestamp: we
+            # possibly could quarantine, but this unexpected case may be
+            # worth more investigation
+            return False
+
+        if time.time() - float(local_timestamp) <= self.quarantine_age:
+            # If the fragment has not yet passed reclaim age then it is
+            # likely that a tombstone will be reverted to this node, or
+            # neighbor frags will get reverted from handoffs to *other* nodes
+            # and we'll discover we *do* have enough to reconstruct. Don't
+            # quarantine it yet: better that it is cleaned up 'normally'.
+            return False
+
+        bucket = buckets[local_timestamp]
+        return (bucket.num_responses <= self.quarantine_threshold and
                bucket.num_responses < policy.ec_ndata and
                df._frag_index in bucket.useful_responses)
+
+    def _make_fragment_requests(self, job, node, df, buckets, error_responses):
+        """
+        Issue requests for fragments to the list of ``nodes`` and sort the
+        responses into per-timestamp ``buckets`` or per-status
+        ``error_responses``. If any bucket accumulates sufficient responses to
+        rebuild the missing fragment then return that bucket.
+
+        :param job: job from ssync_sender.
+        :param node: node to which we're rebuilding.
+        :param df: an instance of :class:`~swift.obj.diskfile.BaseDiskFile`.
+        :param buckets: dict of per-timestamp buckets for ok responses.
+        :param error_responses: dict of per-status lists of error responses.
+        :return: A per-timestamp with sufficient responses, or None if
+            there is no such bucket.
+        """
+        policy = job['policy']
+        partition = job['partition']
+        datafile_metadata = df.get_datafile_metadata()

         # the fragment index we need to reconstruct is the position index
         # of the node we're rebuilding to within the primary part list
@@ -385,126 +544,162 @@ class ObjectReconstructor(Daemon):
         # KISS send out connection requests to all nodes, see what sticks.
         # Use fragment preferences header to tell other nodes that we want
         # fragments at the same timestamp as our fragment, and that they don't
-        # need to be durable.
+        # need to be durable. Accumulate responses into per-timestamp buckets
+        # and if any buckets gets enough responses then use those responses to
+        # rebuild.
         headers = self.headers.copy()
-        headers['X-Backend-Storage-Policy-Index'] = int(job['policy'])
+        headers['X-Backend-Storage-Policy-Index'] = int(policy)
         headers['X-Backend-Replication'] = 'True'
-        [... 2 removed lines not captured in this rendering ...]
+        local_timestamp = Timestamp(datafile_metadata['X-Timestamp'])
+        frag_prefs = [{'timestamp': local_timestamp.normal, 'exclude': []}]
         headers['X-Backend-Fragment-Preferences'] = json.dumps(frag_prefs)
-        pile = GreenAsyncPile(len(part_nodes))
         path = datafile_metadata['name']
-        for _node in part_nodes:
-            full_get_path = _full_path(
-                _node, job['partition'], path, job['policy'])
-            pile.spawn(self._get_response, _node, job['partition'],
-                       path, headers, full_get_path)
-
-        buckets = defaultdict(dict)
-        durable_buckets = {}
-        etag_buckets = {}
-        error_resp_count = 0
-        for resp in pile:
-            if not resp:
-                error_resp_count += 1
-                continue
-            resp.headers = HeaderKeyDict(resp.getheaders())
-            frag_index = resp.headers.get('X-Object-Sysmeta-Ec-Frag-Index')
-            try:
-                resp_frag_index = int(frag_index)
-            except (TypeError, ValueError):
-                # The successful response should include valid X-Object-
-                # Sysmeta-Ec-Frag-Index but for safety, catching the case
-                # either missing X-Object-Sysmeta-Ec-Frag-Index or invalid
-                # frag index to reconstruct and dump warning log for that
-                self.logger.warning(
-                    'Invalid resp from %s '
-                    '(invalid X-Object-Sysmeta-Ec-Frag-Index: %r)',
-                    resp.full_path, frag_index)
-                continue

-        [... 21 removed lines not captured in this rendering ...]
+        ring = policy.object_ring
+        primary_nodes = ring.get_part_nodes(partition)
+        # primary_node_count is the maximum number of nodes to consume in a
+        # normal rebuild attempt when there is no quarantine candidate,
+        # including the node to which we are rebuilding
+        primary_node_count = len(primary_nodes)
+        # don't try and fetch a fragment from the node we're rebuilding to
+        filtered_primary_nodes = [n for n in primary_nodes
+                                  if n['id'] != node['id']]
+        # concurrency is the number of requests fired off in initial batch
+        concurrency = len(filtered_primary_nodes)
+        # max_node_count is the maximum number of nodes to consume when
+        # verifying a quarantine candidate and is at least primary_node_count
+        max_node_count = max(primary_node_count,
+                             self.request_node_count(primary_node_count))
+
+        pile = GreenAsyncPile(concurrency)
+        for primary_node in filtered_primary_nodes:
+            pile.spawn(self._get_response, primary_node, policy, partition,
+                       path, headers)
+
+        useful_bucket = None
+        for resp in pile:
+            bucket = self._handle_fragment_response(
+                node, policy, partition, fi_to_rebuild, path, buckets,
+                error_responses, resp)
+            if bucket and len(bucket.useful_responses) >= policy.ec_ndata:
+                useful_bucket = bucket
+                break

-        [... 3 removed lines not captured in this rendering ...]
+        # Once all rebuild nodes have responded, if we have a quarantine
+        # candidate, go beyond primary_node_count and on to handoffs. The
+        # first non-404 response will prevent quarantine, but the expected
+        # common case is all 404 responses so we use some concurrency to get an
+        # outcome faster at the risk of some unnecessary requests in the
+        # uncommon case.
+        if (not useful_bucket and
+                self._is_quarantine_candidate(
+                    policy, buckets, error_responses, df)):
+            node_count = primary_node_count
+            handoff_iter = itertools.islice(ring.get_more_nodes(partition),
+                                            max_node_count - node_count)
+            for handoff_node in itertools.islice(handoff_iter, concurrency):
+                node_count += 1
+                pile.spawn(self._get_response, handoff_node, policy, partition,
+                           path, headers)
+            for resp in pile:
+                bucket = self._handle_fragment_response(
+                    node, policy, partition, fi_to_rebuild, path, buckets,
+                    error_responses, resp)
+                if bucket and len(bucket.useful_responses) >= policy.ec_ndata:
+                    useful_bucket = bucket
+                    self.logger.debug(
+                        'Reconstructing frag from handoffs, node_count=%d'
+                        % node_count)
+                    break
+                elif self._is_quarantine_candidate(
+                        policy, buckets, error_responses, df):
+                    try:
+                        handoff_node = next(handoff_iter)
+                        node_count += 1
+                        pile.spawn(self._get_response, handoff_node, policy,
+                                   partition, path, headers)
+                    except StopIteration:
+                        pass
+                # else: this frag is no longer a quarantine candidate, so we
+                # could break right here and ignore any remaining responses,
+                # but given that we may have actually found another frag we'll
+                # optimistically wait for any remaining responses in case a
+                # useful bucket is assembled.
+
+        return useful_bucket
+
+    def reconstruct_fa(self, job, node, df):
+        """
+        Reconstructs a fragment archive - this method is called from ssync
+        after a remote node responds that is missing this object - the local
+        diskfile is opened to provide metadata - but to reconstruct the
+        missing fragment archive we must connect to multiple object servers.

-        [... 6 removed lines not captured in this rendering ...]
+        :param job: job from ssync_sender.
+        :param node: node to which we're rebuilding.
+        :param df: an instance of :class:`~swift.obj.diskfile.BaseDiskFile`.
+        :returns: a DiskFile like class for use by ssync.
+        :raises DiskFileQuarantined: if the fragment archive cannot be
+            reconstructed and has as a result been quarantined.
+        :raises DiskFileError: if the fragment archive cannot be reconstructed.
+        """
+        policy = job['policy']
+        partition = job['partition']
+        # the fragment index we need to reconstruct is the position index
+        # of the node we're rebuilding to within the primary part list
+        fi_to_rebuild = node['backend_index']
+        datafile_metadata = df.get_datafile_metadata()
+        if not df.validate_metadata():
+            raise df._quarantine(
+                df._data_file, "Invalid fragment #%s" % df._frag_index)
+        local_timestamp = Timestamp(datafile_metadata['X-Timestamp'])
+        path = datafile_metadata['name']

-        [... 2 removed lines not captured in this rendering ...]
-                    'Mixed Etag (%s, %s) for %s frag#%s',
-                    etag, etag_buckets[timestamp],
-                    _full_path(node, job['partition'],
-                               datafile_metadata['name'], job['policy']),
-                    fi_to_rebuild)
-                continue
+        buckets = defaultdict(ResponseBucket)  # map timestamp -> Bucket
+        error_responses = defaultdict(list)  # map status code -> response list

-        [... 40 removed lines not captured in this rendering ...]
+        # don't try and fetch a fragment from the node we're rebuilding to
+        useful_bucket = self._make_fragment_requests(
+            job, node, df, buckets, error_responses)
+
+        if useful_bucket:
+            frag_indexes = list(useful_bucket.useful_responses.keys())
+            self.logger.debug('Reconstruct frag #%s with frag indexes %s'
+                              % (fi_to_rebuild, frag_indexes))
+            responses = list(useful_bucket.useful_responses.values())
+            rebuilt_fragment_iter = self.make_rebuilt_fragment_iter(
+                responses[:policy.ec_ndata], path, policy, fi_to_rebuild)
+            return RebuildingECDiskFileStream(datafile_metadata, fi_to_rebuild,
+                                              rebuilt_fragment_iter)
+
+        full_path = _full_path(node, partition, path, policy)
+        for timestamp, bucket in sorted(buckets.items()):
+            self.logger.error(
+                'Unable to get enough responses (%s/%s from %s ok responses) '
+                'to reconstruct %s %s frag#%s with ETag %s and timestamp %s' %
+                (len(bucket.useful_responses), policy.ec_ndata,
+                 bucket.num_responses,
+                 'durable' if bucket.durable else 'non-durable',
+                 full_path, fi_to_rebuild, bucket.etag, timestamp.internal))
+
+        if error_responses:
+            durable = buckets[local_timestamp].durable
+            errors = ', '.join(
+                '%s x %s' % (len(responses),
+                             'unknown' if status == UNKNOWN_RESPONSE_STATUS
+                             else status)
+                for status, responses in sorted(error_responses.items()))
+            self.logger.error(
+                'Unable to get enough responses (%s error responses) '
+                'to reconstruct %s %s frag#%s' % (
+                    errors, 'durable' if durable else 'non-durable',
+                    full_path, fi_to_rebuild))
+
+        if self._is_quarantine_candidate(policy, buckets, error_responses, df):
+            raise df._quarantine(
+                df._data_file, "Solitary fragment #%s" % df._frag_index)
+
+        raise DiskFileError('Unable to reconstruct EC archive')

     def _reconstruct(self, policy, fragment_payload, frag_index):
         return policy.pyeclib_driver.reconstruct(fragment_payload,
@@ -539,8 +734,8 @@ class ObjectReconstructor(Daemon):
             fragment_payload = [fragment for fragment in pile]
         except (Exception, Timeout):
             self.logger.exception(
-                _("Error trying to rebuild %(path)s "
-                  "policy#%(policy)d frag#%(frag_index)s"),
+                "Error trying to rebuild %(path)s "
+                "policy#%(policy)d frag#%(frag_index)s",
                 {'path': path,
                  'policy': policy,
                  'frag_index': frag_index,
@@ -562,9 +757,9 @@ class ObjectReconstructor(Daemon):
         elapsed = (time.time() - self.start) or 0.000001
         rate = self.reconstruction_part_count / elapsed
         self.logger.info(
-            _("%(reconstructed)d/%(total)d (%(percentage).2f%%)"
-              " partitions reconstructed in %(time).2fs "
-              "(%(rate).2f/sec, %(remaining)s remaining)"),
+            "%(reconstructed)d/%(total)d (%(percentage).2f%%)"
+            " partitions reconstructed in %(time).2fs "
+            "(%(rate).2f/sec, %(remaining)s remaining)",
            {'reconstructed': self.reconstruction_part_count,
             'total': self.part_count,
             'percentage':
@@ -577,29 +772,31 @@ class ObjectReconstructor(Daemon):

         if self.suffix_count and self.partition_times:
             self.logger.info(
-                _("%(checked)d suffixes checked - "
-                  "%(hashed).2f%% hashed, %(synced).2f%% synced"),
+                "%(checked)d suffixes checked - "
+                "%(hashed).2f%% hashed, %(synced).2f%% synced",
                 {'checked': self.suffix_count,
                  'hashed': (self.suffix_hash * 100.0) / self.suffix_count,
                  'synced': (self.suffix_sync * 100.0) / self.suffix_count})
             self.partition_times.sort()
             self.logger.info(
-                _("Partition times: max %(max).4fs, "
-                  "min %(min).4fs, med %(med).4fs"),
+                "Partition times: max %(max).4fs, "
+                "min %(min).4fs, med %(med).4fs",
                 {'max': self.partition_times[-1],
                  'min': self.partition_times[0],
                  'med': self.partition_times[
                      len(self.partition_times) // 2]})
         else:
             self.logger.info(
-                _("Nothing reconstructed for %s seconds."),
+                "Nothing reconstructed for %s seconds.",
                 (time.time() - self.start))

     def _emplace_log_prefix(self, worker_index):
-        self.logger.set_prefix("[worker %d/%d pid=%s] " % (
-            worker_index + 1,  # use 1-based indexing for more readable logs
-            self.reconstructor_workers,
-            os.getpid()))
+        self.logger = get_prefixed_logger(
+            self.logger, "[worker %d/%d pid=%s] " % (
+                worker_index + 1,
+                # use 1-based indexing for more readable logs
+                self.reconstructor_workers,
+                os.getpid()))

     def kill_coros(self):
         """Utility function that kills all coroutines currently running."""
@@ -627,7 +824,7 @@ class ObjectReconstructor(Daemon):
         while True:
             sleep(self.lockup_timeout)
             if self.reconstruction_count == self.last_reconstruction_count:
-                self.logger.error(_("Lockup detected.. killing live coros."))
+                self.logger.error("Lockup detected.. killing live coros.")
                 self.kill_coros()
             self.last_reconstruction_count = self.reconstruction_count

@@ -664,22 +861,6 @@ class ObjectReconstructor(Daemon):
             suffixes.append(suffix)
         return suffixes

-    def rehash_remote(self, node, job, suffixes):
-        headers = self.headers.copy()
-        headers['X-Backend-Storage-Policy-Index'] = int(job['policy'])
-        try:
-            with Timeout(self.http_timeout):
-                conn = http_connect(
-                    node['replication_ip'], node['replication_port'],
-                    node['device'], job['partition'], 'REPLICATE',
-                    '/' + '-'.join(sorted(suffixes)),
-                    headers=headers)
-                conn.getresponse().read()
-        except (Exception, Timeout):
-            self.logger.exception(
-                _("Trying to sync suffixes with %s") % _full_path(
-                    node, job['partition'], '', job['policy']))
-
     def _iter_nodes_for_frag(self, policy, partition, node):
         """
         Generate a priority list of nodes that can sync to the given node.
@@ -688,7 +869,7 @@ class ObjectReconstructor(Daemon):
         handoffs.

         To avoid conflicts placing frags we'll skip through the handoffs and
-        only yield back those that are offset equal to
+        only yield back those that are offset equal to the given primary
         node index.

         Nodes returned from this iterator will have 'backend_index' set.
@@ -732,15 +913,17 @@ class ObjectReconstructor(Daemon):
             except StopIteration:
                 break
             attempts_remaining -= 1
+            conn = None
             try:
                 with Timeout(self.http_timeout):
-                    resp = http_connect(
+                    conn = http_connect(
                         node['replication_ip'], node['replication_port'],
                         node['device'], job['partition'], 'REPLICATE',
-                        '', headers=headers).getresponse()
+                        '', headers=headers)
+                    resp = conn.getresponse()
                 if resp.status == HTTP_INSUFFICIENT_STORAGE:
                     self.logger.error(
-                        _('%s responded as unmounted'),
+                        '%s responded as unmounted',
                         _full_path(node, job['partition'], '',
                                    job['policy']))
                     attempts_remaining += 1
@@ -748,10 +931,10 @@ class ObjectReconstructor(Daemon):
                     full_path = _full_path(node, job['partition'], '',
                                            job['policy'])
                     self.logger.error(
-                        _("Invalid response %(resp)s from %(full_path)s"),
+                        "Invalid response %(resp)s from %(full_path)s",
                         {'resp': resp.status, 'full_path': full_path})
                 else:
-                    remote_suffixes = pickle.loads(resp.read())
+                    remote_suffixes = pickle.loads(resp.read())  # nosec: B301
             except (Exception, Timeout):
                 # all exceptions are logged here so that our caller can
                 # safely catch our exception and continue to the next node
@@ -760,6 +943,9 @@ class ObjectReconstructor(Daemon):
                                       'from %r' % _full_path(
                                           node, job['partition'], '',
                                           job['policy']))
+            finally:
+                if conn:
+                    conn.close()
         if remote_suffixes is None:
             raise SuffixSyncError('Unable to get remote suffix hashes')

@@ -781,7 +967,7 @@ class ObjectReconstructor(Daemon):
         self.suffix_count += len(suffixes)
         return suffixes, node

-    def delete_reverted_objs(self, job, objects, frag_index):
+    def delete_reverted_objs(self, job, objects):
         """
         For EC we can potentially revert only some of a partition
         so we'll delete reverted objects here. Note that we delete
@@ -790,21 +976,45 @@ class ObjectReconstructor(Daemon):
         :param job: the job being processed
         :param objects: a dict of objects to be deleted, each entry maps
                         hash=>timestamp
-        :param frag_index: (int) the fragment index of data files to be deleted
         """
         df_mgr = self._df_router[job['policy']]
         suffixes_to_delete = set()
         for object_hash, timestamps in objects.items():
             try:
-                df = df_mgr.get_diskfile_from_hash(
+                df, filenames = df_mgr.get_diskfile_and_filenames_from_hash(
                     job['local_dev']['device'], job['partition'],
                     object_hash, job['policy'],
-                    frag_index=frag_index)
-                df.purge(timestamps['ts_data'], frag_index)
+                    frag_index=job['frag_index'])
+                # legacy durable data files look like modern nondurable data
+                # files; we therefore override nondurable_purge_delay when we
+                # know the data file is durable so that legacy durable data
+                # files get purged
+                nondurable_purge_delay = (0 if timestamps.get('durable')
+                                          else df_mgr.commit_window)
+                data_files = [
+                    f for f in filenames
+                    if f.endswith('.data')]
+                purgable_data_files = [
+                    f for f in data_files
+                    if f.startswith(timestamps['ts_data'].internal)]
+                if (job['primary_frag_index'] is None
+                        and len(purgable_data_files) == len(data_files) <= 1):
+                    # pure handoff node, and we're about to purge the last
+                    # .data file, so it's ok to remove any meta file that may
+                    # have been reverted
+                    meta_timestamp = timestamps.get('ts_meta')
+                else:
+                    meta_timestamp = None
+                df.purge(timestamps['ts_data'], job['frag_index'],
+                         nondurable_purge_delay, meta_timestamp)
+            except DiskFileNotExist:
+                # may have passed reclaim age since being reverted, or may have
+                # raced with another reconstructor process trying the same
+                pass
             except DiskFileError:
                 self.logger.exception(
                     'Unable to purge DiskFile (%r %r %r)',
-                    object_hash, timestamps['ts_data'], frag_index)
+                    object_hash, timestamps['ts_data'], job['frag_index'])
             suffixes_to_delete.add(object_hash[-3:])

         for suffix in suffixes_to_delete:
@@ -854,12 +1064,13 @@ class ObjectReconstructor(Daemon):
             if not suffixes:
                 continue

-            # ssync any out-of-sync suffixes with the remote node
+            # ssync any out-of-sync suffixes with the remote node; do not limit
+            # max_objects - we need to check them all because, unlike a revert
+            # job, we don't purge any objects so start with the same set each
+            # cycle
             success, _ = ssync_sender(
-                self, node, job, suffixes)()
-
-            if success:
-                self.rehash_remote(node, job, suffixes)
+                self, node, job, suffixes, include_non_durable=False,
+                max_objects=0)()
             # update stats for this attempt
             self.suffix_sync += len(suffixes)
             self.logger.update_stats('suffix.syncs', len(suffixes))
@@ -873,19 +1084,37 @@ class ObjectReconstructor(Daemon):
             'partition.delete.count.%s' % (job['local_dev']['device'],))
         syncd_with = 0
         reverted_objs = {}
-        [... 13 removed lines not captured in this rendering ...]
+        try:
+            df_mgr = self._df_router[job['policy']]
+            # Only object-server can take this lock if an incoming SSYNC is
+            # running on the same partition. Taking the lock here ensure we
+            # won't enter a race condition where both nodes try to
+            # cross-replicate the same partition and both delete it.
+            with df_mgr.partition_lock(job['device'], job['policy'],
+                                       job['partition'], name='replication',
+                                       timeout=0.2):
+                limited_by_max_objects = False
+                for node in job['sync_to']:
+                    node['backend_index'] = job['policy'].get_backend_index(
+                        node['index'])
+                    sender = ssync_sender(
+                        self, node, job, job['suffixes'],
+                        include_non_durable=True,
+                        max_objects=self.max_objects_per_revert)
+                    success, in_sync_objs = sender()
+                    limited_by_max_objects |= sender.limited_by_max_objects
+                    if success:
+                        syncd_with += 1
+                        reverted_objs.update(in_sync_objs)
+                if syncd_with >= len(job['sync_to']):
+                    self.delete_reverted_objs(job, reverted_objs)
+                if syncd_with < len(job['sync_to']) or limited_by_max_objects:
+                    self.handoffs_remaining += 1
+        except PartitionLockTimeout:
+            self.logger.info("Unable to lock handoff partition %d for revert "
+                             "on device %s policy %d",
+                             job['partition'], job['device'], job['policy'])
+            self.logger.increment('partition.lock-failure.count')
             self.handoffs_remaining += 1
         self.logger.timing_since('partition.delete.timing', begin)

@@ -947,7 +1176,8 @@ class ObjectReconstructor(Daemon):
                     data_fi_to_suffixes[fi].append(suffix)

         # helper to ensure consistent structure of jobs
-        def build_job(job_type, frag_index, suffixes, sync_to):
+        def build_job(job_type, frag_index, suffixes, sync_to,
+                      primary_frag_index):
             return {
                 'job_type': job_type,
                 'frag_index': frag_index,
@@ -960,28 +1190,33 @@ class ObjectReconstructor(Daemon):
                 'local_dev': local_dev,
                 # ssync likes to have it handy
                 'device': local_dev['device'],
+                # provide a hint to revert jobs that the node is a primary for
+                # one of the frag indexes
+                'primary_frag_index': primary_frag_index,
             }

         # aggregate jobs for all the fragment index in this part
         jobs = []

         # check the primary nodes - to see if the part belongs here
+        primary_frag_index = None
         part_nodes = policy.object_ring.get_part_nodes(partition)
         for node in part_nodes:
             if node['id'] == local_dev['id']:
                 # this partition belongs here, we'll need a sync job
-                frag_index = policy.get_backend_index(node['index'])
+                primary_frag_index = policy.get_backend_index(node['index'])
                 try:
-                    suffixes = data_fi_to_suffixes.pop(frag_index)
+                    suffixes = data_fi_to_suffixes.pop(primary_frag_index)
                 except KeyError:
                     # N.B. If this function ever returns an empty list of jobs
                     # the entire partition will be deleted.
                     suffixes = []
                 sync_job = build_job(
                     job_type=SYNC,
-                    frag_index=frag_index,
+                    frag_index=primary_frag_index,
                     suffixes=suffixes,
                     sync_to=_get_partners(node['index'], part_nodes),
+                    primary_frag_index=primary_frag_index
                 )
                 # ssync callback to rebuild missing fragment_archives
                 sync_job['sync_diskfile_builder'] = self.reconstruct_fa
@@ -1012,6 +1247,7 @@ class ObjectReconstructor(Daemon):
                 frag_index=fi,
                 suffixes=data_fi_to_suffixes[fi],
                 sync_to=nodes_sync_to,
+                primary_frag_index=primary_frag_index
             )
             jobs.append(revert_job)

@@ -1038,21 +1274,22 @@ class ObjectReconstructor(Daemon):
                 job_type=REVERT,
                 frag_index=None,
                 suffixes=non_data_fragment_suffixes,
-                sync_to=random.sample(part_nodes, nsample)
+                sync_to=random.sample(part_nodes, nsample),
+                primary_frag_index=primary_frag_index
             ))
         # return a list of jobs for this part
        return jobs

     def get_policy2devices(self):
-        ips = whataremyips(self.bind_ip)
+        ips = whataremyips(self.ring_ip)
         policy2devices = {}
         for policy in self.policies:
             self.load_object_ring(policy)
-            local_devices = list(six.moves.filter(
-                lambda dev: dev and is_local_device(
+            local_devices = [
+                dev for dev in policy.object_ring.devs
+                if dev and is_local_device(
                     ips, self.port,
-                    dev['replication_ip'], dev['replication_port']),
-                policy.object_ring.devs))
+                    dev['replication_ip'], dev['replication_port'])]
             policy2devices[policy] = local_devices
         return policy2devices

@@ -1087,7 +1324,7 @@ class ObjectReconstructor(Daemon):
                 policy.object_ring, 'next_part_power', None)
             if next_part_power is not None:
                 self.logger.warning(
-                    _("next_part_power set in policy '%s'. Skipping"),
+                    "next_part_power set in policy '%s'. Skipping",
                     policy.name)
                 continue

@@ -1099,7 +1336,7 @@ class ObjectReconstructor(Daemon):
             self.device_count += 1
             dev_path = df_mgr.get_dev_path(local_dev['device'])
             if not dev_path:
-                self.logger.warning(_('%s is not mounted'),
+                self.logger.warning('%s is not mounted',
                                     local_dev['device'])
                 continue
             data_dir = get_data_dir(policy)
@@ -1193,7 +1430,7 @@ class ObjectReconstructor(Daemon):
             shutil.rmtree(path, ignore_errors=True)
             remove_file(path)

-        self.logger.info(_("Removing partition: %s"), path)
+        self.logger.info("Removing partition: %s", path)
         tpool.execute(kill_it, path)

     def reconstruct(self, **kwargs):
@@ -1203,15 +1440,21 @@ class ObjectReconstructor(Daemon):

         stats = spawn(self.heartbeat)
         lockup_detector = spawn(self.detect_lockups)
+        changed_rings = set()

         try:
             self.run_pool = GreenPool(size=self.concurrency)
             for part_info in self.collect_parts(**kwargs):
                 sleep()  # Give spawns a cycle
+                if part_info['policy'] in changed_rings:
+                    continue
                 if not self.check_ring(part_info['policy'].object_ring):
-                    self.logger.info(_("Ring change detected. Aborting "
-                                       "current reconstruction pass."))
-                    return
+                    changed_rings.add(part_info['policy'])
+                    self.logger.info(
+                        "Ring change detected for policy %d (%s). Aborting "
+                        "current reconstruction pass for this policy.",
+                        part_info['policy'].idx, part_info['policy'].name)
+                    continue

                 self.reconstruction_part_count += 1
                 jobs = self.build_reconstruction_jobs(part_info)
@@ -1230,8 +1473,8 @@ class ObjectReconstructor(Daemon):
             with Timeout(self.lockup_timeout):
                 self.run_pool.waitall()
         except (Exception, Timeout):
-            self.logger.exception(_("Exception in top-level "
-                                    "reconstruction loop"))
+            self.logger.exception("Exception in top-level "
+                                  "reconstruction loop")
             self.kill_coros()
         finally:
             stats.kill()
@@ -1239,14 +1482,14 @@ class ObjectReconstructor(Daemon):
             self.stats_line()
         if self.handoffs_only:
             if self.handoffs_remaining > 0:
-                self.logger.info(_(
+                self.logger.info(
                     "Handoffs only mode still has handoffs remaining. "
-                    "Next pass will continue to revert handoffs."))
+                    "Next pass will continue to revert handoffs.")
             else:
-                self.logger.warning(_(
+                self.logger.warning(
                     "Handoffs only mode found no handoffs remaining. "
                     "You should disable handoffs_only once all nodes "
-                    "are reporting no handoffs remaining."))
+                    "are reporting no handoffs remaining.")

     def final_recon_dump(self, total, override_devices=None, **kwargs):
         """
@@ -1283,13 +1526,13 @@ class ObjectReconstructor(Daemon):
         if multiprocess_worker_index is not None:
             self._emplace_log_prefix(multiprocess_worker_index)
         start = time.time()
-        self.logger.info(_("Running object reconstructor in script mode."))
+        self.logger.info("Running object reconstructor in script mode.")
         override_opts = parse_override_options(once=True, **kwargs)
         self.reconstruct(override_devices=override_opts.devices,
                          override_partitions=override_opts.partitions)
         total = (time.time() - start) / 60
         self.logger.info(
-            _("Object reconstruction complete (once). (%.02f minutes)"), total)
+            "Object reconstruction complete (once). (%.02f minutes)", total)
         # Only dump stats if they would actually be meaningful -- i.e. we're
         # collecting per-disk stats and covering all partitions, or we're
         # covering all partitions, all disks.
@@ -1302,21 +1545,39 @@ class ObjectReconstructor(Daemon):
     def run_forever(self, multiprocess_worker_index=None, *args, **kwargs):
         if multiprocess_worker_index is not None:
             self._emplace_log_prefix(multiprocess_worker_index)
-        self.logger.info(_("Starting object reconstructor in daemon mode."))
+        self.logger.info("Starting object reconstructor in daemon mode.")
         # Run the reconstructor continually
         while True:
             start = time.time()
-            self.logger.info(_("Starting object reconstruction pass."))
+            self.logger.info("Starting object reconstruction pass.")
             override_opts = parse_override_options(**kwargs)
             # Run the reconstructor
             self.reconstruct(override_devices=override_opts.devices,
                              override_partitions=override_opts.partitions)
             total = (time.time() - start) / 60
             self.logger.info(
-                _("Object reconstruction complete. (%.02f minutes)"), total)
+                "Object reconstruction complete. (%.02f minutes)", total)
             self.final_recon_dump(
                 total, override_devices=override_opts.devices,
                 override_partitions=override_opts.partitions)
             self.logger.debug('reconstruction sleeping for %s seconds.',
                               self.interval)
             sleep(self.interval)
+
+
+def main():
+    parser = OptionParser("%prog CONFIG [options]")
+    parser.add_option('-d', '--devices',
+                      help='Reconstruct only given devices. '
                           'Comma-separated list. '
                           'Only has effect if --once is used.')
+    parser.add_option('-p', '--partitions',
+                      help='Reconstruct only given partitions. '
                           'Comma-separated list. '
                           'Only has effect if --once is used.')
+    conf_file, options = parse_options(parser=parser, once=True)
+    run_daemon(ObjectReconstructor, conf_file, **options)
+
+
+if __name__ == '__main__':
+    main()