swift-2.23.3-py3-none-any.whl → swift-2.35.0-py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (206)
  1. swift/__init__.py +29 -50
  2. swift/account/auditor.py +21 -118
  3. swift/account/backend.py +33 -28
  4. swift/account/reaper.py +37 -28
  5. swift/account/replicator.py +22 -0
  6. swift/account/server.py +60 -26
  7. swift/account/utils.py +28 -11
  8. swift-2.23.3.data/scripts/swift-account-audit → swift/cli/account_audit.py +23 -13
  9. swift-2.23.3.data/scripts/swift-config → swift/cli/config.py +2 -2
  10. swift/cli/container_deleter.py +5 -11
  11. swift-2.23.3.data/scripts/swift-dispersion-populate → swift/cli/dispersion_populate.py +8 -7
  12. swift/cli/dispersion_report.py +10 -9
  13. swift-2.23.3.data/scripts/swift-drive-audit → swift/cli/drive_audit.py +63 -21
  14. swift/cli/form_signature.py +3 -7
  15. swift-2.23.3.data/scripts/swift-get-nodes → swift/cli/get_nodes.py +8 -2
  16. swift/cli/info.py +154 -14
  17. swift/cli/manage_shard_ranges.py +705 -37
  18. swift-2.23.3.data/scripts/swift-oldies → swift/cli/oldies.py +25 -14
  19. swift-2.23.3.data/scripts/swift-orphans → swift/cli/orphans.py +7 -3
  20. swift/cli/recon.py +196 -67
  21. swift-2.23.3.data/scripts/swift-recon-cron → swift/cli/recon_cron.py +17 -20
  22. swift-2.23.3.data/scripts/swift-reconciler-enqueue → swift/cli/reconciler_enqueue.py +2 -3
  23. swift/cli/relinker.py +807 -126
  24. swift/cli/reload.py +135 -0
  25. swift/cli/ringbuilder.py +217 -20
  26. swift/cli/ringcomposer.py +0 -1
  27. swift/cli/shard-info.py +4 -3
  28. swift/common/base_storage_server.py +9 -20
  29. swift/common/bufferedhttp.py +48 -74
  30. swift/common/constraints.py +20 -15
  31. swift/common/container_sync_realms.py +9 -11
  32. swift/common/daemon.py +25 -8
  33. swift/common/db.py +195 -128
  34. swift/common/db_auditor.py +168 -0
  35. swift/common/db_replicator.py +95 -55
  36. swift/common/digest.py +141 -0
  37. swift/common/direct_client.py +144 -33
  38. swift/common/error_limiter.py +93 -0
  39. swift/common/exceptions.py +25 -1
  40. swift/common/header_key_dict.py +2 -9
  41. swift/common/http_protocol.py +373 -0
  42. swift/common/internal_client.py +129 -59
  43. swift/common/linkat.py +3 -4
  44. swift/common/manager.py +284 -67
  45. swift/common/memcached.py +390 -145
  46. swift/common/middleware/__init__.py +4 -0
  47. swift/common/middleware/account_quotas.py +211 -46
  48. swift/common/middleware/acl.py +3 -8
  49. swift/common/middleware/backend_ratelimit.py +230 -0
  50. swift/common/middleware/bulk.py +22 -34
  51. swift/common/middleware/catch_errors.py +1 -3
  52. swift/common/middleware/cname_lookup.py +6 -11
  53. swift/common/middleware/container_quotas.py +1 -1
  54. swift/common/middleware/container_sync.py +39 -17
  55. swift/common/middleware/copy.py +12 -0
  56. swift/common/middleware/crossdomain.py +22 -9
  57. swift/common/middleware/crypto/__init__.py +2 -1
  58. swift/common/middleware/crypto/crypto_utils.py +11 -15
  59. swift/common/middleware/crypto/decrypter.py +28 -11
  60. swift/common/middleware/crypto/encrypter.py +12 -17
  61. swift/common/middleware/crypto/keymaster.py +8 -15
  62. swift/common/middleware/crypto/kms_keymaster.py +2 -1
  63. swift/common/middleware/dlo.py +15 -11
  64. swift/common/middleware/domain_remap.py +5 -4
  65. swift/common/middleware/etag_quoter.py +128 -0
  66. swift/common/middleware/formpost.py +73 -70
  67. swift/common/middleware/gatekeeper.py +8 -1
  68. swift/common/middleware/keystoneauth.py +33 -3
  69. swift/common/middleware/list_endpoints.py +4 -4
  70. swift/common/middleware/listing_formats.py +85 -49
  71. swift/common/middleware/memcache.py +4 -95
  72. swift/common/middleware/name_check.py +3 -2
  73. swift/common/middleware/proxy_logging.py +160 -92
  74. swift/common/middleware/ratelimit.py +17 -10
  75. swift/common/middleware/read_only.py +6 -4
  76. swift/common/middleware/recon.py +59 -22
  77. swift/common/middleware/s3api/acl_handlers.py +25 -3
  78. swift/common/middleware/s3api/acl_utils.py +6 -1
  79. swift/common/middleware/s3api/controllers/__init__.py +6 -0
  80. swift/common/middleware/s3api/controllers/acl.py +3 -2
  81. swift/common/middleware/s3api/controllers/bucket.py +242 -137
  82. swift/common/middleware/s3api/controllers/logging.py +2 -2
  83. swift/common/middleware/s3api/controllers/multi_delete.py +43 -20
  84. swift/common/middleware/s3api/controllers/multi_upload.py +219 -133
  85. swift/common/middleware/s3api/controllers/obj.py +112 -8
  86. swift/common/middleware/s3api/controllers/object_lock.py +44 -0
  87. swift/common/middleware/s3api/controllers/s3_acl.py +2 -2
  88. swift/common/middleware/s3api/controllers/tagging.py +57 -0
  89. swift/common/middleware/s3api/controllers/versioning.py +36 -7
  90. swift/common/middleware/s3api/etree.py +22 -9
  91. swift/common/middleware/s3api/exception.py +0 -4
  92. swift/common/middleware/s3api/s3api.py +113 -41
  93. swift/common/middleware/s3api/s3request.py +384 -218
  94. swift/common/middleware/s3api/s3response.py +126 -23
  95. swift/common/middleware/s3api/s3token.py +16 -17
  96. swift/common/middleware/s3api/schema/delete.rng +1 -1
  97. swift/common/middleware/s3api/subresource.py +7 -10
  98. swift/common/middleware/s3api/utils.py +27 -10
  99. swift/common/middleware/slo.py +665 -358
  100. swift/common/middleware/staticweb.py +64 -37
  101. swift/common/middleware/symlink.py +51 -18
  102. swift/common/middleware/tempauth.py +76 -58
  103. swift/common/middleware/tempurl.py +191 -173
  104. swift/common/middleware/versioned_writes/__init__.py +51 -0
  105. swift/common/middleware/{versioned_writes.py → versioned_writes/legacy.py} +27 -26
  106. swift/common/middleware/versioned_writes/object_versioning.py +1482 -0
  107. swift/common/middleware/x_profile/exceptions.py +1 -4
  108. swift/common/middleware/x_profile/html_viewer.py +18 -19
  109. swift/common/middleware/x_profile/profile_model.py +1 -2
  110. swift/common/middleware/xprofile.py +10 -10
  111. swift-2.23.3.data/scripts/swift-container-server → swift/common/recon.py +13 -8
  112. swift/common/registry.py +147 -0
  113. swift/common/request_helpers.py +324 -57
  114. swift/common/ring/builder.py +67 -25
  115. swift/common/ring/composite_builder.py +1 -1
  116. swift/common/ring/ring.py +177 -51
  117. swift/common/ring/utils.py +1 -1
  118. swift/common/splice.py +10 -6
  119. swift/common/statsd_client.py +205 -0
  120. swift/common/storage_policy.py +49 -44
  121. swift/common/swob.py +86 -102
  122. swift/common/{utils.py → utils/__init__.py} +2163 -2772
  123. swift/common/utils/base.py +131 -0
  124. swift/common/utils/config.py +433 -0
  125. swift/common/utils/ipaddrs.py +256 -0
  126. swift/common/utils/libc.py +345 -0
  127. swift/common/utils/logs.py +859 -0
  128. swift/common/utils/timestamp.py +412 -0
  129. swift/common/wsgi.py +553 -535
  130. swift/container/auditor.py +14 -100
  131. swift/container/backend.py +490 -231
  132. swift/container/reconciler.py +126 -37
  133. swift/container/replicator.py +96 -22
  134. swift/container/server.py +358 -165
  135. swift/container/sharder.py +1540 -684
  136. swift/container/sync.py +94 -88
  137. swift/container/updater.py +53 -32
  138. swift/obj/auditor.py +153 -35
  139. swift/obj/diskfile.py +466 -217
  140. swift/obj/expirer.py +406 -124
  141. swift/obj/mem_diskfile.py +7 -4
  142. swift/obj/mem_server.py +1 -0
  143. swift/obj/reconstructor.py +523 -262
  144. swift/obj/replicator.py +249 -188
  145. swift/obj/server.py +207 -122
  146. swift/obj/ssync_receiver.py +145 -85
  147. swift/obj/ssync_sender.py +113 -54
  148. swift/obj/updater.py +652 -139
  149. swift/obj/watchers/__init__.py +0 -0
  150. swift/obj/watchers/dark_data.py +213 -0
  151. swift/proxy/controllers/account.py +11 -11
  152. swift/proxy/controllers/base.py +848 -604
  153. swift/proxy/controllers/container.py +433 -92
  154. swift/proxy/controllers/info.py +3 -2
  155. swift/proxy/controllers/obj.py +1000 -489
  156. swift/proxy/server.py +185 -112
  157. {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/AUTHORS +58 -11
  158. {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/METADATA +51 -56
  159. swift-2.35.0.dist-info/RECORD +201 -0
  160. {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/WHEEL +1 -1
  161. {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/entry_points.txt +43 -0
  162. swift-2.35.0.dist-info/pbr.json +1 -0
  163. swift/locale/de/LC_MESSAGES/swift.po +0 -1216
  164. swift/locale/en_GB/LC_MESSAGES/swift.po +0 -1207
  165. swift/locale/es/LC_MESSAGES/swift.po +0 -1085
  166. swift/locale/fr/LC_MESSAGES/swift.po +0 -909
  167. swift/locale/it/LC_MESSAGES/swift.po +0 -894
  168. swift/locale/ja/LC_MESSAGES/swift.po +0 -965
  169. swift/locale/ko_KR/LC_MESSAGES/swift.po +0 -964
  170. swift/locale/pt_BR/LC_MESSAGES/swift.po +0 -881
  171. swift/locale/ru/LC_MESSAGES/swift.po +0 -891
  172. swift/locale/tr_TR/LC_MESSAGES/swift.po +0 -832
  173. swift/locale/zh_CN/LC_MESSAGES/swift.po +0 -833
  174. swift/locale/zh_TW/LC_MESSAGES/swift.po +0 -838
  175. swift-2.23.3.data/scripts/swift-account-auditor +0 -23
  176. swift-2.23.3.data/scripts/swift-account-info +0 -51
  177. swift-2.23.3.data/scripts/swift-account-reaper +0 -23
  178. swift-2.23.3.data/scripts/swift-account-replicator +0 -34
  179. swift-2.23.3.data/scripts/swift-account-server +0 -23
  180. swift-2.23.3.data/scripts/swift-container-auditor +0 -23
  181. swift-2.23.3.data/scripts/swift-container-info +0 -55
  182. swift-2.23.3.data/scripts/swift-container-reconciler +0 -21
  183. swift-2.23.3.data/scripts/swift-container-replicator +0 -34
  184. swift-2.23.3.data/scripts/swift-container-sharder +0 -37
  185. swift-2.23.3.data/scripts/swift-container-sync +0 -23
  186. swift-2.23.3.data/scripts/swift-container-updater +0 -23
  187. swift-2.23.3.data/scripts/swift-dispersion-report +0 -24
  188. swift-2.23.3.data/scripts/swift-form-signature +0 -20
  189. swift-2.23.3.data/scripts/swift-init +0 -119
  190. swift-2.23.3.data/scripts/swift-object-auditor +0 -29
  191. swift-2.23.3.data/scripts/swift-object-expirer +0 -33
  192. swift-2.23.3.data/scripts/swift-object-info +0 -60
  193. swift-2.23.3.data/scripts/swift-object-reconstructor +0 -33
  194. swift-2.23.3.data/scripts/swift-object-relinker +0 -41
  195. swift-2.23.3.data/scripts/swift-object-replicator +0 -37
  196. swift-2.23.3.data/scripts/swift-object-server +0 -27
  197. swift-2.23.3.data/scripts/swift-object-updater +0 -23
  198. swift-2.23.3.data/scripts/swift-proxy-server +0 -23
  199. swift-2.23.3.data/scripts/swift-recon +0 -24
  200. swift-2.23.3.data/scripts/swift-ring-builder +0 -24
  201. swift-2.23.3.data/scripts/swift-ring-builder-analyzer +0 -22
  202. swift-2.23.3.data/scripts/swift-ring-composer +0 -22
  203. swift-2.23.3.dist-info/RECORD +0 -220
  204. swift-2.23.3.dist-info/pbr.json +0 -1
  205. {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/LICENSE +0 -0
  206. {swift-2.23.3.dist-info → swift-2.35.0.dist-info}/top_level.txt +0 -0
@@ -12,42 +12,44 @@
12
12
  # implied.
13
13
  # See the License for the specific language governing permissions and
14
14
  # limitations under the License.
15
-
15
+ import itertools
16
16
  import json
17
17
  import errno
18
+ from optparse import OptionParser
18
19
  import os
19
20
  from os.path import join
20
21
  import random
21
22
  import time
22
23
  from collections import defaultdict
23
- import six
24
- import six.moves.cPickle as pickle
24
+ import pickle # nosec: B403
25
25
  import shutil
26
26
 
27
27
  from eventlet import (GreenPile, GreenPool, Timeout, sleep, tpool, spawn)
28
28
  from eventlet.support.greenlets import GreenletExit
29
29
 
30
- from swift import gettext_ as _
31
30
  from swift.common.utils import (
32
31
  whataremyips, unlink_older_than, compute_eta, get_logger,
33
- dump_recon_cache, mkdirs, config_true_value,
34
- GreenAsyncPile, Timestamp, remove_file,
32
+ dump_recon_cache, mkdirs, config_true_value, parse_options,
33
+ GreenAsyncPile, Timestamp, remove_file, node_to_string,
35
34
  load_recon_cache, parse_override_options, distribute_evenly,
36
- PrefixLoggerAdapter, remove_directory)
35
+ remove_directory, config_request_node_count_value,
36
+ non_negative_int, get_prefixed_logger)
37
37
  from swift.common.header_key_dict import HeaderKeyDict
38
38
  from swift.common.bufferedhttp import http_connect
39
- from swift.common.daemon import Daemon
39
+ from swift.common.daemon import Daemon, run_daemon
40
+ from swift.common.recon import RECON_OBJECT_FILE, DEFAULT_RECON_CACHE_PATH
40
41
  from swift.common.ring.utils import is_local_device
41
42
  from swift.obj.ssync_sender import Sender as ssync_sender
42
43
  from swift.common.http import HTTP_OK, HTTP_NOT_FOUND, \
43
44
  HTTP_INSUFFICIENT_STORAGE
44
45
  from swift.obj.diskfile import DiskFileRouter, get_data_dir, \
45
- get_tmp_dir
46
+ get_tmp_dir, DEFAULT_RECLAIM_AGE
46
47
  from swift.common.storage_policy import POLICIES, EC_POLICY
47
48
  from swift.common.exceptions import ConnectionTimeout, DiskFileError, \
48
- SuffixSyncError
49
+ SuffixSyncError, PartitionLockTimeout, DiskFileNotExist
49
50
 
50
51
  SYNC, REVERT = ('sync_only', 'sync_revert')
52
+ UNKNOWN_RESPONSE_STATUS = 0 # used as response status for timeouts, exceptions
51
53
 
52
54
 
53
55
  def _get_partners(node_index, part_nodes):
@@ -81,17 +83,29 @@ def _full_path(node, part, relative_path, policy):
81
83
  :class:`~swift.common.storage_policy.BaseStoragePolicy`
82
84
  :return: string representation of absolute path on node plus policy index
83
85
  """
84
- if not isinstance(relative_path, six.text_type):
86
+ if not isinstance(relative_path, str):
85
87
  relative_path = relative_path.decode('utf8')
86
- return '%(replication_ip)s:%(replication_port)s' \
87
- '/%(device)s/%(part)s%(path)s ' \
88
- 'policy#%(policy)d' % {
89
- 'replication_ip': node['replication_ip'],
90
- 'replication_port': node['replication_port'],
91
- 'device': node['device'],
92
- 'part': part, 'path': relative_path,
93
- 'policy': policy,
94
- }
88
+ return '%(node)s/%(part)s%(path)s policy#%(policy)d' % {
89
+ 'node': node_to_string(node, replication=True),
90
+ 'part': part, 'path': relative_path,
91
+ 'policy': policy,
92
+ }
93
+
94
+
95
+ class ResponseBucket(object):
96
+ """
97
+ Encapsulates fragment GET response data related to a single timestamp.
98
+ """
99
+ def __init__(self):
100
+ # count of all responses associated with this Bucket
101
+ self.num_responses = 0
102
+ # map {frag_index: response} for subset of responses that could be used
103
+ # to rebuild the missing fragment
104
+ self.useful_responses = {}
105
+ # set if a durable timestamp was seen in responses
106
+ self.durable = False
107
+ # etag of the first response associated with the Bucket
108
+ self.etag = None
95
109
 
96
110
 
97
111
  class RebuildingECDiskFileStream(object):
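
The ResponseBucket class added above groups fragment GET responses by data timestamp, so that the later _make_fragment_requests changes can stop as soon as any one timestamp has accumulated policy.ec_ndata distinct usable fragments. A minimal standalone sketch of that grouping idea, using plain tuples in place of the real HTTP responses and a hypothetical bucket_responses() helper:

    import collections

    class ResponseBucket(object):
        """Simplified stand-in for the class added in this diff."""
        def __init__(self):
            self.num_responses = 0      # every response seen for this timestamp
            self.useful_responses = {}  # frag_index -> response usable for rebuild
            self.durable = False
            self.etag = None

    def bucket_responses(responses, fi_to_rebuild, ec_ndata):
        """responses: iterable of (timestamp, frag_index, etag, durable) tuples."""
        buckets = collections.defaultdict(ResponseBucket)
        for timestamp, frag_index, etag, durable in responses:
            bucket = buckets[timestamp]
            bucket.num_responses += 1
            bucket.etag = bucket.etag or etag
            bucket.durable = bucket.durable or durable
            if frag_index != fi_to_rebuild:
                # a fragment other than the one being rebuilt is useful
                bucket.useful_responses.setdefault(frag_index, (frag_index, etag))
            if len(bucket.useful_responses) >= ec_ndata:
                return bucket  # enough distinct fragments at a single timestamp
        return None
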
@@ -143,15 +157,15 @@ class ObjectReconstructor(Daemon):
143
157
  def __init__(self, conf, logger=None):
144
158
  """
145
159
  :param conf: configuration object obtained from ConfigParser
146
- :param logger: logging object
160
+ :param logger: an instance of ``SwiftLogAdapter``.
147
161
  """
148
162
  self.conf = conf
149
- self.logger = PrefixLoggerAdapter(
150
- logger or get_logger(conf, log_route='object-reconstructor'), {})
163
+ self.logger = \
164
+ logger or get_logger(conf, log_route='object-reconstructor')
151
165
  self.devices_dir = conf.get('devices', '/srv/node')
152
166
  self.mount_check = config_true_value(conf.get('mount_check', 'true'))
153
167
  self.swift_dir = conf.get('swift_dir', '/etc/swift')
154
- self.bind_ip = conf.get('bind_ip', '0.0.0.0')
168
+ self.ring_ip = conf.get('ring_ip', conf.get('bind_ip', '0.0.0.0'))
155
169
  self.servers_per_port = int(conf.get('servers_per_port', '0') or 0)
156
170
  self.port = None if self.servers_per_port else \
157
171
  int(conf.get('bind_port', 6200))
@@ -162,23 +176,30 @@ class ObjectReconstructor(Daemon):
162
176
  self.reconstructor_workers = int(conf.get('reconstructor_workers', 0))
163
177
  self.policies = [policy for policy in POLICIES
164
178
  if policy.policy_type == EC_POLICY]
165
- self.stats_interval = int(conf.get('stats_interval', '300'))
166
- self.ring_check_interval = int(conf.get('ring_check_interval', 15))
179
+ self.stats_interval = float(conf.get('stats_interval', '300'))
180
+ self.ring_check_interval = float(conf.get('ring_check_interval', 15))
167
181
  self.next_check = time.time() + self.ring_check_interval
168
182
  self.partition_times = []
169
- self.interval = int(conf.get('interval') or
170
- conf.get('run_pause') or 30)
171
- if 'run_pause' in conf and 'interval' not in conf:
172
- self.logger.warning('Option object-reconstructor/run_pause '
173
- 'is deprecated and will be removed in a '
174
- 'future version. Update your configuration'
175
- ' to use option object-reconstructor/'
176
- 'interval.')
183
+ self.interval = float(conf.get('interval') or
184
+ conf.get('run_pause') or 30)
185
+ if 'run_pause' in conf:
186
+ if 'interval' in conf:
187
+ self.logger.warning(
188
+ 'Option object-reconstructor/run_pause is deprecated and '
189
+ 'object-reconstructor/interval is already configured. '
190
+ 'You can safely remove run_pause; it is now ignored and '
191
+ 'will be removed in a future version.')
192
+ else:
193
+ self.logger.warning(
194
+ 'Option object-reconstructor/run_pause is deprecated '
195
+ 'and will be removed in a future version. '
196
+ 'Update your configuration to use option '
197
+ 'object-reconstructor/interval.')
177
198
  self.http_timeout = int(conf.get('http_timeout', 60))
178
199
  self.lockup_timeout = int(conf.get('lockup_timeout', 1800))
179
200
  self.recon_cache_path = conf.get('recon_cache_path',
180
- '/var/cache/swift')
181
- self.rcache = os.path.join(self.recon_cache_path, "object.recon")
201
+ DEFAULT_RECON_CACHE_PATH)
202
+ self.rcache = os.path.join(self.recon_cache_path, RECON_OBJECT_FILE)
182
203
  self._next_rcache_update = time.time() + self.stats_interval
183
204
  # defaults subject to change after beta
184
205
  self.conn_timeout = float(conf.get('conn_timeout', 0.5))
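
The interval handling above keeps backwards compatibility: interval wins when both options are set, run_pause is still honoured (with a deprecation warning) when it is the only one configured, and 30 seconds is the fallback. A toy restatement of that precedence, assuming conf is a plain dict of strings:

    def effective_interval(conf):
        # mirrors: float(conf.get('interval') or conf.get('run_pause') or 30)
        return float(conf.get('interval') or conf.get('run_pause') or 30)

    assert effective_interval({}) == 30.0
    assert effective_interval({'run_pause': '15'}) == 15.0
    assert effective_interval({'interval': '10', 'run_pause': '15'}) == 10.0
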
@@ -208,8 +229,28 @@ class ObjectReconstructor(Daemon):
208
229
  'of handoffs_only.')
209
230
  self.rebuild_handoff_node_count = int(conf.get(
210
231
  'rebuild_handoff_node_count', 2))
232
+ self.quarantine_threshold = non_negative_int(
233
+ conf.get('quarantine_threshold', 0))
234
+ self.quarantine_age = int(
235
+ conf.get('quarantine_age',
236
+ conf.get('reclaim_age', DEFAULT_RECLAIM_AGE)))
237
+ self.request_node_count = config_request_node_count_value(
238
+ conf.get('request_node_count', '2 * replicas'))
239
+ self.max_objects_per_revert = non_negative_int(
240
+ conf.get('max_objects_per_revert', 0))
241
+ # When upgrading from liberasurecode<=1.5.0, you may want to continue
242
+ # writing legacy CRCs until all nodes are upgraded and capable of
243
+ # reading fragments with zlib CRCs.
244
+ # See https://bugs.launchpad.net/liberasurecode/+bug/1886088 for more
245
+ # information.
246
+ if 'write_legacy_ec_crc' in conf:
247
+ os.environ['LIBERASURECODE_WRITE_LEGACY_CRC'] = \
248
+ '1' if config_true_value(conf['write_legacy_ec_crc']) else '0'
249
+ # else, assume operators know what they're doing and leave env alone
250
+
211
251
  self._df_router = DiskFileRouter(conf, self.logger)
212
252
  self.all_local_devices = self.get_local_devices()
253
+ self.rings_mtime = None
213
254
 
214
255
  def get_worker_args(self, once=False, **kwargs):
215
256
  """
@@ -263,6 +304,11 @@ class ObjectReconstructor(Daemon):
263
304
  if now > self._next_rcache_update:
264
305
  self._next_rcache_update = now + self.stats_interval
265
306
  self.aggregate_recon_update()
307
+ rings_mtime = [os.path.getmtime(self.load_object_ring(
308
+ policy).serialized_path) for policy in self.policies]
309
+ if self.rings_mtime == rings_mtime:
310
+ return True
311
+ self.rings_mtime = rings_mtime
266
312
  return self.get_local_devices() == self.all_local_devices
267
313
 
268
314
  def aggregate_recon_update(self):
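
The rings_mtime check added above records the mtime of each policy's serialized ring file and short-circuits when none of them changed, so the comparatively expensive local-device recomputation only happens after a ring has actually been rewritten on disk. The same idea as a standalone helper, with a hypothetical RingChangeDetector over a list of file paths:

    import os

    class RingChangeDetector(object):
        """Track mtimes of a fixed set of files and report whether any changed."""

        def __init__(self, paths):
            self.paths = list(paths)
            self.mtimes = None  # unknown until the first check

        def changed(self):
            mtimes = [os.path.getmtime(path) for path in self.paths]
            if self.mtimes == mtimes:
                return False
            self.mtimes = mtimes
            return True  # first call, or at least one file was rewritten
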
@@ -327,56 +373,169 @@ class ObjectReconstructor(Daemon):
327
373
  return False
328
374
  return True
329
375
 
330
- def _get_response(self, node, part, path, headers, full_path):
376
+ def _get_response(self, node, policy, partition, path, headers):
331
377
  """
332
378
  Helper method for reconstruction that GETs a single EC fragment
333
379
  archive
334
380
 
335
381
  :param node: the node to GET from
336
- :param part: the partition
382
+ :param policy: the job policy
383
+ :param partition: the partition
337
384
  :param path: path of the desired EC archive relative to partition dir
338
385
  :param headers: the headers to send
339
- :param full_path: full path to desired EC archive
340
386
  :returns: response
341
387
  """
388
+ full_path = _full_path(node, partition, path, policy)
342
389
  resp = None
343
390
  try:
344
391
  with ConnectionTimeout(self.conn_timeout):
345
- conn = http_connect(node['ip'], node['port'], node['device'],
346
- part, 'GET', path, headers=headers)
392
+ conn = http_connect(
393
+ node['replication_ip'], node['replication_port'],
394
+ node['device'], partition, 'GET', path, headers=headers)
347
395
  with Timeout(self.node_timeout):
348
396
  resp = conn.getresponse()
349
397
  resp.full_path = full_path
350
- if resp.status not in [HTTP_OK, HTTP_NOT_FOUND]:
351
- self.logger.warning(
352
- _("Invalid response %(resp)s from %(full_path)s"),
353
- {'resp': resp.status, 'full_path': full_path})
354
- resp = None
355
- elif resp.status == HTTP_NOT_FOUND:
356
- resp = None
398
+ resp.node = node
357
399
  except (Exception, Timeout):
358
400
  self.logger.exception(
359
- _("Trying to GET %(full_path)s"), {
401
+ "Trying to GET %(full_path)s", {
360
402
  'full_path': full_path})
361
403
  return resp
362
404
 
363
- def reconstruct_fa(self, job, node, datafile_metadata):
405
+ def _handle_fragment_response(self, node, policy, partition, fi_to_rebuild,
406
+ path, buckets, error_responses, resp):
364
407
  """
365
- Reconstructs a fragment archive - this method is called from ssync
366
- after a remote node responds that is missing this object - the local
367
- diskfile is opened to provide metadata - but to reconstruct the
368
- missing fragment archive we must connect to multiple object servers.
408
+ Place ok responses into a per-timestamp bucket. Append bad responses to
409
+ a list per-status-code in error_responses.
369
410
 
370
- :param job: job from ssync_sender
371
- :param node: node that we're rebuilding to
372
- :param datafile_metadata: the datafile metadata to attach to
373
- the rebuilt fragment archive
374
- :returns: a DiskFile like class for use by ssync
375
- :raises DiskFileError: if the fragment archive cannot be reconstructed
411
+ :return: the per-timestamp bucket if the response is ok, otherwise
412
+ None.
376
413
  """
377
- # don't try and fetch a fragment from the node we're rebuilding to
378
- part_nodes = [n for n in job['policy'].object_ring.get_part_nodes(
379
- job['partition']) if n['id'] != node['id']]
414
+ if not resp:
415
+ error_responses[UNKNOWN_RESPONSE_STATUS].append(resp)
416
+ return None
417
+
418
+ if resp.status not in [HTTP_OK, HTTP_NOT_FOUND]:
419
+ self.logger.warning(
420
+ "Invalid response %(resp)s from %(full_path)s",
421
+ {'resp': resp.status, 'full_path': resp.full_path})
422
+ if resp.status != HTTP_OK:
423
+ error_responses[resp.status].append(resp)
424
+ return None
425
+
426
+ resp.headers = HeaderKeyDict(resp.getheaders())
427
+ frag_index = resp.headers.get('X-Object-Sysmeta-Ec-Frag-Index')
428
+ try:
429
+ resp_frag_index = int(frag_index)
430
+ except (TypeError, ValueError):
431
+ # The successful response should include valid X-Object-
432
+ # Sysmeta-Ec-Frag-Index but for safety, catching the case either
433
+ # missing X-Object-Sysmeta-Ec-Frag-Index or invalid frag index to
434
+ # reconstruct and dump warning log for that
435
+ self.logger.warning(
436
+ 'Invalid resp from %s '
437
+ '(invalid X-Object-Sysmeta-Ec-Frag-Index: %r)',
438
+ resp.full_path, frag_index)
439
+ error_responses[UNKNOWN_RESPONSE_STATUS].append(resp)
440
+ return None
441
+
442
+ timestamp = resp.headers.get('X-Backend-Data-Timestamp',
443
+ resp.headers.get('X-Backend-Timestamp'))
444
+ if not timestamp:
445
+ self.logger.warning(
446
+ 'Invalid resp from %s, frag index %s (missing '
447
+ 'X-Backend-Data-Timestamp and X-Backend-Timestamp)',
448
+ resp.full_path, resp_frag_index)
449
+ error_responses[UNKNOWN_RESPONSE_STATUS].append(resp)
450
+ return None
451
+ timestamp = Timestamp(timestamp)
452
+
453
+ etag = resp.headers.get('X-Object-Sysmeta-Ec-Etag')
454
+ if not etag:
455
+ self.logger.warning(
456
+ 'Invalid resp from %s, frag index %s (missing Etag)',
457
+ resp.full_path, resp_frag_index)
458
+ error_responses[UNKNOWN_RESPONSE_STATUS].append(resp)
459
+ return None
460
+
461
+ bucket = buckets[timestamp]
462
+ bucket.num_responses += 1
463
+ if bucket.etag is None:
464
+ bucket.etag = etag
465
+ elif bucket.etag != etag:
466
+ self.logger.error('Mixed Etag (%s, %s) for %s frag#%s',
467
+ etag, bucket.etag,
468
+ _full_path(node, partition, path, policy),
469
+ fi_to_rebuild)
470
+ return None
471
+
472
+ durable_timestamp = resp.headers.get('X-Backend-Durable-Timestamp')
473
+ if durable_timestamp:
474
+ buckets[Timestamp(durable_timestamp)].durable = True
475
+
476
+ if resp_frag_index == fi_to_rebuild:
477
+ # TODO: With duplicated EC frags it's not unreasonable to find the
478
+ # very fragment we're trying to rebuild exists on another primary
479
+ # node. In this case we should stream it directly from the remote
480
+ # node to our target instead of rebuild. But instead we ignore it.
481
+ self.logger.debug(
482
+ 'Found existing frag #%s at %s while rebuilding to %s',
483
+ fi_to_rebuild, resp.full_path,
484
+ _full_path(node, partition, path, policy))
485
+ elif resp_frag_index not in bucket.useful_responses:
486
+ bucket.useful_responses[resp_frag_index] = resp
487
+ # else: duplicate frag_index isn't useful for rebuilding
488
+
489
+ return bucket
490
+
491
+ def _is_quarantine_candidate(self, policy, buckets, error_responses, df):
492
+ # This condition is deliberately strict because it determines if
493
+ # more requests will be issued and ultimately if the fragment
494
+ # will be quarantined.
495
+ if list(error_responses.keys()) != [404]:
496
+ # only quarantine if all other responses are 404 so we are
497
+ # confident there are no other frags on queried nodes
498
+ return False
499
+
500
+ local_timestamp = Timestamp(df.get_datafile_metadata()['X-Timestamp'])
501
+ if list(buckets.keys()) != [local_timestamp]:
502
+ # don't quarantine if there's insufficient other timestamp
503
+ # frags, or no response for the local frag timestamp: we
504
+ # possibly could quarantine, but this unexpected case may be
505
+ # worth more investigation
506
+ return False
507
+
508
+ if time.time() - float(local_timestamp) <= self.quarantine_age:
509
+ # If the fragment has not yet passed reclaim age then it is
510
+ # likely that a tombstone will be reverted to this node, or
511
+ # neighbor frags will get reverted from handoffs to *other* nodes
512
+ # and we'll discover we *do* have enough to reconstruct. Don't
513
+ # quarantine it yet: better that it is cleaned up 'normally'.
514
+ return False
515
+
516
+ bucket = buckets[local_timestamp]
517
+ return (bucket.num_responses <= self.quarantine_threshold and
518
+ bucket.num_responses < policy.ec_ndata and
519
+ df._frag_index in bucket.useful_responses)
520
+
521
+ def _make_fragment_requests(self, job, node, df, buckets, error_responses):
522
+ """
523
+ Issue requests for fragments to the list of ``nodes`` and sort the
524
+ responses into per-timestamp ``buckets`` or per-status
525
+ ``error_responses``. If any bucket accumulates sufficient responses to
526
+ rebuild the missing fragment then return that bucket.
527
+
528
+ :param job: job from ssync_sender.
529
+ :param node: node to which we're rebuilding.
530
+ :param df: an instance of :class:`~swift.obj.diskfile.BaseDiskFile`.
531
+ :param buckets: dict of per-timestamp buckets for ok responses.
532
+ :param error_responses: dict of per-status lists of error responses.
533
+ :return: A per-timestamp bucket with sufficient responses, or None if
534
+ there is no such bucket.
535
+ """
536
+ policy = job['policy']
537
+ partition = job['partition']
538
+ datafile_metadata = df.get_datafile_metadata()
380
539
 
381
540
  # the fragment index we need to reconstruct is the position index
382
541
  # of the node we're rebuilding to within the primary part list
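
_is_quarantine_candidate above is deliberately strict: a fragment only becomes a quarantine candidate when every error response was a 404, the only timestamp seen among the ok responses is the local fragment's own, that timestamp is older than quarantine_age, the responses number at most quarantine_threshold and fewer than ec_ndata, and the local fragment's frag index is among them. A condensed, self-contained restatement of that decision, with plain dicts and numbers standing in for the real response and diskfile objects:

    import time

    def is_quarantine_candidate(error_statuses, buckets, local_timestamp,
                                local_frag_index, ec_ndata,
                                quarantine_threshold, quarantine_age,
                                now=None):
        # error_statuses: status codes of failed responses
        # buckets: {timestamp: {'num_responses': int, 'useful_frags': set()}}
        now = time.time() if now is None else now
        if set(error_statuses) != {404}:
            return False   # a node gave a real error; don't trust a quarantine
        if list(buckets) != [local_timestamp]:
            return False   # other timestamps in play; worth investigating first
        if now - local_timestamp <= quarantine_age:
            return False   # young enough that normal recovery may still fix it
        bucket = buckets[local_timestamp]
        return (bucket['num_responses'] <= quarantine_threshold and
                bucket['num_responses'] < ec_ndata and
                local_frag_index in bucket['useful_frags'])
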
@@ -385,126 +544,162 @@ class ObjectReconstructor(Daemon):
385
544
  # KISS send out connection requests to all nodes, see what sticks.
386
545
  # Use fragment preferences header to tell other nodes that we want
387
546
  # fragments at the same timestamp as our fragment, and that they don't
388
- # need to be durable.
547
+ # need to be durable. Accumulate responses into per-timestamp buckets
548
+ # and if any buckets gets enough responses then use those responses to
549
+ # rebuild.
389
550
  headers = self.headers.copy()
390
- headers['X-Backend-Storage-Policy-Index'] = int(job['policy'])
551
+ headers['X-Backend-Storage-Policy-Index'] = int(policy)
391
552
  headers['X-Backend-Replication'] = 'True'
392
- frag_prefs = [{'timestamp': datafile_metadata['X-Timestamp'],
393
- 'exclude': []}]
553
+ local_timestamp = Timestamp(datafile_metadata['X-Timestamp'])
554
+ frag_prefs = [{'timestamp': local_timestamp.normal, 'exclude': []}]
394
555
  headers['X-Backend-Fragment-Preferences'] = json.dumps(frag_prefs)
395
- pile = GreenAsyncPile(len(part_nodes))
396
556
  path = datafile_metadata['name']
397
- for _node in part_nodes:
398
- full_get_path = _full_path(
399
- _node, job['partition'], path, job['policy'])
400
- pile.spawn(self._get_response, _node, job['partition'],
401
- path, headers, full_get_path)
402
-
403
- buckets = defaultdict(dict)
404
- durable_buckets = {}
405
- etag_buckets = {}
406
- error_resp_count = 0
407
- for resp in pile:
408
- if not resp:
409
- error_resp_count += 1
410
- continue
411
- resp.headers = HeaderKeyDict(resp.getheaders())
412
- frag_index = resp.headers.get('X-Object-Sysmeta-Ec-Frag-Index')
413
- try:
414
- resp_frag_index = int(frag_index)
415
- except (TypeError, ValueError):
416
- # The successful response should include valid X-Object-
417
- # Sysmeta-Ec-Frag-Index but for safety, catching the case
418
- # either missing X-Object-Sysmeta-Ec-Frag-Index or invalid
419
- # frag index to reconstruct and dump warning log for that
420
- self.logger.warning(
421
- 'Invalid resp from %s '
422
- '(invalid X-Object-Sysmeta-Ec-Frag-Index: %r)',
423
- resp.full_path, frag_index)
424
- continue
425
557
 
426
- if fi_to_rebuild == resp_frag_index:
427
- # TODO: With duplicated EC frags it's not unreasonable to find
428
- # the very fragment we're trying to rebuild exists on another
429
- # primary node. In this case we should stream it directly from
430
- # the remote node to our target instead of rebuild. But
431
- # instead we ignore it.
432
- self.logger.debug(
433
- 'Found existing frag #%s at %s while rebuilding to %s',
434
- fi_to_rebuild, resp.full_path,
435
- _full_path(
436
- node, job['partition'], datafile_metadata['name'],
437
- job['policy']))
438
- continue
439
-
440
- timestamp = resp.headers.get('X-Backend-Timestamp')
441
- if not timestamp:
442
- self.logger.warning('Invalid resp from %s, frag index %s '
443
- '(missing X-Backend-Timestamp)',
444
- resp.full_path, resp_frag_index)
445
- continue
446
- timestamp = Timestamp(timestamp)
558
+ ring = policy.object_ring
559
+ primary_nodes = ring.get_part_nodes(partition)
560
+ # primary_node_count is the maximum number of nodes to consume in a
561
+ # normal rebuild attempt when there is no quarantine candidate,
562
+ # including the node to which we are rebuilding
563
+ primary_node_count = len(primary_nodes)
564
+ # don't try and fetch a fragment from the node we're rebuilding to
565
+ filtered_primary_nodes = [n for n in primary_nodes
566
+ if n['id'] != node['id']]
567
+ # concurrency is the number of requests fired off in initial batch
568
+ concurrency = len(filtered_primary_nodes)
569
+ # max_node_count is the maximum number of nodes to consume when
570
+ # verifying a quarantine candidate and is at least primary_node_count
571
+ max_node_count = max(primary_node_count,
572
+ self.request_node_count(primary_node_count))
573
+
574
+ pile = GreenAsyncPile(concurrency)
575
+ for primary_node in filtered_primary_nodes:
576
+ pile.spawn(self._get_response, primary_node, policy, partition,
577
+ path, headers)
578
+
579
+ useful_bucket = None
580
+ for resp in pile:
581
+ bucket = self._handle_fragment_response(
582
+ node, policy, partition, fi_to_rebuild, path, buckets,
583
+ error_responses, resp)
584
+ if bucket and len(bucket.useful_responses) >= policy.ec_ndata:
585
+ useful_bucket = bucket
586
+ break
447
587
 
448
- durable = resp.headers.get('X-Backend-Durable-Timestamp')
449
- if durable:
450
- durable_buckets[Timestamp(durable)] = True
588
+ # Once all rebuild nodes have responded, if we have a quarantine
589
+ # candidate, go beyond primary_node_count and on to handoffs. The
590
+ # first non-404 response will prevent quarantine, but the expected
591
+ # common case is all 404 responses so we use some concurrency to get an
592
+ # outcome faster at the risk of some unnecessary requests in the
593
+ # uncommon case.
594
+ if (not useful_bucket and
595
+ self._is_quarantine_candidate(
596
+ policy, buckets, error_responses, df)):
597
+ node_count = primary_node_count
598
+ handoff_iter = itertools.islice(ring.get_more_nodes(partition),
599
+ max_node_count - node_count)
600
+ for handoff_node in itertools.islice(handoff_iter, concurrency):
601
+ node_count += 1
602
+ pile.spawn(self._get_response, handoff_node, policy, partition,
603
+ path, headers)
604
+ for resp in pile:
605
+ bucket = self._handle_fragment_response(
606
+ node, policy, partition, fi_to_rebuild, path, buckets,
607
+ error_responses, resp)
608
+ if bucket and len(bucket.useful_responses) >= policy.ec_ndata:
609
+ useful_bucket = bucket
610
+ self.logger.debug(
611
+ 'Reconstructing frag from handoffs, node_count=%d'
612
+ % node_count)
613
+ break
614
+ elif self._is_quarantine_candidate(
615
+ policy, buckets, error_responses, df):
616
+ try:
617
+ handoff_node = next(handoff_iter)
618
+ node_count += 1
619
+ pile.spawn(self._get_response, handoff_node, policy,
620
+ partition, path, headers)
621
+ except StopIteration:
622
+ pass
623
+ # else: this frag is no longer a quarantine candidate, so we
624
+ # could break right here and ignore any remaining responses,
625
+ # but given that we may have actually found another frag we'll
626
+ # optimistically wait for any remaining responses in case a
627
+ # useful bucket is assembled.
628
+
629
+ return useful_bucket
630
+
631
+ def reconstruct_fa(self, job, node, df):
632
+ """
633
+ Reconstructs a fragment archive - this method is called from ssync
634
+ after a remote node responds that is missing this object - the local
635
+ diskfile is opened to provide metadata - but to reconstruct the
636
+ missing fragment archive we must connect to multiple object servers.
451
637
 
452
- etag = resp.headers.get('X-Object-Sysmeta-Ec-Etag')
453
- if not etag:
454
- self.logger.warning('Invalid resp from %s, frag index %s '
455
- '(missing Etag)',
456
- resp.full_path, resp_frag_index)
457
- continue
638
+ :param job: job from ssync_sender.
639
+ :param node: node to which we're rebuilding.
640
+ :param df: an instance of :class:`~swift.obj.diskfile.BaseDiskFile`.
641
+ :returns: a DiskFile like class for use by ssync.
642
+ :raises DiskFileQuarantined: if the fragment archive cannot be
643
+ reconstructed and has as a result been quarantined.
644
+ :raises DiskFileError: if the fragment archive cannot be reconstructed.
645
+ """
646
+ policy = job['policy']
647
+ partition = job['partition']
648
+ # the fragment index we need to reconstruct is the position index
649
+ # of the node we're rebuilding to within the primary part list
650
+ fi_to_rebuild = node['backend_index']
651
+ datafile_metadata = df.get_datafile_metadata()
652
+ if not df.validate_metadata():
653
+ raise df._quarantine(
654
+ df._data_file, "Invalid fragment #%s" % df._frag_index)
655
+ local_timestamp = Timestamp(datafile_metadata['X-Timestamp'])
656
+ path = datafile_metadata['name']
458
657
 
459
- if etag != etag_buckets.setdefault(timestamp, etag):
460
- self.logger.error(
461
- 'Mixed Etag (%s, %s) for %s frag#%s',
462
- etag, etag_buckets[timestamp],
463
- _full_path(node, job['partition'],
464
- datafile_metadata['name'], job['policy']),
465
- fi_to_rebuild)
466
- continue
658
+ buckets = defaultdict(ResponseBucket) # map timestamp -> Bucket
659
+ error_responses = defaultdict(list) # map status code -> response list
467
660
 
468
- if resp_frag_index not in buckets[timestamp]:
469
- buckets[timestamp][resp_frag_index] = resp
470
- if len(buckets[timestamp]) >= job['policy'].ec_ndata:
471
- responses = list(buckets[timestamp].values())
472
- self.logger.debug(
473
- 'Reconstruct frag #%s with frag indexes %s'
474
- % (fi_to_rebuild, list(buckets[timestamp])))
475
- break
476
- else:
477
- path = _full_path(node, job['partition'],
478
- datafile_metadata['name'],
479
- job['policy'])
480
-
481
- for timestamp, resp in sorted(buckets.items()):
482
- etag = etag_buckets[timestamp]
483
- durable = durable_buckets.get(timestamp)
484
- self.logger.error(
485
- 'Unable to get enough responses (%s/%s) to reconstruct '
486
- '%s %s frag#%s with ETag %s and timestamp %s' % (
487
- len(resp), job['policy'].ec_ndata,
488
- 'durable' if durable else 'non-durable',
489
- path, fi_to_rebuild, etag, timestamp.internal))
490
-
491
- if error_resp_count:
492
- durable = durable_buckets.get(Timestamp(
493
- datafile_metadata['X-Timestamp']))
494
- self.logger.error(
495
- 'Unable to get enough responses (%s error responses) '
496
- 'to reconstruct %s %s frag#%s' % (
497
- error_resp_count,
498
- 'durable' if durable else 'non-durable',
499
- path, fi_to_rebuild))
500
-
501
- raise DiskFileError('Unable to reconstruct EC archive')
502
-
503
- rebuilt_fragment_iter = self.make_rebuilt_fragment_iter(
504
- responses[:job['policy'].ec_ndata], path, job['policy'],
505
- fi_to_rebuild)
506
- return RebuildingECDiskFileStream(datafile_metadata, fi_to_rebuild,
507
- rebuilt_fragment_iter)
661
+ # don't try and fetch a fragment from the node we're rebuilding to
662
+ useful_bucket = self._make_fragment_requests(
663
+ job, node, df, buckets, error_responses)
664
+
665
+ if useful_bucket:
666
+ frag_indexes = list(useful_bucket.useful_responses.keys())
667
+ self.logger.debug('Reconstruct frag #%s with frag indexes %s'
668
+ % (fi_to_rebuild, frag_indexes))
669
+ responses = list(useful_bucket.useful_responses.values())
670
+ rebuilt_fragment_iter = self.make_rebuilt_fragment_iter(
671
+ responses[:policy.ec_ndata], path, policy, fi_to_rebuild)
672
+ return RebuildingECDiskFileStream(datafile_metadata, fi_to_rebuild,
673
+ rebuilt_fragment_iter)
674
+
675
+ full_path = _full_path(node, partition, path, policy)
676
+ for timestamp, bucket in sorted(buckets.items()):
677
+ self.logger.error(
678
+ 'Unable to get enough responses (%s/%s from %s ok responses) '
679
+ 'to reconstruct %s %s frag#%s with ETag %s and timestamp %s' %
680
+ (len(bucket.useful_responses), policy.ec_ndata,
681
+ bucket.num_responses,
682
+ 'durable' if bucket.durable else 'non-durable',
683
+ full_path, fi_to_rebuild, bucket.etag, timestamp.internal))
684
+
685
+ if error_responses:
686
+ durable = buckets[local_timestamp].durable
687
+ errors = ', '.join(
688
+ '%s x %s' % (len(responses),
689
+ 'unknown' if status == UNKNOWN_RESPONSE_STATUS
690
+ else status)
691
+ for status, responses in sorted(error_responses.items()))
692
+ self.logger.error(
693
+ 'Unable to get enough responses (%s error responses) '
694
+ 'to reconstruct %s %s frag#%s' % (
695
+ errors, 'durable' if durable else 'non-durable',
696
+ full_path, fi_to_rebuild))
697
+
698
+ if self._is_quarantine_candidate(policy, buckets, error_responses, df):
699
+ raise df._quarantine(
700
+ df._data_file, "Solitary fragment #%s" % df._frag_index)
701
+
702
+ raise DiskFileError('Unable to reconstruct EC archive')
508
703
 
509
704
  def _reconstruct(self, policy, fragment_payload, frag_index):
510
705
  return policy.pyeclib_driver.reconstruct(fragment_payload,
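
The request_node_count used above comes from a config value such as '2 * replicas' and is called with the primary node count to bound how many handoff nodes a quarantine check may contact. A hedged sketch of that style of parser (an approximation, not the actual config_request_node_count_value implementation in swift.common.utils):

    def parse_request_node_count(value):
        # Accepts either a bare integer ("6") or "<N> * replicas" and returns
        # a callable that takes the number of primary nodes.
        parts = str(value).split()
        if len(parts) == 1:
            count = int(parts[0])
            return lambda replicas: count
        if len(parts) == 3 and parts[1] == '*' and parts[2] == 'replicas':
            multiplier = int(parts[0])
            return lambda replicas: multiplier * replicas
        raise ValueError('Invalid request_node_count value: %r' % value)

    request_node_count = parse_request_node_count('2 * replicas')
    assert request_node_count(14) == 28
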
@@ -539,8 +734,8 @@ class ObjectReconstructor(Daemon):
539
734
  fragment_payload = [fragment for fragment in pile]
540
735
  except (Exception, Timeout):
541
736
  self.logger.exception(
542
- _("Error trying to rebuild %(path)s "
543
- "policy#%(policy)d frag#%(frag_index)s"),
737
+ "Error trying to rebuild %(path)s "
738
+ "policy#%(policy)d frag#%(frag_index)s",
544
739
  {'path': path,
545
740
  'policy': policy,
546
741
  'frag_index': frag_index,
@@ -562,9 +757,9 @@ class ObjectReconstructor(Daemon):
562
757
  elapsed = (time.time() - self.start) or 0.000001
563
758
  rate = self.reconstruction_part_count / elapsed
564
759
  self.logger.info(
565
- _("%(reconstructed)d/%(total)d (%(percentage).2f%%)"
566
- " partitions reconstructed in %(time).2fs "
567
- "(%(rate).2f/sec, %(remaining)s remaining)"),
760
+ "%(reconstructed)d/%(total)d (%(percentage).2f%%)"
761
+ " partitions reconstructed in %(time).2fs "
762
+ "(%(rate).2f/sec, %(remaining)s remaining)",
568
763
  {'reconstructed': self.reconstruction_part_count,
569
764
  'total': self.part_count,
570
765
  'percentage':
@@ -577,29 +772,31 @@ class ObjectReconstructor(Daemon):
577
772
 
578
773
  if self.suffix_count and self.partition_times:
579
774
  self.logger.info(
580
- _("%(checked)d suffixes checked - "
581
- "%(hashed).2f%% hashed, %(synced).2f%% synced"),
775
+ "%(checked)d suffixes checked - "
776
+ "%(hashed).2f%% hashed, %(synced).2f%% synced",
582
777
  {'checked': self.suffix_count,
583
778
  'hashed': (self.suffix_hash * 100.0) / self.suffix_count,
584
779
  'synced': (self.suffix_sync * 100.0) / self.suffix_count})
585
780
  self.partition_times.sort()
586
781
  self.logger.info(
587
- _("Partition times: max %(max).4fs, "
588
- "min %(min).4fs, med %(med).4fs"),
782
+ "Partition times: max %(max).4fs, "
783
+ "min %(min).4fs, med %(med).4fs",
589
784
  {'max': self.partition_times[-1],
590
785
  'min': self.partition_times[0],
591
786
  'med': self.partition_times[
592
787
  len(self.partition_times) // 2]})
593
788
  else:
594
789
  self.logger.info(
595
- _("Nothing reconstructed for %s seconds."),
790
+ "Nothing reconstructed for %s seconds.",
596
791
  (time.time() - self.start))
597
792
 
598
793
  def _emplace_log_prefix(self, worker_index):
599
- self.logger.set_prefix("[worker %d/%d pid=%s] " % (
600
- worker_index + 1, # use 1-based indexing for more readable logs
601
- self.reconstructor_workers,
602
- os.getpid()))
794
+ self.logger = get_prefixed_logger(
795
+ self.logger, "[worker %d/%d pid=%s] " % (
796
+ worker_index + 1,
797
+ # use 1-based indexing for more readable logs
798
+ self.reconstructor_workers,
799
+ os.getpid()))
603
800
 
604
801
  def kill_coros(self):
605
802
  """Utility function that kills all coroutines currently running."""
@@ -627,7 +824,7 @@ class ObjectReconstructor(Daemon):
627
824
  while True:
628
825
  sleep(self.lockup_timeout)
629
826
  if self.reconstruction_count == self.last_reconstruction_count:
630
- self.logger.error(_("Lockup detected.. killing live coros."))
827
+ self.logger.error("Lockup detected.. killing live coros.")
631
828
  self.kill_coros()
632
829
  self.last_reconstruction_count = self.reconstruction_count
633
830
 
@@ -664,22 +861,6 @@ class ObjectReconstructor(Daemon):
664
861
  suffixes.append(suffix)
665
862
  return suffixes
666
863
 
667
- def rehash_remote(self, node, job, suffixes):
668
- headers = self.headers.copy()
669
- headers['X-Backend-Storage-Policy-Index'] = int(job['policy'])
670
- try:
671
- with Timeout(self.http_timeout):
672
- conn = http_connect(
673
- node['replication_ip'], node['replication_port'],
674
- node['device'], job['partition'], 'REPLICATE',
675
- '/' + '-'.join(sorted(suffixes)),
676
- headers=headers)
677
- conn.getresponse().read()
678
- except (Exception, Timeout):
679
- self.logger.exception(
680
- _("Trying to sync suffixes with %s") % _full_path(
681
- node, job['partition'], '', job['policy']))
682
-
683
864
  def _iter_nodes_for_frag(self, policy, partition, node):
684
865
  """
685
866
  Generate a priority list of nodes that can sync to the given node.
@@ -688,7 +869,7 @@ class ObjectReconstructor(Daemon):
688
869
  handoffs.
689
870
 
690
871
  To avoid conflicts placing frags we'll skip through the handoffs and
691
- only yield back those that are offset equal to to the given primary
872
+ only yield back those that are offset equal to the given primary
692
873
  node index.
693
874
 
694
875
  Nodes returned from this iterator will have 'backend_index' set.
@@ -732,15 +913,17 @@ class ObjectReconstructor(Daemon):
732
913
  except StopIteration:
733
914
  break
734
915
  attempts_remaining -= 1
916
+ conn = None
735
917
  try:
736
918
  with Timeout(self.http_timeout):
737
- resp = http_connect(
919
+ conn = http_connect(
738
920
  node['replication_ip'], node['replication_port'],
739
921
  node['device'], job['partition'], 'REPLICATE',
740
- '', headers=headers).getresponse()
922
+ '', headers=headers)
923
+ resp = conn.getresponse()
741
924
  if resp.status == HTTP_INSUFFICIENT_STORAGE:
742
925
  self.logger.error(
743
- _('%s responded as unmounted'),
926
+ '%s responded as unmounted',
744
927
  _full_path(node, job['partition'], '',
745
928
  job['policy']))
746
929
  attempts_remaining += 1
@@ -748,10 +931,10 @@ class ObjectReconstructor(Daemon):
748
931
  full_path = _full_path(node, job['partition'], '',
749
932
  job['policy'])
750
933
  self.logger.error(
751
- _("Invalid response %(resp)s from %(full_path)s"),
934
+ "Invalid response %(resp)s from %(full_path)s",
752
935
  {'resp': resp.status, 'full_path': full_path})
753
936
  else:
754
- remote_suffixes = pickle.loads(resp.read())
937
+ remote_suffixes = pickle.loads(resp.read()) # nosec: B301
755
938
  except (Exception, Timeout):
756
939
  # all exceptions are logged here so that our caller can
757
940
  # safely catch our exception and continue to the next node
@@ -760,6 +943,9 @@ class ObjectReconstructor(Daemon):
760
943
  'from %r' % _full_path(
761
944
  node, job['partition'], '',
762
945
  job['policy']))
946
+ finally:
947
+ if conn:
948
+ conn.close()
763
949
  if remote_suffixes is None:
764
950
  raise SuffixSyncError('Unable to get remote suffix hashes')
765
951
 
@@ -781,7 +967,7 @@ class ObjectReconstructor(Daemon):
781
967
  self.suffix_count += len(suffixes)
782
968
  return suffixes, node
783
969
 
784
- def delete_reverted_objs(self, job, objects, frag_index):
970
+ def delete_reverted_objs(self, job, objects):
785
971
  """
786
972
  For EC we can potentially revert only some of a partition
787
973
  so we'll delete reverted objects here. Note that we delete
@@ -790,21 +976,45 @@ class ObjectReconstructor(Daemon):
790
976
  :param job: the job being processed
791
977
  :param objects: a dict of objects to be deleted, each entry maps
792
978
  hash=>timestamp
793
- :param frag_index: (int) the fragment index of data files to be deleted
794
979
  """
795
980
  df_mgr = self._df_router[job['policy']]
796
981
  suffixes_to_delete = set()
797
982
  for object_hash, timestamps in objects.items():
798
983
  try:
799
- df = df_mgr.get_diskfile_from_hash(
984
+ df, filenames = df_mgr.get_diskfile_and_filenames_from_hash(
800
985
  job['local_dev']['device'], job['partition'],
801
986
  object_hash, job['policy'],
802
- frag_index=frag_index)
803
- df.purge(timestamps['ts_data'], frag_index)
987
+ frag_index=job['frag_index'])
988
+ # legacy durable data files look like modern nondurable data
989
+ # files; we therefore override nondurable_purge_delay when we
990
+ # know the data file is durable so that legacy durable data
991
+ # files get purged
992
+ nondurable_purge_delay = (0 if timestamps.get('durable')
993
+ else df_mgr.commit_window)
994
+ data_files = [
995
+ f for f in filenames
996
+ if f.endswith('.data')]
997
+ purgable_data_files = [
998
+ f for f in data_files
999
+ if f.startswith(timestamps['ts_data'].internal)]
1000
+ if (job['primary_frag_index'] is None
1001
+ and len(purgable_data_files) == len(data_files) <= 1):
1002
+ # pure handoff node, and we're about to purge the last
1003
+ # .data file, so it's ok to remove any meta file that may
1004
+ # have been reverted
1005
+ meta_timestamp = timestamps.get('ts_meta')
1006
+ else:
1007
+ meta_timestamp = None
1008
+ df.purge(timestamps['ts_data'], job['frag_index'],
1009
+ nondurable_purge_delay, meta_timestamp)
1010
+ except DiskFileNotExist:
1011
+ # may have passed reclaim age since being reverted, or may have
1012
+ # raced with another reconstructor process trying the same
1013
+ pass
804
1014
  except DiskFileError:
805
1015
  self.logger.exception(
806
1016
  'Unable to purge DiskFile (%r %r %r)',
807
- object_hash, timestamps['ts_data'], frag_index)
1017
+ object_hash, timestamps['ts_data'], job['frag_index'])
808
1018
  suffixes_to_delete.add(object_hash[-3:])
809
1019
 
810
1020
  for suffix in suffixes_to_delete:
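
The revert purge above has to cope with legacy durable .data files (which look just like modern non-durable ones) and with stray .meta files left behind on pure handoff nodes. A simplified restatement of only the file-selection decisions, with plain filename lists instead of a DiskFileManager and a hypothetical plan_purge() helper:

    def plan_purge(filenames, ts_data_internal, is_pure_handoff,
                   ts_meta=None, durable=False, commit_window=60.0):
        # filenames: e.g. ['1712345678.12345#3#d.data', '1712345679.00000.meta']
        data_files = [f for f in filenames if f.endswith('.data')]
        purgable = [f for f in data_files if f.startswith(ts_data_internal)]
        # legacy durable data files look non-durable, so skip the commit_window
        # grace period whenever we know the reverted data really was durable
        nondurable_purge_delay = 0 if durable else commit_window
        meta_timestamp = None
        if is_pure_handoff and len(purgable) == len(data_files) <= 1:
            # about to purge the last .data file on a pure handoff node, so
            # any reverted .meta file can be removed along with it
            meta_timestamp = ts_meta
        return purgable, nondurable_purge_delay, meta_timestamp
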
@@ -854,12 +1064,13 @@ class ObjectReconstructor(Daemon):
854
1064
  if not suffixes:
855
1065
  continue
856
1066
 
857
- # ssync any out-of-sync suffixes with the remote node
1067
+ # ssync any out-of-sync suffixes with the remote node; do not limit
1068
+ # max_objects - we need to check them all because, unlike a revert
1069
+ # job, we don't purge any objects so start with the same set each
1070
+ # cycle
858
1071
  success, _ = ssync_sender(
859
- self, node, job, suffixes)()
860
- # let remote end know to rehash it's suffixes
861
- if success:
862
- self.rehash_remote(node, job, suffixes)
1072
+ self, node, job, suffixes, include_non_durable=False,
1073
+ max_objects=0)()
863
1074
  # update stats for this attempt
864
1075
  self.suffix_sync += len(suffixes)
865
1076
  self.logger.update_stats('suffix.syncs', len(suffixes))
@@ -873,19 +1084,37 @@ class ObjectReconstructor(Daemon):
873
1084
  'partition.delete.count.%s' % (job['local_dev']['device'],))
874
1085
  syncd_with = 0
875
1086
  reverted_objs = {}
876
- for node in job['sync_to']:
877
- node['backend_index'] = job['policy'].get_backend_index(
878
- node['index'])
879
- success, in_sync_objs = ssync_sender(
880
- self, node, job, job['suffixes'])()
881
- if success:
882
- self.rehash_remote(node, job, job['suffixes'])
883
- syncd_with += 1
884
- reverted_objs.update(in_sync_objs)
885
- if syncd_with >= len(job['sync_to']):
886
- self.delete_reverted_objs(
887
- job, reverted_objs, job['frag_index'])
888
- else:
1087
+ try:
1088
+ df_mgr = self._df_router[job['policy']]
1089
+ # Only object-server can take this lock if an incoming SSYNC is
1090
+ # running on the same partition. Taking the lock here ensures we
1091
+ # won't enter a race condition where both nodes try to
1092
+ # cross-replicate the same partition and both delete it.
1093
+ with df_mgr.partition_lock(job['device'], job['policy'],
1094
+ job['partition'], name='replication',
1095
+ timeout=0.2):
1096
+ limited_by_max_objects = False
1097
+ for node in job['sync_to']:
1098
+ node['backend_index'] = job['policy'].get_backend_index(
1099
+ node['index'])
1100
+ sender = ssync_sender(
1101
+ self, node, job, job['suffixes'],
1102
+ include_non_durable=True,
1103
+ max_objects=self.max_objects_per_revert)
1104
+ success, in_sync_objs = sender()
1105
+ limited_by_max_objects |= sender.limited_by_max_objects
1106
+ if success:
1107
+ syncd_with += 1
1108
+ reverted_objs.update(in_sync_objs)
1109
+ if syncd_with >= len(job['sync_to']):
1110
+ self.delete_reverted_objs(job, reverted_objs)
1111
+ if syncd_with < len(job['sync_to']) or limited_by_max_objects:
1112
+ self.handoffs_remaining += 1
1113
+ except PartitionLockTimeout:
1114
+ self.logger.info("Unable to lock handoff partition %d for revert "
1115
+ "on device %s policy %d",
1116
+ job['partition'], job['device'], job['policy'])
1117
+ self.logger.increment('partition.lock-failure.count')
889
1118
  self.handoffs_remaining += 1
890
1119
  self.logger.timing_since('partition.delete.timing', begin)
891
1120
 
@@ -947,7 +1176,8 @@ class ObjectReconstructor(Daemon):
947
1176
  data_fi_to_suffixes[fi].append(suffix)
948
1177
 
949
1178
  # helper to ensure consistent structure of jobs
950
- def build_job(job_type, frag_index, suffixes, sync_to):
1179
+ def build_job(job_type, frag_index, suffixes, sync_to,
1180
+ primary_frag_index):
951
1181
  return {
952
1182
  'job_type': job_type,
953
1183
  'frag_index': frag_index,
@@ -960,28 +1190,33 @@ class ObjectReconstructor(Daemon):
960
1190
  'local_dev': local_dev,
961
1191
  # ssync likes to have it handy
962
1192
  'device': local_dev['device'],
1193
+ # provide a hint to revert jobs that the node is a primary for
1194
+ # one of the frag indexes
1195
+ 'primary_frag_index': primary_frag_index,
963
1196
  }
964
1197
 
965
1198
  # aggregate jobs for all the fragment index in this part
966
1199
  jobs = []
967
1200
 
968
1201
  # check the primary nodes - to see if the part belongs here
1202
+ primary_frag_index = None
969
1203
  part_nodes = policy.object_ring.get_part_nodes(partition)
970
1204
  for node in part_nodes:
971
1205
  if node['id'] == local_dev['id']:
972
1206
  # this partition belongs here, we'll need a sync job
973
- frag_index = policy.get_backend_index(node['index'])
1207
+ primary_frag_index = policy.get_backend_index(node['index'])
974
1208
  try:
975
- suffixes = data_fi_to_suffixes.pop(frag_index)
1209
+ suffixes = data_fi_to_suffixes.pop(primary_frag_index)
976
1210
  except KeyError:
977
1211
  # N.B. If this function ever returns an empty list of jobs
978
1212
  # the entire partition will be deleted.
979
1213
  suffixes = []
980
1214
  sync_job = build_job(
981
1215
  job_type=SYNC,
982
- frag_index=frag_index,
1216
+ frag_index=primary_frag_index,
983
1217
  suffixes=suffixes,
984
1218
  sync_to=_get_partners(node['index'], part_nodes),
1219
+ primary_frag_index=primary_frag_index
985
1220
  )
986
1221
  # ssync callback to rebuild missing fragment_archives
987
1222
  sync_job['sync_diskfile_builder'] = self.reconstruct_fa
@@ -1012,6 +1247,7 @@ class ObjectReconstructor(Daemon):
1012
1247
  frag_index=fi,
1013
1248
  suffixes=data_fi_to_suffixes[fi],
1014
1249
  sync_to=nodes_sync_to,
1250
+ primary_frag_index=primary_frag_index
1015
1251
  )
1016
1252
  jobs.append(revert_job)
1017
1253
 
@@ -1038,21 +1274,22 @@ class ObjectReconstructor(Daemon):
1038
1274
  job_type=REVERT,
1039
1275
  frag_index=None,
1040
1276
  suffixes=non_data_fragment_suffixes,
1041
- sync_to=random.sample(part_nodes, nsample)
1277
+ sync_to=random.sample(part_nodes, nsample),
1278
+ primary_frag_index=primary_frag_index
1042
1279
  ))
1043
1280
  # return a list of jobs for this part
1044
1281
  return jobs
1045
1282
 
1046
1283
  def get_policy2devices(self):
1047
- ips = whataremyips(self.bind_ip)
1284
+ ips = whataremyips(self.ring_ip)
1048
1285
  policy2devices = {}
1049
1286
  for policy in self.policies:
1050
1287
  self.load_object_ring(policy)
1051
- local_devices = list(six.moves.filter(
1052
- lambda dev: dev and is_local_device(
1288
+ local_devices = [
1289
+ dev for dev in policy.object_ring.devs
1290
+ if dev and is_local_device(
1053
1291
  ips, self.port,
1054
- dev['replication_ip'], dev['replication_port']),
1055
- policy.object_ring.devs))
1292
+ dev['replication_ip'], dev['replication_port'])]
1056
1293
  policy2devices[policy] = local_devices
1057
1294
  return policy2devices
1058
1295
 
@@ -1087,7 +1324,7 @@ class ObjectReconstructor(Daemon):
1087
1324
  policy.object_ring, 'next_part_power', None)
1088
1325
  if next_part_power is not None:
1089
1326
  self.logger.warning(
1090
- _("next_part_power set in policy '%s'. Skipping"),
1327
+ "next_part_power set in policy '%s'. Skipping",
1091
1328
  policy.name)
1092
1329
  continue
1093
1330
 
@@ -1099,7 +1336,7 @@ class ObjectReconstructor(Daemon):
1099
1336
  self.device_count += 1
1100
1337
  dev_path = df_mgr.get_dev_path(local_dev['device'])
1101
1338
  if not dev_path:
1102
- self.logger.warning(_('%s is not mounted'),
1339
+ self.logger.warning('%s is not mounted',
1103
1340
  local_dev['device'])
1104
1341
  continue
1105
1342
  data_dir = get_data_dir(policy)
@@ -1193,7 +1430,7 @@ class ObjectReconstructor(Daemon):
1193
1430
  shutil.rmtree(path, ignore_errors=True)
1194
1431
  remove_file(path)
1195
1432
 
1196
- self.logger.info(_("Removing partition: %s"), path)
1433
+ self.logger.info("Removing partition: %s", path)
1197
1434
  tpool.execute(kill_it, path)
1198
1435
 
1199
1436
  def reconstruct(self, **kwargs):
@@ -1203,15 +1440,21 @@ class ObjectReconstructor(Daemon):
1203
1440
 
1204
1441
  stats = spawn(self.heartbeat)
1205
1442
  lockup_detector = spawn(self.detect_lockups)
1443
+ changed_rings = set()
1206
1444
 
1207
1445
  try:
1208
1446
  self.run_pool = GreenPool(size=self.concurrency)
1209
1447
  for part_info in self.collect_parts(**kwargs):
1210
1448
  sleep() # Give spawns a cycle
1449
+ if part_info['policy'] in changed_rings:
1450
+ continue
1211
1451
  if not self.check_ring(part_info['policy'].object_ring):
1212
- self.logger.info(_("Ring change detected. Aborting "
1213
- "current reconstruction pass."))
1214
- return
1452
+ changed_rings.add(part_info['policy'])
1453
+ self.logger.info(
1454
+ "Ring change detected for policy %d (%s). Aborting "
1455
+ "current reconstruction pass for this policy.",
1456
+ part_info['policy'].idx, part_info['policy'].name)
1457
+ continue
1215
1458
 
1216
1459
  self.reconstruction_part_count += 1
1217
1460
  jobs = self.build_reconstruction_jobs(part_info)
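
The reconstruct() loop above no longer aborts the whole pass when a ring changes; it records the affected policy in changed_rings and just skips that policy's remaining partitions while other policies continue. The control-flow pattern in isolation, with a hypothetical iterable of (policy, partition) work items:

    def filter_work_items(work_items, ring_unchanged):
        # work_items: iterable of (policy, partition) pairs
        # ring_unchanged: callable returning False once a policy's ring changed
        changed_rings = set()
        for policy, partition in work_items:
            if policy in changed_rings:
                continue  # keep skipping this policy for the rest of the pass
            if not ring_unchanged(policy):
                changed_rings.add(policy)
                continue  # abort this policy only, not the whole pass
            yield policy, partition
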
@@ -1230,8 +1473,8 @@ class ObjectReconstructor(Daemon):
1230
1473
  with Timeout(self.lockup_timeout):
1231
1474
  self.run_pool.waitall()
1232
1475
  except (Exception, Timeout):
1233
- self.logger.exception(_("Exception in top-level "
1234
- "reconstruction loop"))
1476
+ self.logger.exception("Exception in top-level "
1477
+ "reconstruction loop")
1235
1478
  self.kill_coros()
1236
1479
  finally:
1237
1480
  stats.kill()
@@ -1239,14 +1482,14 @@ class ObjectReconstructor(Daemon):
1239
1482
  self.stats_line()
1240
1483
  if self.handoffs_only:
1241
1484
  if self.handoffs_remaining > 0:
1242
- self.logger.info(_(
1485
+ self.logger.info(
1243
1486
  "Handoffs only mode still has handoffs remaining. "
1244
- "Next pass will continue to revert handoffs."))
1487
+ "Next pass will continue to revert handoffs.")
1245
1488
  else:
1246
- self.logger.warning(_(
1489
+ self.logger.warning(
1247
1490
  "Handoffs only mode found no handoffs remaining. "
1248
1491
  "You should disable handoffs_only once all nodes "
1249
- "are reporting no handoffs remaining."))
1492
+ "are reporting no handoffs remaining.")
1250
1493
 
1251
1494
  def final_recon_dump(self, total, override_devices=None, **kwargs):
1252
1495
  """
@@ -1283,13 +1526,13 @@ class ObjectReconstructor(Daemon):
1283
1526
  if multiprocess_worker_index is not None:
1284
1527
  self._emplace_log_prefix(multiprocess_worker_index)
1285
1528
  start = time.time()
1286
- self.logger.info(_("Running object reconstructor in script mode."))
1529
+ self.logger.info("Running object reconstructor in script mode.")
1287
1530
  override_opts = parse_override_options(once=True, **kwargs)
1288
1531
  self.reconstruct(override_devices=override_opts.devices,
1289
1532
  override_partitions=override_opts.partitions)
1290
1533
  total = (time.time() - start) / 60
1291
1534
  self.logger.info(
1292
- _("Object reconstruction complete (once). (%.02f minutes)"), total)
1535
+ "Object reconstruction complete (once). (%.02f minutes)", total)
1293
1536
  # Only dump stats if they would actually be meaningful -- i.e. we're
1294
1537
  # collecting per-disk stats and covering all partitions, or we're
1295
1538
  # covering all partitions, all disks.
@@ -1302,21 +1545,39 @@ class ObjectReconstructor(Daemon):
1302
1545
  def run_forever(self, multiprocess_worker_index=None, *args, **kwargs):
1303
1546
  if multiprocess_worker_index is not None:
1304
1547
  self._emplace_log_prefix(multiprocess_worker_index)
1305
- self.logger.info(_("Starting object reconstructor in daemon mode."))
1548
+ self.logger.info("Starting object reconstructor in daemon mode.")
1306
1549
  # Run the reconstructor continually
1307
1550
  while True:
1308
1551
  start = time.time()
1309
- self.logger.info(_("Starting object reconstruction pass."))
1552
+ self.logger.info("Starting object reconstruction pass.")
1310
1553
  override_opts = parse_override_options(**kwargs)
1311
1554
  # Run the reconstructor
1312
1555
  self.reconstruct(override_devices=override_opts.devices,
1313
1556
  override_partitions=override_opts.partitions)
1314
1557
  total = (time.time() - start) / 60
1315
1558
  self.logger.info(
1316
- _("Object reconstruction complete. (%.02f minutes)"), total)
1559
+ "Object reconstruction complete. (%.02f minutes)", total)
1317
1560
  self.final_recon_dump(
1318
1561
  total, override_devices=override_opts.devices,
1319
1562
  override_partitions=override_opts.partitions)
1320
1563
  self.logger.debug('reconstruction sleeping for %s seconds.',
1321
1564
  self.interval)
1322
1565
  sleep(self.interval)
1566
+
1567
+
1568
+ def main():
1569
+ parser = OptionParser("%prog CONFIG [options]")
1570
+ parser.add_option('-d', '--devices',
1571
+ help='Reconstruct only given devices. '
1572
+ 'Comma-separated list. '
1573
+ 'Only has effect if --once is used.')
1574
+ parser.add_option('-p', '--partitions',
1575
+ help='Reconstruct only given partitions. '
1576
+ 'Comma-separated list. '
1577
+ 'Only has effect if --once is used.')
1578
+ conf_file, options = parse_options(parser=parser, once=True)
1579
+ run_daemon(ObjectReconstructor, conf_file, **options)
1580
+
1581
+
1582
+ if __name__ == '__main__':
1583
+ main()
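
A closing note on packaging: taken together with the removal of the swift-2.23.3.data/scripts/* helpers and the 43 lines added to entry_points.txt in the file list above, this new main() is what the command-line executable now wraps. parse_options() collects the config path plus the --devices/--partitions and once-mode options defined here and hands them to run_daemon(ObjectReconstructor, ...), and the console scripts are presumably generated from entry points at install time rather than shipped as data files.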