toil 8.2.0__py3-none-any.whl → 9.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. toil/batchSystems/abstractBatchSystem.py +13 -5
  2. toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
  3. toil/batchSystems/kubernetes.py +13 -2
  4. toil/batchSystems/mesos/batchSystem.py +33 -2
  5. toil/batchSystems/registry.py +15 -118
  6. toil/batchSystems/slurm.py +191 -16
  7. toil/common.py +20 -1
  8. toil/cwl/cwltoil.py +97 -119
  9. toil/cwl/utils.py +103 -3
  10. toil/fileStores/__init__.py +1 -1
  11. toil/fileStores/abstractFileStore.py +5 -2
  12. toil/fileStores/cachingFileStore.py +1 -1
  13. toil/job.py +30 -14
  14. toil/jobStores/abstractJobStore.py +35 -255
  15. toil/jobStores/aws/jobStore.py +864 -1964
  16. toil/jobStores/aws/utils.py +24 -270
  17. toil/jobStores/fileJobStore.py +2 -1
  18. toil/jobStores/googleJobStore.py +32 -13
  19. toil/jobStores/utils.py +0 -327
  20. toil/leader.py +27 -22
  21. toil/lib/accelerators.py +1 -1
  22. toil/lib/aws/config.py +22 -0
  23. toil/lib/aws/s3.py +477 -9
  24. toil/lib/aws/utils.py +22 -33
  25. toil/lib/checksum.py +88 -0
  26. toil/lib/conversions.py +33 -31
  27. toil/lib/directory.py +217 -0
  28. toil/lib/ec2.py +97 -29
  29. toil/lib/exceptions.py +2 -1
  30. toil/lib/expando.py +2 -2
  31. toil/lib/generatedEC2Lists.py +138 -19
  32. toil/lib/io.py +33 -2
  33. toil/lib/memoize.py +21 -7
  34. toil/lib/misc.py +1 -1
  35. toil/lib/pipes.py +385 -0
  36. toil/lib/plugins.py +106 -0
  37. toil/lib/retry.py +1 -1
  38. toil/lib/threading.py +1 -1
  39. toil/lib/url.py +320 -0
  40. toil/lib/web.py +4 -5
  41. toil/options/cwl.py +13 -1
  42. toil/options/runner.py +17 -10
  43. toil/options/wdl.py +12 -1
  44. toil/provisioners/__init__.py +5 -2
  45. toil/provisioners/aws/__init__.py +43 -36
  46. toil/provisioners/aws/awsProvisioner.py +47 -15
  47. toil/provisioners/node.py +60 -12
  48. toil/resource.py +3 -13
  49. toil/server/app.py +12 -6
  50. toil/server/cli/wes_cwl_runner.py +2 -2
  51. toil/server/wes/abstract_backend.py +21 -43
  52. toil/server/wes/toil_backend.py +2 -2
  53. toil/test/__init__.py +16 -18
  54. toil/test/batchSystems/batchSystemTest.py +2 -9
  55. toil/test/batchSystems/batch_system_plugin_test.py +7 -0
  56. toil/test/batchSystems/test_slurm.py +103 -14
  57. toil/test/cwl/cwlTest.py +181 -8
  58. toil/test/cwl/staging_cat.cwl +27 -0
  59. toil/test/cwl/staging_make_file.cwl +25 -0
  60. toil/test/cwl/staging_workflow.cwl +43 -0
  61. toil/test/cwl/zero_default.cwl +61 -0
  62. toil/test/docs/scripts/tutorial_staging.py +17 -8
  63. toil/test/docs/scriptsTest.py +2 -1
  64. toil/test/jobStores/jobStoreTest.py +23 -133
  65. toil/test/lib/aws/test_iam.py +7 -7
  66. toil/test/lib/aws/test_s3.py +30 -33
  67. toil/test/lib/aws/test_utils.py +9 -9
  68. toil/test/lib/test_url.py +69 -0
  69. toil/test/lib/url_plugin_test.py +105 -0
  70. toil/test/provisioners/aws/awsProvisionerTest.py +60 -7
  71. toil/test/provisioners/clusterTest.py +15 -2
  72. toil/test/provisioners/gceProvisionerTest.py +1 -1
  73. toil/test/server/serverTest.py +78 -36
  74. toil/test/src/autoDeploymentTest.py +2 -3
  75. toil/test/src/fileStoreTest.py +89 -87
  76. toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
  77. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
  78. toil/test/utils/toilKillTest.py +35 -28
  79. toil/test/wdl/md5sum/md5sum-gs.json +1 -1
  80. toil/test/wdl/md5sum/md5sum.json +1 -1
  81. toil/test/wdl/testfiles/read_file.wdl +18 -0
  82. toil/test/wdl/testfiles/url_to_optional_file.wdl +2 -1
  83. toil/test/wdl/wdltoil_test.py +171 -162
  84. toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
  85. toil/utils/toilDebugFile.py +6 -3
  86. toil/utils/toilSshCluster.py +23 -0
  87. toil/utils/toilStats.py +17 -2
  88. toil/utils/toilUpdateEC2Instances.py +1 -0
  89. toil/version.py +10 -10
  90. toil/wdl/wdltoil.py +1179 -825
  91. toil/worker.py +16 -8
  92. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/METADATA +32 -32
  93. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/RECORD +97 -85
  94. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/WHEEL +1 -1
  95. toil/lib/iterables.py +0 -112
  96. toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
  97. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/entry_points.txt +0 -0
  98. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/licenses/LICENSE +0 -0
  99. {toil-8.2.0.dist-info → toil-9.1.0.dist-info}/top_level.txt +0 -0
@@ -11,675 +11,804 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
- import hashlib
- import itertools
- import logging
+ """
+ This file contains the AWS jobstore, which has its own docstring defining its use.
+
+ This docstring is about the organization of the file.
+
+ All direct AWS boto calls should live in toil.lib.aws, except for creating the
+ session instance and the resource/client (which should only be made ONCE in the jobstore).
+
+ Reasons for this
+ - DRY.
+ - All retries are on their individual boto functions, instead of here.
+ - Simple clear functions => simple clear unit tests (ideally).
+
+ Variables defining part size, parallelization, and other constants should live in toil.lib.aws.config.
+ """
  import os
+ import json
+ import logging
  import pickle
  import re
- import reprlib
  import stat
- import time
  import uuid
- from collections.abc import Generator
- from contextlib import contextmanager
+ import datetime
+
  from io import BytesIO
- from typing import IO, TYPE_CHECKING, Optional, Union, cast
- from urllib.parse import ParseResult, parse_qs, urlencode, urlsplit, urlunsplit
+ from contextlib import contextmanager
+ from urllib.parse import ParseResult, urlparse
+ from typing import (
+ ContextManager,
+ IO,
+ TYPE_CHECKING,
+ Optional,
+ Union,
+ cast,
+ Tuple,
+ Callable,
+ Dict,
+ Any,
+ Iterator,
+ Literal,
+ overload
+ )

+ # This file can't be imported if the AWS modules are not available.
  from botocore.exceptions import ClientError

- import toil.lib.encryption as encryption
  from toil.fileStores import FileID
- from toil.job import Job, JobDescription
- from toil.jobStores.abstractJobStore import (
- AbstractJobStore,
- ConcurrentFileModificationException,
- JobStoreExistsException,
- LocatorException,
- NoSuchFileException,
- NoSuchJobException,
- NoSuchJobStoreException,
- )
- from toil.jobStores.aws.utils import (
- SDBHelper,
- ServerSideCopyProhibitedError,
- copyKeyMultipart,
- fileSizeAndTime,
- no_such_sdb_domain,
- retry_sdb,
- sdb_unavailable,
- uploadFile,
- uploadFromPath,
- )
- from toil.jobStores.utils import ReadablePipe, ReadableTransformingPipe, WritablePipe
- from toil.lib.aws import build_tag_dict_from_env
- from toil.lib.aws.session import establish_boto3_session
- from toil.lib.aws.utils import (
- NoBucketLocationError,
- boto3_pager,
+ from toil.jobStores.abstractJobStore import (AbstractJobStore,
+ JobStoreExistsException,
+ NoSuchJobException,
+ NoSuchJobStoreException)
+ from toil.lib.aws.s3 import (
  create_s3_bucket,
- enable_public_objects,
- flatten_tags,
- get_bucket_region,
- get_item_from_attributes,
- get_object_for_url,
- list_objects_for_url,
- retry_s3,
- retryable_s3_errors,
+ delete_s3_bucket,
+ bucket_exists,
+ copy_s3_to_s3,
+ copy_local_to_s3,
+ copy_s3_to_local,
+ parse_s3_uri,
+ MultiPartPipe,
+ list_s3_items,
+ upload_to_s3,
+ download_stream,
+ s3_key_exists,
+ head_s3_object,
+ get_s3_object,
+ put_s3_object,
+ create_public_url,
+ AWSKeyNotFoundError,
  )
- from toil.lib.compatibility import compat_bytes
+ from toil.lib.aws.utils import get_object_for_url, list_objects_for_url
+ from toil.common import Config
+ from toil.jobStores.abstractJobStore import NoSuchFileException
  from toil.lib.ec2nodes import EC2Regions
- from toil.lib.exceptions import panic
- from toil.lib.io import AtomicFileCreate
- from toil.lib.memoize import strict_bool
- from toil.lib.objects import InnerClass
- from toil.lib.retry import get_error_code, get_error_status, retry
-
- if TYPE_CHECKING:
- from mypy_boto3_sdb.type_defs import (
- AttributeTypeDef,
- DeletableItemTypeDef,
- ItemTypeDef,
- ReplaceableAttributeTypeDef,
- ReplaceableItemTypeDef,
- UpdateConditionTypeDef,
- )
-
- from toil import Config
-
- boto3_session = establish_boto3_session()
- s3_boto3_resource = boto3_session.resource("s3")
- s3_boto3_client = boto3_session.client("s3")
+ from toil.lib.retry import get_error_status
+ from toil.version import version
+ from toil.lib.aws.session import establish_boto3_session
+ from toil.job import JobDescription, Job
+ from toil.lib.url import URLAccess
+
+
+ DEFAULT_AWS_PART_SIZE = 52428800
  logger = logging.getLogger(__name__)

- # Sometimes we have to wait for multipart uploads to become real. How long
- # should we wait?
- CONSISTENCY_TICKS = 5
- CONSISTENCY_TIME = 1

+ class AWSJobStore(AbstractJobStore, URLAccess):
+ """
+ The AWS jobstore can be thought of as an AWS s3 bucket, with functions to
+ centralize, store, and track files for the workflow.

- class ChecksumError(Exception):
- """Raised when a download from AWS does not contain the correct data."""
+ The AWS jobstore stores 4 things:

+ 1. Jobs: These are pickled as files, and contain the information necessary to run a job when unpickled.
+ A job's file is deleted when finished, and its absence means it completed.

- class DomainDoesNotExist(Exception):
- """Raised when a domain that is expected to exist does not exist."""
+ 2. Files: The inputs and outputs of jobs. Each file is written in s3 with the file pattern:
+ "files/{uuid4}/{original_filename}", where the file prefix
+ "files/{uuid4}" should only point to one file.
+ 3. Logs: The written log files of jobs that have run, plus the log file for the main Toil process.

- def __init__(self, domain_name):
- super().__init__(f"Expected domain {domain_name} to exist!")
+ 4. Shared Files: Files with human-readable names, used by Toil itself or Python workflows.
+ These include:

+ * environment.pickle (environment variables)

- class AWSJobStore(AbstractJobStore):
- """
- A job store that uses Amazon's S3 for file storage and SimpleDB for storing job info and
- enforcing strong consistency on the S3 file storage. There will be SDB domains for jobs and
- files and a versioned S3 bucket for file contents. Job objects are pickled, compressed,
- partitioned into chunks of 1024 bytes and each chunk is stored as a an attribute of the SDB
- item representing the job. UUIDs are used to identify jobs and files.
+ * config.pickle (user options)
+
+ * pid.log (process ID of the workflow; when it finishes, the workflow either succeeded/failed)
+ * userScript (hot deployment; this is the job module)
+
+ * rootJobReturnValue (workflow succeeded or not)
+
+ NOTES
+ - The AWS jobstore does not use a database (directly, at least) currently. We can get away with this because:
+
+ 1. AWS s3 has strong consistency.
+
+ 2. s3's filter/query speed is pretty good.
+
+ However, there may be reasons in the future to provide users with a database:
+
+ * s3 throttling has limits (3,500/5,000 requests (TODO: per
+ second?); something like dynamodb supports 100,000+ requests).
+
+ * Access and filtering would be sped up, though how much faster this would be needs testing.
+
+ ALSO NOTE: The caching filestore uses a local (per node) database with a very similar structure that maybe
+ could be synced up with this.
+
+ - TODO: Etags are s3's native checksum, so use that for file integrity checking since it's free when fetching
+ object headers from s3. Using an md5sum in addition to this would work well with the current filestore.
+ WARNING: Etag values differ for the same file when the part size changes, so part size should always
+ be Set In Stone, unless we hit s3's 10,000 part limit, and we need to account for that.
+
+ - This class fills in self.config only when initialized/restarted; it is None upon class instantiation. These
+ are the options/config set by the user. When jobs are loaded/unpickled, they must re-incorporate this.
+
+ - The config.sseKey field is the single source of truth for bucket encryption
+ status. The key is never stored inside this class; it is always read
+ from the file referenced by the config when needed. Modifying the config
+ at runtime will modify whether encryption is used. Note that files
+ written *without* encryption (i.e. config.pickle) can't be read when
+ encryption is enabled!
+
+ - TODO: In general, job stores should log the version of Toil they were
+ initialized with and warn the user if restarting with a different
+ version.
  """
+ def __init__(self, locator: str, partSize: int = DEFAULT_AWS_PART_SIZE) -> None:
+ super(AWSJobStore, self).__init__(locator)
+ # TODO: parsing of user options seems like it should be done outside of this class;
+ # pass in only the bucket name and region?
+ self.region, self.bucket_name = parse_jobstore_identifier(locator)
+ boto3_session = establish_boto3_session(region_name=self.region)
+ self.s3_resource = boto3_session.resource("s3")
+ self.s3_client = boto3_session.client("s3")
+ logger.info(f"Instantiating {self.__class__} with region: {self.region}")
+ self.part_size = DEFAULT_AWS_PART_SIZE # don't let users set the part size; it will throw off etag values
+
+ # created anew during self.initialize() or loaded using self.resume()
+ self.bucket = None
+
+ # pickled job files named with uuid4
+ self.job_key_prefix = 'jobs/'
+ # job-file associations; these are empty files mimicking a db w/naming convention: job_uuid4.file_uuid4
+ #
+ # TODO: a many-to-many system is implemented, but a simpler one-to-many
+ # system could be used, because each file should belong to at most one
+ # job. This should be changed to a hierarchical layout.
+ self.job_associations_key_prefix = 'job-associations/'
+ # input/output files named with uuid4
+ self.content_key_prefix = 'files/'
+ # these are special files, like 'environment.pickle'; place them in root
+ self.shared_key_prefix = ''
+ # read and unread; named with uuid4
+ self.logs_key_prefix = 'logs/'
+
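Taken together, these prefixes give a bucket layout roughly like the following (illustrative names and UUIDs only):

    <jobstore bucket>
      config.pickle                                # shared files sit at the bucket root (shared_key_prefix = '')
      environment.pickle
      jobs/<job-uuid4>                             # one pickled JobDescription per key
      job-associations/<job-uuid4>.<file-uuid4>    # empty marker keys tying files to the job that owns them
      files/<file-uuid4>/<original_filename>       # exactly one content key under each file prefix
      logs/<log-uuid4>                             # read and unread log files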
+ ###################################### CREATE/DESTROY JOBSTORE ######################################
+
+ def initialize(self, config: Config) -> None:
+ """
+ Called when starting a new jobstore with a non-existent bucket.

- # Dots in bucket names should be avoided because bucket names are used in HTTPS bucket
- # URLs where the may interfere with the certificate common name. We use a double
- # underscore as a separator instead.
- #
- bucketNameRe = re.compile(r"^[a-z0-9][a-z0-9-]+[a-z0-9]$")
+ Create bucket, raise if it already exists.
+ Set options from config.
+ """
+ logger.debug(f"Instantiating {self.__class__} for region {self.region} with bucket: '{self.bucket_name}'")
+ if bucket_exists(self.s3_resource, self.bucket_name):
+ raise JobStoreExistsException(self.locator, 'aws')
+ self.bucket = create_s3_bucket(self.s3_resource, self.bucket_name, region=self.region) # type: ignore
+ super(AWSJobStore, self).initialize(config)

- # See http://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html
- #
- minBucketNameLen = 3
- maxBucketNameLen = 63
- maxNameLen = 10
- nameSeparator = "--"
+ def resume(self) -> None:
+ """
+ Called when reusing an old jobstore with an existing bucket.

- def __init__(self, locator: str, partSize: int = 50 << 20) -> None:
+ :raise NoSuchJobStoreException: if the bucket doesn't exist.
  """
- Create a new job store in AWS or load an existing one from there.
+ if not bucket_exists(self.s3_resource, self.bucket_name):
+ raise NoSuchJobStoreException(self.locator, 'aws')
+ # This sets self.config to not be None and loads the encryption key
+ # path from the unencrypted config. So it needs the bucket to exist to
+ # read from.
+ super(AWSJobStore, self).resume()
+
+ def destroy(self) -> None:
+ delete_s3_bucket(self.s3_resource, self.bucket_name)

- :param int partSize: The size of each individual part used for multipart operations like
- upload and copy, must be >= 5 MiB but large enough to not exceed 10k parts for the
- whole file
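A hedged sketch of how these entry points fit together (the locator syntax shown follows the old region:name form and is only assumed to be what parse_jobstore_identifier expects):

    store = AWSJobStore("us-west-2:my-toil-run")
    store.initialize(config)   # fresh run: creates the bucket, or raises JobStoreExistsException
    # ... later, e.g. on --restart ...
    store.resume()             # reattaches; raises NoSuchJobStoreException if the bucket is gone
    store.destroy()            # deletes the bucket and everything stored in it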
+ ###################################### BUCKET UTIL API ######################################
+
+ def _key_in_bucket(
+ self,
+ identifier: str,
+ prefix: str,
+ ) -> str:
  """
- super().__init__(locator)
- region, namePrefix = locator.split(":")
- regions = EC2Regions.keys()
- if region not in regions:
- raise ValueError(f'Region "{region}" is not one of: {regions}')
- if not self.bucketNameRe.match(namePrefix):
- raise ValueError(
- "Invalid name prefix '%s'. Name prefixes must contain only digits, "
- "hyphens or lower-case letters and must not start or end in a "
- "hyphen." % namePrefix
- )
- # Reserve 13 for separator and suffix
- if len(namePrefix) > self.maxBucketNameLen - self.maxNameLen - len(
- self.nameSeparator
- ):
- raise ValueError(
- "Invalid name prefix '%s'. Name prefixes may not be longer than 50 "
- "characters." % namePrefix
- )
- if "--" in namePrefix:
- raise ValueError(
- "Invalid name prefix '%s'. Name prefixes may not contain "
- "%s." % (namePrefix, self.nameSeparator)
- )
- logger.debug(
- "Instantiating %s for region %s and name prefix '%s'",
- self.__class__,
- region,
- namePrefix,
+ Get the key in the bucket for the given identifier and prefix.
+
+ We have this so higher-level code doesn't need to worry about the
+ pasting together of prefixes and identifiers, so it never has to be
+ mixed with the identifier=/prefix= calling convention.
+ """
+ return f'{prefix}{identifier}'
+
+ def is_in_bucket(
+ self,
+ identifier: str,
+ prefix: str,
+ bucket: Optional[str] = None,
+ ) -> bool:
+ """
+ Check if the key for the given identifier and prefix is in the bucket.
+ """
+ bucket = bucket or self.bucket_name
+
+ return s3_key_exists(
+ s3_resource=self.s3_resource,
+ bucket=bucket,
+ key=self._key_in_bucket(identifier=identifier, prefix=prefix),
+ extra_args=self._get_encryption_args()
  )
- self.region = region
- self.name_prefix = namePrefix
- self.part_size = partSize
- self.jobs_domain_name: Optional[str] = None
- self.files_domain_name: Optional[str] = None
- self.files_bucket = None
- self.db = boto3_session.client(service_name="sdb", region_name=region)
-
- self.s3_resource = boto3_session.resource("s3", region_name=self.region)
- self.s3_client = self.s3_resource.meta.client
-
- def initialize(self, config: "Config") -> None:
- if self._registered:
- raise JobStoreExistsException(self.locator, "aws")
- self._registered = None
- try:
- self._bind(create=True)
- except:
- with panic(logger):
- self.destroy()
- else:
- super().initialize(config)
- # Only register after job store has been full initialized
- self._registered = True

- @property
- def sseKeyPath(self) -> Optional[str]:
- return self.config.sseKey

- def resume(self) -> None:
- if not self._registered:
- raise NoSuchJobStoreException(self.locator, "aws")
- self._bind(create=False)
- super().resume()
+ def write_to_bucket(
+ self,
+ identifier: str,
+ prefix: str,
+ data: Optional[Union[bytes, str, Dict[str, Any]]],
+ bucket: Optional[str] = None,
+ encrypted: Optional[bool] = None,
+ ) -> None:
+ """
+ Write something directly to a bucket.
+
+ Use for small files. Does not parallelize or use multipart.
+
+ :param encrypted: Can be set to False to disable encryption.
+ """
+ # only used if exporting to a URL
+ encryption_args = {} if encrypted is False else self._get_encryption_args()
+ bucket = bucket or self.bucket_name
+
+ if isinstance(data, dict):
+ data = json.dumps(data).encode('utf-8')
+ elif isinstance(data, str):
+ data = data.encode('utf-8')
+ elif data is None:
+ data = b''
+
+ assert isinstance(data, bytes)
+ put_s3_object(
+ s3_resource=self.s3_resource,
+ bucket=bucket,
+ key=self._key_in_bucket(identifier=identifier, prefix=prefix),
+ body=data,
+ extra_args=encryption_args,
+ )
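For instance (a hypothetical call, not copied from the file), a small JSON-able record can be dropped under the shared (root) prefix like this:

    store.write_to_bucket(identifier="rootJobReturnValue",
                          prefix=store.shared_key_prefix,
                          data={"exit_code": 0})
    # dicts are JSON-encoded, str is UTF-8 encoded, and None becomes an empty object body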
207
291
 
208
- def _bind(
292
+ def read_from_bucket(
209
293
  self,
210
- create: bool = False,
211
- block: bool = True,
212
- check_versioning_consistency: bool = True,
213
- ) -> None:
214
- def qualify(name):
215
- assert len(name) <= self.maxNameLen
216
- return self.name_prefix + self.nameSeparator + name
217
-
218
- # The order in which this sequence of events happens is important. We can easily handle the
219
- # inability to bind a domain, but it is a little harder to handle some cases of binding the
220
- # jobstore bucket. Maintaining this order allows for an easier `destroy` method.
221
- if self.jobs_domain_name is None:
222
- self.jobs_domain_name = qualify("jobs")
223
- self._bindDomain(self.jobs_domain_name, create=create, block=block)
224
- if self.files_domain_name is None:
225
- self.files_domain_name = qualify("files")
226
- self._bindDomain(self.files_domain_name, create=create, block=block)
227
- if self.files_bucket is None:
228
- self.files_bucket = self._bindBucket(
229
- qualify("files"),
230
- create=create,
231
- block=block,
232
- versioning=True,
233
- check_versioning_consistency=check_versioning_consistency,
234
- )
294
+ identifier: str,
295
+ prefix: str,
296
+ bucket: Optional[str] = None,
297
+ ) -> bytes:
298
+ """
299
+ Read something directly from a bucket.
235
300
 
236
- @property
237
- def _registered(self) -> Optional[bool]:
301
+ Use for small files. Does not parallelize or use multipart.
302
+
303
+ :raises NoSuchJobException: if the prefix is the job prefix and the
304
+ identifier is not found.
305
+ :raises NoSuchFileException: if the prefix is the content prefix and
306
+ the identifier is not found.
307
+ :raises self.s3_client.exceptions.NoSuchKey: in other cases where the
308
+ identifier is not found.
309
+ """
310
+ bucket = bucket or self.bucket_name
311
+
312
+ try:
313
+ return get_s3_object(
314
+ s3_resource=self.s3_resource,
315
+ bucket=bucket,
316
+ key=self._key_in_bucket(identifier=identifier, prefix=prefix),
317
+ extra_args=self._get_encryption_args(),
318
+ )['Body'].read()
319
+ except self.s3_client.exceptions.NoSuchKey:
320
+ if prefix == self.job_key_prefix:
321
+ raise NoSuchJobException(identifier)
322
+ elif prefix == self.content_key_prefix:
323
+ raise NoSuchFileException(identifier)
324
+ else:
325
+ raise
326
+ except ClientError as e:
327
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
328
+ if prefix == self.job_key_prefix:
329
+ raise NoSuchJobException(identifier)
330
+ elif prefix == self.content_key_prefix:
331
+ raise NoSuchFileException(identifier)
332
+ else:
333
+ raise
334
+ else:
335
+ raise
336
+
337
+ ###################################### JOBS API ######################################
338
+
339
+ def assign_job_id(self, jobDescription: JobDescription) -> None:
340
+ jobDescription.jobStoreID = str(uuid.uuid4())
341
+ logger.debug("Assigning Job ID %s", jobDescription.jobStoreID)
342
+
343
+ def create_job(self, jobDescription: JobDescription) -> JobDescription:
344
+ """
345
+ Pickle a jobDescription object and write it to the jobstore as a file.
346
+
347
+ Responsible for calling :meth:`toil.job.JobDescription.pre_update_hook`
348
+ on the job description.
238
349
  """
239
- A optional boolean property indicating whether this job store is registered. The
240
- registry is the authority on deciding if a job store exists or not. If True, this job
241
- store exists, if None the job store is transitioning from True to False or vice versa,
242
- if False the job store doesn't exist.
243
350
 
244
- :type: bool|None
351
+ jobDescription.pre_update_hook()
352
+
353
+ self.write_to_bucket(identifier=str(jobDescription.jobStoreID),
354
+ prefix=self.job_key_prefix,
355
+ data=pickle.dumps(jobDescription, protocol=pickle.HIGHEST_PROTOCOL))
356
+ return jobDescription
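A sketch of the round trip these methods give (hypothetical usage; the JobDescription itself comes from the workflow machinery):

    desc: JobDescription = ...                      # built elsewhere by Toil
    store.assign_job_id(desc)                       # desc.jobStoreID becomes a fresh uuid4
    store.create_job(desc)                          # pickled and written under jobs/<uuid4>
    loaded = store.load_job(str(desc.jobStoreID))   # read back, unpickled, and re-bound to self.config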
357
+
358
+ def job_exists(self, job_id: str, check: bool = False) -> bool:
359
+ """
360
+ Checks if the job_id is found in s3.
361
+
362
+ :param check: If True, raise an exception instead of returning false
363
+ when a job does not exist.
245
364
  """
246
- # The weird mapping of the SDB item attribute value to the property value is due to
247
- # backwards compatibility. 'True' becomes True, that's easy. Toil < 3.3.0 writes this at
248
- # the end of job store creation. Absence of either the registry, the item or the
249
- # attribute becomes False, representing a truly absent, non-existing job store. An
250
- # attribute value of 'False', which is what Toil < 3.3.0 writes at the *beginning* of job
251
- # store destruction, indicates a job store in transition, reflecting the fact that 3.3.0
252
- # may leak buckets or domains even though the registry reports 'False' for them. We
253
- # can't handle job stores that were partially created by 3.3.0, though.
254
- registry_domain_name = "toil-registry"
255
365
  try:
256
- self._bindDomain(
257
- domain_name=registry_domain_name, create=False, block=False
366
+ self.s3_client.head_object(
367
+ Bucket=self.bucket_name,
368
+ Key=self._key_in_bucket(
369
+ identifier=job_id,
370
+ prefix=self.job_key_prefix,
371
+ ),
372
+ **self._get_encryption_args()
258
373
  )
259
- except DomainDoesNotExist:
260
- return False
261
-
262
- for attempt in retry_sdb():
263
- with attempt:
264
- get_result = self.db.get_attributes(
265
- DomainName=registry_domain_name,
266
- ItemName=self.name_prefix,
267
- AttributeNames=["exists"],
268
- ConsistentRead=True,
269
- )
270
- attributes: list["AttributeTypeDef"] = get_result.get(
271
- "Attributes", []
272
- ) # the documentation says 'Attributes' should always exist, but this is not true
273
- exists: Optional[str] = get_item_from_attributes(
274
- attributes=attributes, name="exists"
275
- )
276
- if exists is None:
277
- return False
278
- elif exists == "True":
279
- return True
280
- elif exists == "False":
281
- return None
282
- else:
283
- assert False
374
+ return True
375
+ except ClientError as e:
376
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
377
+ if check:
378
+ raise NoSuchJobException(job_id)
379
+ else:
380
+ raise
381
+ except self.s3_client.exceptions.NoSuchKey:
382
+ if check:
383
+ raise NoSuchJobException(job_id)
384
+ else:
385
+ raise
386
+ return False
284
387
 
285
- @_registered.setter
286
- def _registered(self, value: bool) -> None:
287
- registry_domain_name = "toil-registry"
388
+ def jobs(self) -> Iterator[JobDescription]:
389
+ for result in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=self.job_key_prefix):
390
+ try:
391
+ job_id = result['Key'][len(self.job_key_prefix):] # strip self.job_key_prefix
392
+ yield self.load_job(job_id)
393
+ except NoSuchJobException:
394
+ # job may have been deleted between showing up in the list and getting loaded
395
+ pass
396
+
397
+ def load_job(self, job_id: str) -> JobDescription:
398
+ """Use a job_id to get a job from the jobstore's s3 bucket, unpickle, and return it."""
288
399
  try:
289
- self._bindDomain(
290
- domain_name=registry_domain_name,
291
- # Only create registry domain when registering or
292
- # transitioning a store
293
- create=value is not False,
294
- block=False,
400
+ job = pickle.loads(self.read_from_bucket(identifier=job_id, prefix=self.job_key_prefix))
401
+ except NoSuchJobException:
402
+ raise
403
+
404
+ if not isinstance(job, JobDescription):
405
+ raise RuntimeError(
406
+ f"While trying to load a JobDescription for {job_id}, got a {type(job)} instead!",
295
407
  )
296
- except DomainDoesNotExist:
297
- pass
298
- else:
299
- for attempt in retry_sdb():
300
- with attempt:
301
- if value is False:
302
- self.db.delete_attributes(
303
- DomainName=registry_domain_name, ItemName=self.name_prefix
304
- )
305
- else:
306
- if value is True:
307
- attributes: list["ReplaceableAttributeTypeDef"] = [
308
- {"Name": "exists", "Value": "True", "Replace": True}
309
- ]
310
- elif value is None:
311
- attributes = [
312
- {"Name": "exists", "Value": "False", "Replace": True}
313
- ]
314
- else:
315
- assert False
316
- self.db.put_attributes(
317
- DomainName=registry_domain_name,
318
- ItemName=self.name_prefix,
319
- Attributes=attributes,
320
- )
321
-
322
- def _checkItem(self, item: "ItemTypeDef", enforce: bool = True) -> None:
323
- """
324
- Make sure that the given SimpleDB item actually has the attributes we think it should.
325
408
 
326
- Throw otherwise.
409
+ # Now we know it's the right type
410
+ job.assignConfig(self.config)
411
+ return job
327
412
 
328
- If enforce is false, log but don't throw.
329
- """
330
- self._checkAttributes(item["Attributes"], enforce)
413
+ def update_job(self, jobDescription: JobDescription) -> None:
414
+ self.create_job(jobDescription)
331
415
 
332
- def _checkAttributes(
333
- self, attributes: list["AttributeTypeDef"], enforce: bool = True
334
- ) -> None:
335
- if get_item_from_attributes(attributes=attributes, name="overlargeID") is None:
336
- logger.error(
337
- "overlargeID attribute isn't present: either SimpleDB entry is "
338
- "corrupt or jobstore is from an extremely old Toil: %s",
339
- attributes,
416
+ def delete_job(self, job_id: str) -> None:
417
+ logger.debug("Deleting job %s", job_id)
418
+
419
+ # delete the actual job file
420
+ self.s3_client.delete_object(
421
+ Bucket=self.bucket_name,
422
+ Key=self._key_in_bucket(
423
+ identifier=job_id,
424
+ prefix=self.job_key_prefix,
340
425
  )
341
- if enforce:
342
- raise RuntimeError(
343
- "encountered SimpleDB entry missing required attribute "
344
- "'overlargeID'; is your job store ancient?"
345
- )
426
+ )
346
427
 
347
- def _awsJobFromAttributes(self, attributes: list["AttributeTypeDef"]) -> Job:
348
- """
349
- Get a Toil Job object from attributes that are defined in an item from the DB
350
- :param attributes: List of attributes
351
- :return: Toil job
352
- """
353
- self._checkAttributes(attributes)
354
- overlarge_id_value = get_item_from_attributes(
355
- attributes=attributes, name="overlargeID"
428
+ # delete any files marked as associated with the job
429
+ job_file_associations_to_delete = []
430
+ root_key = self._key_in_bucket(
431
+ identifier=job_id,
432
+ prefix=self.job_associations_key_prefix,
356
433
  )
357
- if overlarge_id_value:
358
- assert self.file_exists(overlarge_id_value)
359
- # This is an overlarge job, download the actual attributes
360
- # from the file store
361
- logger.debug("Loading overlarge job from S3.")
362
- with self.read_file_stream(overlarge_id_value) as fh:
363
- binary = fh.read()
364
- else:
365
- binary, _ = SDBHelper.attributesToBinary(attributes)
366
- assert binary is not None
367
- job = pickle.loads(binary)
368
- if job is not None:
369
- job.assignConfig(self.config)
370
- return job
434
+ for associated_job_file in list_s3_items(self.s3_resource,
435
+ bucket=self.bucket_name,
436
+ prefix=root_key):
437
+ job_file_associations_to_delete.append(associated_job_file['Key'])
438
+ file_id = associated_job_file['Key'].split('.')[-1]
439
+ self.delete_file(file_id)
440
+
441
+ # delete the job-file association references (these are empty files that simply connect jobs to files)
442
+ for job_file_association in job_file_associations_to_delete:
443
+ self.s3_client.delete_object(Bucket=self.bucket_name, Key=f'{job_file_association}')
444
+
445
+ def associate_job_with_file(self, job_id: str, file_id: str) -> None:
446
+ # associate this job with this file; the file will be deleted when the job is
447
+ self.write_to_bucket(identifier=f'{job_id}.{file_id}', prefix=self.job_associations_key_prefix, data=None)
371
448
 
372
- def _awsJobFromItem(self, item: "ItemTypeDef") -> Job:
449
+ ###################################### FILES API ######################################
450
+
451
+ def write_file(self, local_path: str, job_id: Optional[str] = None, cleanup: bool = False) -> FileID:
373
452
  """
374
- Get a Toil Job object from an item from the DB
375
- :return: Toil Job
453
+ Write a local file into the jobstore and return a file_id referencing it.
454
+
455
+ :param job_id:
456
+ If job_id AND cleanup are supplied, associate this file with that job. When the job is deleted, the
457
+ file will be deleted as well.
458
+
459
+ :param cleanup:
460
+ If job_id AND cleanup are supplied, associate this file with that job. When the job is deleted, the
461
+ file will be deleted as well.
462
+ TODO: we don't need cleanup; remove it and only use job_id
376
463
  """
377
- return self._awsJobFromAttributes(item["Attributes"])
378
-
379
- def _awsJobToAttributes(self, job: JobDescription) -> list["AttributeTypeDef"]:
380
- binary = pickle.dumps(job, protocol=pickle.HIGHEST_PROTOCOL)
381
- if len(binary) > SDBHelper.maxBinarySize(extraReservedChunks=1):
382
- # Store as an overlarge job in S3
383
- with self.write_file_stream() as (writable, fileID):
384
- writable.write(binary)
385
- item = SDBHelper.binaryToAttributes(None)
386
- item["overlargeID"] = fileID
387
- else:
388
- item = SDBHelper.binaryToAttributes(binary)
389
- item["overlargeID"] = ""
390
- return SDBHelper.attributeDictToList(item)
464
+ # TODO: etag = compute_checksum_for_file(local_path, algorithm='etag')[len('etag$'):]
465
+ file_id = str(uuid.uuid4()) # mint a new file_id
466
+ file_attributes = os.stat(local_path)
467
+ size = file_attributes.st_size
468
+ executable = file_attributes.st_mode & stat.S_IXUSR != 0
469
+
470
+ if job_id and cleanup:
471
+ # associate this job with this file; then the file reference will be deleted when the job is
472
+ self.associate_job_with_file(job_id, file_id)
473
+
474
+ # Each file gets a prefix under which we put exactly one key, to hide
475
+ # metadata in the key.
476
+ prefix = self._key_in_bucket(
477
+ identifier=file_id,
478
+ prefix=self.content_key_prefix
479
+ )
391
480
 
392
- def _awsJobToItem(self, job: JobDescription, name: str) -> "ItemTypeDef":
393
- return {"Name": name, "Attributes": self._awsJobToAttributes(job)}
481
+ copy_local_to_s3(
482
+ s3_resource=self.s3_resource,
483
+ local_file_path=local_path,
484
+ dst_bucket=self.bucket_name,
485
+ dst_key=f'{prefix}/{os.path.basename(local_path)}',
486
+ extra_args=self._get_encryption_args()
487
+ )
488
+ return FileID(file_id, size, executable)
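So a call like the following (hypothetical path and IDs) produces one content key plus, when job_id and cleanup are given, one association marker:

    file_id = store.write_file("/tmp/out.bam", job_id=job_id, cleanup=True)
    # the bucket now holds:
    #   files/<file_id>/out.bam
    #   job-associations/<job_id>.<file_id>   (empty marker, so delete_job can clean the file up)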
394
489
 
395
- jobsPerBatchInsert = 25
490
+ def find_s3_key_from_file_id(self, file_id: str) -> str:
491
+ """This finds an s3 key for which file_id is the prefix, and which already exists."""
492
+ prefix = self._key_in_bucket(
493
+ identifier=file_id,
494
+ prefix=self.content_key_prefix
495
+ )
496
+ s3_keys = [s3_item for s3_item in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=prefix)]
497
+ if len(s3_keys) == 0:
498
+ raise NoSuchFileException(file_id)
499
+ if len(s3_keys) > 1:
500
+ # There can be only one.
501
+ raise RuntimeError(f'File ID: {file_id} should be unique, but includes: {s3_keys}')
502
+ return s3_keys[0]['Key']
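In other words (illustrative values):

    # a file stored as files/3f2a.../sample.vcf is found again from just its ID:
    store.find_s3_key_from_file_id("3f2a...")   # -> "files/3f2a.../sample.vcf"
    # zero matches raises NoSuchFileException; more than one is treated as corruption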
396
503
 
397
504
  @contextmanager
398
- def batch(self) -> None:
399
- self._batchedUpdates = []
400
- yield
401
- batches = [
402
- self._batchedUpdates[i : i + self.jobsPerBatchInsert]
403
- for i in range(0, len(self._batchedUpdates), self.jobsPerBatchInsert)
404
- ]
405
-
406
- for batch in batches:
407
- items: list["ReplaceableItemTypeDef"] = []
408
- for jobDescription in batch:
409
- item_attributes: list["ReplaceableAttributeTypeDef"] = []
410
- jobDescription.pre_update_hook()
411
- item_name = compat_bytes(jobDescription.jobStoreID)
412
- got_job_attributes: list["AttributeTypeDef"] = self._awsJobToAttributes(
413
- jobDescription
414
- )
415
- for each_attribute in got_job_attributes:
416
- new_attribute: "ReplaceableAttributeTypeDef" = {
417
- "Name": each_attribute["Name"],
418
- "Value": each_attribute["Value"],
419
- "Replace": True,
420
- }
421
- item_attributes.append(new_attribute)
422
- items.append({"Name": item_name, "Attributes": item_attributes})
423
-
424
- for attempt in retry_sdb():
425
- with attempt:
426
- self.db.batch_put_attributes(
427
- DomainName=self.jobs_domain_name, Items=items
428
- )
429
- self._batchedUpdates = None
430
-
431
- def assign_job_id(self, job_description: JobDescription) -> None:
432
- jobStoreID = self._new_job_id()
433
- logger.debug("Assigning ID to job %s", jobStoreID)
434
- job_description.jobStoreID = jobStoreID
435
-
436
- def create_job(self, job_description: JobDescription) -> JobDescription:
437
- if hasattr(self, "_batchedUpdates") and self._batchedUpdates is not None:
438
- self._batchedUpdates.append(job_description)
439
- else:
440
- self.update_job(job_description)
441
- return job_description
442
-
443
- def job_exists(self, job_id: Union[bytes, str]) -> bool:
444
- for attempt in retry_sdb():
445
- with attempt:
446
- return (
447
- len(
448
- self.db.get_attributes(
449
- DomainName=self.jobs_domain_name,
450
- ItemName=compat_bytes(job_id),
451
- AttributeNames=[SDBHelper.presenceIndicator()],
452
- ConsistentRead=True,
453
- ).get("Attributes", [])
454
- )
455
- > 0
456
- )
457
-
458
- def jobs(self) -> Generator[Job, None, None]:
459
- job_items: Optional[list["ItemTypeDef"]] = None
460
- for attempt in retry_sdb():
461
- with attempt:
462
- job_items = boto3_pager(
463
- self.db.select,
464
- "Items",
465
- ConsistentRead=True,
466
- SelectExpression="select * from `%s`" % self.jobs_domain_name,
467
- )
468
- assert job_items is not None
469
- for jobItem in job_items:
470
- yield self._awsJobFromItem(jobItem)
471
-
472
- def load_job(self, job_id: FileID) -> Job:
473
- item_attributes = None
474
- for attempt in retry_sdb():
475
- with attempt:
476
- item_attributes = self.db.get_attributes(
477
- DomainName=self.jobs_domain_name,
478
- ItemName=compat_bytes(job_id),
479
- ConsistentRead=True,
480
- ).get("Attributes", [])
481
- if not item_attributes:
482
- raise NoSuchJobException(job_id)
483
- job = self._awsJobFromAttributes(item_attributes)
484
- if job is None:
485
- raise NoSuchJobException(job_id)
486
- logger.debug("Loaded job %s", job_id)
487
- return job
505
+ def write_file_stream(
506
+ self,
507
+ job_id: Optional[str] = None,
508
+ cleanup: bool = False,
509
+ basename: Optional[str] = None,
510
+ encoding: Optional[str] = None,
511
+ errors: Optional[str] = None,
512
+ ) -> Iterator[tuple[IO[bytes], str]]:
513
+ file_id = str(uuid.uuid4())
514
+ if job_id and cleanup:
515
+ self.associate_job_with_file(job_id, file_id)
516
+ prefix = self._key_in_bucket(
517
+ identifier=file_id,
518
+ prefix=self.content_key_prefix
519
+ )
488
520
 
489
- def update_job(self, job_description):
490
- logger.debug("Updating job %s", job_description.jobStoreID)
491
- job_description.pre_update_hook()
492
- job_attributes = self._awsJobToAttributes(job_description)
493
- update_attributes: list["ReplaceableAttributeTypeDef"] = [
494
- {"Name": attribute["Name"], "Value": attribute["Value"], "Replace": True}
495
- for attribute in job_attributes
496
- ]
497
- for attempt in retry_sdb():
498
- with attempt:
499
- self.db.put_attributes(
500
- DomainName=self.jobs_domain_name,
501
- ItemName=compat_bytes(job_description.jobStoreID),
502
- Attributes=update_attributes,
503
- )
521
+ pipe = MultiPartPipe(part_size=self.part_size,
522
+ s3_resource=self.s3_resource,
523
+ bucket_name=self.bucket_name,
524
+ file_id=f'{prefix}/{str(basename)}',
525
+ encryption_args=self._get_encryption_args(),
526
+ encoding=encoding,
527
+ errors=errors)
528
+ with pipe as writable:
529
+ yield writable, file_id
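A hedged usage sketch (argument names as defined above; the written bytes are streamed to s3 through MultiPartPipe in part_size chunks):

    with store.write_file_stream(job_id=job_id, cleanup=True, basename="stats.json") as (writable, file_id):
        writable.write(b'{"jobs": 42}')
    # file_id now refers to files/<file_id>/stats.json in the jobstore bucket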
504
530
 
505
- itemsPerBatchDelete = 25
531
+ @contextmanager
532
+ def update_file_stream(
533
+ self,
534
+ file_id: str,
535
+ encoding: Optional[str] = None,
536
+ errors: Optional[str] = None
537
+ ) -> Iterator[IO[Any]]:
538
+ logger.debug("Replacing file %s via multipart upload", file_id)
539
+ pipe = MultiPartPipe(
540
+ part_size=self.part_size,
541
+ s3_resource=self.s3_resource,
542
+ bucket_name=self.bucket_name,
543
+ file_id=self.find_s3_key_from_file_id(file_id),
544
+ encryption_args=self._get_encryption_args(),
545
+ encoding=encoding,
546
+ errors=errors,
547
+ )
548
+ with pipe as writable:
549
+ yield writable
506
550
 
507
- def delete_job(self, job_id):
508
- # remove job and replace with jobStoreId.
509
- logger.debug("Deleting job %s", job_id)
551
+ @contextmanager
552
+ def write_shared_file_stream(
553
+ self,
554
+ shared_file_name: str,
555
+ encrypted: Optional[bool] = None,
556
+ encoding: Optional[str] = None,
557
+ errors: Optional[str] = None,
558
+ ) -> Iterator[IO[bytes]]:
559
+ encryption_args = {} if encrypted is False else self._get_encryption_args()
560
+ pipe = MultiPartPipe(
561
+ part_size=self.part_size,
562
+ s3_resource=self.s3_resource,
563
+ bucket_name=self.bucket_name,
564
+ file_id=self._key_in_bucket(
565
+ identifier=shared_file_name,
566
+ prefix=self.shared_key_prefix,
567
+ ),
568
+ encryption_args=encryption_args,
569
+ encoding=encoding,
570
+ errors=errors,
571
+ )
572
+ with pipe as writable:
573
+ yield writable
510
574
 
511
- # If the job is overlarge, delete its file from the filestore
512
- for attempt in retry_sdb():
513
- with attempt:
514
- attributes = self.db.get_attributes(
515
- DomainName=self.jobs_domain_name,
516
- ItemName=compat_bytes(job_id),
517
- ConsistentRead=True,
518
- ).get("Attributes", [])
519
- # If the overlargeID has fallen off, maybe we partially deleted the
520
- # attributes of the item? Or raced on it? Or hit SimpleDB being merely
521
- # eventually consistent? We should still be able to get rid of it.
522
- self._checkAttributes(attributes, enforce=False)
523
- overlarge_id_value = get_item_from_attributes(
524
- attributes=attributes, name="overlargeID"
575
+ def update_file(self, file_id: str, local_path: str) -> None:
576
+ copy_local_to_s3(
577
+ s3_resource=self.s3_resource,
578
+ local_file_path=local_path,
579
+ dst_bucket=self.bucket_name,
580
+ dst_key=self.find_s3_key_from_file_id(file_id),
581
+ extra_args=self._get_encryption_args()
525
582
  )
526
- if overlarge_id_value:
527
- logger.debug("Deleting job from filestore")
528
- self.delete_file(overlarge_id_value)
529
- for attempt in retry_sdb():
530
- with attempt:
531
- self.db.delete_attributes(
532
- DomainName=self.jobs_domain_name, ItemName=compat_bytes(job_id)
533
- )
534
- items: Optional[list["ItemTypeDef"]] = None
535
- for attempt in retry_sdb():
536
- with attempt:
537
- items = list(
538
- boto3_pager(
539
- self.db.select,
540
- "Items",
541
- ConsistentRead=True,
542
- SelectExpression=f"select version from `{self.files_domain_name}` where ownerID='{job_id}'",
543
- )
544
- )
545
- assert items is not None
546
- if items:
547
- logger.debug(
548
- "Deleting %d file(s) associated with job %s", len(items), job_id
583
+
584
+ def file_exists(self, file_id: str) -> bool:
585
+ try:
586
+ # This throws if the file doesn't exist.
587
+ self.find_s3_key_from_file_id(file_id)
588
+ except NoSuchFileException:
589
+ # It didn't exist
590
+ return False
591
+ return True
592
+
593
+ def get_file_size(self, file_id: str) -> int:
594
+ """Do we need both get_file_size and _get_size???"""
595
+ full_s3_key = self.find_s3_key_from_file_id(file_id)
596
+ return self._get_size(url=urlparse(f's3://{self.bucket_name}/{full_s3_key}')) or 0
597
+
598
+ @classmethod
599
+ def _get_size(cls, url: ParseResult) -> Optional[int]:
600
+ """Do we need both get_file_size and _get_size???"""
601
+ try:
602
+ return get_object_for_url(url, existing=True).content_length
603
+ except (AWSKeyNotFoundError, NoSuchFileException):
604
+ return 0
605
+
606
+ def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> None:
607
+ full_s3_key = self.find_s3_key_from_file_id(file_id)
608
+ executable = getattr(file_id, "executable", False)
609
+ try:
610
+ copy_s3_to_local(
611
+ s3_resource=self.s3_resource,
612
+ local_file_path=local_path,
613
+ src_bucket=self.bucket_name,
614
+ src_key=full_s3_key,
615
+ extra_args=self._get_encryption_args()
549
616
  )
550
- n = self.itemsPerBatchDelete
551
- batches = [items[i : i + n] for i in range(0, len(items), n)]
552
- for batch in batches:
553
- delete_items: list["DeletableItemTypeDef"] = [
554
- {"Name": item["Name"]} for item in batch
555
- ]
556
- for attempt in retry_sdb():
557
- with attempt:
558
- self.db.batch_delete_attributes(
559
- DomainName=self.files_domain_name, Items=delete_items
560
- )
561
- for item in items:
562
- item: "ItemTypeDef"
563
- version = get_item_from_attributes(
564
- attributes=item["Attributes"], name="version"
565
- )
566
- for attempt in retry_s3():
567
- with attempt:
568
- if version:
569
- self.s3_client.delete_object(
570
- Bucket=self.files_bucket.name,
571
- Key=compat_bytes(item["Name"]),
572
- VersionId=version,
573
- )
574
- else:
575
- self.s3_client.delete_object(
576
- Bucket=self.files_bucket.name,
577
- Key=compat_bytes(item["Name"]),
578
- )
617
+ if executable:
618
+ os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR)
619
+ except self.s3_client.exceptions.NoSuchKey:
620
+ raise NoSuchFileException(file_id)
621
+ except ClientError as e:
622
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
623
+ raise NoSuchFileException(file_id)
624
+ else:
625
+ raise
579
626
 
580
- def get_empty_file_store_id(
581
- self, jobStoreID=None, cleanup=False, basename=None
582
- ) -> FileID:
583
- info = self.FileInfo.create(jobStoreID if cleanup else None)
584
- with info.uploadStream() as _:
585
- # Empty
586
- pass
587
- info.save()
588
- logger.debug("Created %r.", info)
589
- return info.fileID
627
+ @contextmanager # type: ignore
628
+ def read_file_stream( # type: ignore
629
+ self,
630
+ file_id: Union[FileID, str],
631
+ encoding: Optional[str] = None,
632
+ errors: Optional[str] = None,
633
+ ) -> Union[ContextManager[IO[bytes]], ContextManager[IO[str]]]:
634
+ full_s3_key = self.find_s3_key_from_file_id(file_id)
635
+ try:
636
+ with download_stream(self.s3_resource,
637
+ bucket=self.bucket_name,
638
+ key=full_s3_key,
639
+ extra_args=self._get_encryption_args(),
640
+ encoding=encoding,
641
+ errors=errors) as readable:
642
+ yield readable
643
+ except self.s3_client.exceptions.NoSuchKey:
644
+ raise NoSuchFileException(file_id)
645
+ except ClientError as e:
646
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
647
+ raise NoSuchFileException(file_id)
648
+ else:
649
+ raise
650
+
651
+ @overload
652
+ @contextmanager
653
+ def read_shared_file_stream(
654
+ self,
655
+ shared_file_name: str,
656
+ encoding: str,
657
+ errors: Optional[str] = None,
658
+ ) -> Iterator[IO[str]]: ...
659
+
660
+ @overload
661
+ @contextmanager
662
+ def read_shared_file_stream(
663
+ self,
664
+ shared_file_name: str,
665
+ encoding: Literal[None] = None,
666
+ errors: Optional[str] = None,
667
+ ) -> Iterator[IO[bytes]]: ...
668
+
669
+ @contextmanager
670
+ def read_shared_file_stream(
671
+ self,
672
+ shared_file_name: str,
673
+ encoding: Optional[str] = None,
674
+ errors: Optional[str] = None,
675
+ ) -> Iterator[Union[IO[bytes], IO[str]]]:
676
+ self._requireValidSharedFileName(shared_file_name)
677
+ key = self._key_in_bucket(identifier=shared_file_name, prefix=self.shared_key_prefix)
678
+ if not self.is_in_bucket(
679
+ identifier=shared_file_name,
680
+ prefix=self.shared_key_prefix,
681
+ ):
682
+ # TRAVIS=true TOIL_OWNER_TAG="shared" /home/quokka/git/toil/v3nv/bin/python -m pytest --durations=0 --log-level DEBUG --log-cli-level INFO -r s /home/quokka/git/toil/src/toil/test/jobStores/jobStoreTest.py::EncryptedAWSJobStoreTest::testJobDeletions
683
+ # throw NoSuchFileException in download_stream
684
+ raise NoSuchFileException(f's3://{self.bucket_name}/{key}')
685
+
686
+ try:
687
+ with download_stream(self.s3_resource,
688
+ bucket=self.bucket_name,
689
+ key=key,
690
+ encoding=encoding,
691
+ errors=errors,
692
+ extra_args=self._get_encryption_args()) as readable:
693
+ yield readable
694
+ except self.s3_client.exceptions.NoSuchKey:
695
+ raise NoSuchFileException(shared_file_name)
696
+ except ClientError as e:
697
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
698
+ raise NoSuchFileException(shared_file_name)
699
+ else:
700
+ raise
701
+
702
+ def delete_file(self, file_id: str) -> None:
703
+ try:
704
+ full_s3_key = self.find_s3_key_from_file_id(file_id)
705
+ except NoSuchFileException:
706
+ # The file is gone. That's great, we're idempotent.
707
+ return
708
+ self.s3_client.delete_object(Bucket=self.bucket_name, Key=full_s3_key)
709
+
710
+ ###################################### URI API ######################################
590
711
 
591
712
  def _import_file(
592
713
  self,
593
- otherCls,
714
+ otherCls: type[URLAccess],
594
715
  uri: ParseResult,
595
716
  shared_file_name: Optional[str] = None,
596
717
  hardlink: bool = False,
597
718
  symlink: bool = True,
598
719
  ) -> Optional[FileID]:
599
- try:
600
- if issubclass(otherCls, AWSJobStore):
601
- srcObj = get_object_for_url(uri, existing=True)
602
- size = srcObj.content_length
603
- if shared_file_name is None:
604
- info = self.FileInfo.create(srcObj.key)
605
- else:
606
- self._requireValidSharedFileName(shared_file_name)
607
- jobStoreFileID = self._shared_file_id(shared_file_name)
608
- info = self.FileInfo.loadOrCreate(
609
- jobStoreFileID=jobStoreFileID,
610
- ownerID=str(self.sharedFileOwnerID),
611
- encrypted=None,
612
- )
613
- info.copyFrom(srcObj)
614
- info.save()
615
- return FileID(info.fileID, size) if shared_file_name is None else None
616
- except (NoBucketLocationError, ServerSideCopyProhibitedError):
617
- # AWS refuses to tell us where the bucket is or do this copy for us
618
- logger.warning(
619
- "Falling back to copying via the local machine. This could get expensive!"
720
+ """
721
+ Upload a file into the s3 bucket jobstore from the source uri.
722
+
723
+ This db entry's existence should always be in sync with the file's existence (when one exists,
724
+ so must the other).
725
+ """
726
+ # we are copying from s3 to s3
727
+ if isinstance(otherCls, AWSJobStore):
728
+ src_bucket_name, src_key_name = parse_s3_uri(uri.geturl())
729
+ response = head_s3_object(self.s3_resource, bucket=src_bucket_name, key=src_key_name, check=True)
730
+ content_length = response['ContentLength'] # e.g. 65536
731
+
732
+ file_id = str(uuid.uuid4())
733
+ if shared_file_name:
734
+ dst_key = self._key_in_bucket(identifier=shared_file_name, prefix=self.shared_key_prefix)
735
+ else:
736
+ # cannot determine exec bit from foreign s3 so default to False
737
+ dst_key = "/".join([
738
+ self._key_in_bucket(identifier=file_id, prefix=self.content_key_prefix),
739
+ src_key_name.split("/")[-1],
740
+ ])
741
+
742
+ copy_s3_to_s3(
743
+ s3_resource=self.s3_resource,
744
+ src_bucket=src_bucket_name,
745
+ src_key=src_key_name,
746
+ dst_bucket=self.bucket_name,
747
+ dst_key=dst_key,
748
+ extra_args=self._get_encryption_args()
620
749
  )
750
+ # TODO: verify etag after copying here?
621
751
 
622
- # copy if exception
623
- return super()._import_file(otherCls, uri, shared_file_name=shared_file_name)
752
+ return FileID(file_id, content_length) if not shared_file_name else None
753
+ else:
754
+ return super(AWSJobStore, self)._import_file(
755
+ otherCls=otherCls,
756
+ uri=uri,
757
+ shared_file_name=shared_file_name,
758
+ hardlink=hardlink,
759
+ symlink=symlink
760
+ )
624
761
 
625
- def _export_file(self, otherCls, file_id: FileID, uri: ParseResult) -> None:
626
- try:
627
- if issubclass(otherCls, AWSJobStore):
628
- dstObj = get_object_for_url(uri)
629
- info = self.FileInfo.loadOrFail(file_id)
630
- info.copyTo(dstObj)
631
- return
632
- except (NoBucketLocationError, ServerSideCopyProhibitedError):
633
- # AWS refuses to tell us where the bucket is or do this copy for us
634
- logger.warning(
635
- "Falling back to copying via the local machine. This could get expensive!"
762
+ def _export_file(
763
+ self, otherCls: type[URLAccess], jobStoreFileID: FileID, url: ParseResult
764
+ ) -> None:
765
+ """Export a file_id in the jobstore to the url."""
766
+ if isinstance(otherCls, AWSJobStore):
767
+ src_full_s3_key = self.find_s3_key_from_file_id(jobStoreFileID)
768
+ dst_bucket_name, dst_key_name = parse_s3_uri(url.geturl())
769
+ copy_s3_to_s3(
770
+ s3_resource=self.s3_resource,
771
+ src_bucket=self.bucket_name,
772
+ src_key=src_full_s3_key,
773
+ dst_bucket=dst_bucket_name,
774
+ dst_key=dst_key_name,
775
+ extra_args=self._get_encryption_args()
636
776
  )
637
777
  else:
638
- super()._default_export_file(otherCls, file_id, uri)
778
+ super(AWSJobStore, self)._default_export_file(otherCls, jobStoreFileID, url)
639
779
 
640
- ###
641
- # URL access implementation
642
- ###
780
+ @classmethod
781
+ def _read_from_url(
782
+ cls, url: ParseResult, writable: Union[IO[bytes], IO[str]]
783
+ ) -> tuple[int, bool]:
784
+ src_obj = get_object_for_url(url, existing=True)
785
+ src_obj.download_fileobj(writable)
786
+ executable = False
787
+ return src_obj.content_length, executable
643
788
 
644
- # URL access methods aren't used by the rest of the job store methods.
789
+ @classmethod
790
+ def _write_to_url(
791
+ cls,
792
+ readable: Union[IO[bytes], IO[str]],
793
+ url: ParseResult,
794
+ executable: bool = False,
795
+ ) -> None:
796
+ dst_obj = get_object_for_url(url)
797
+ upload_to_s3(readable=readable,
798
+ s3_resource=establish_boto3_session().resource("s3"),
799
+ bucket=dst_obj.bucket_name,
800
+ key=dst_obj.key)
645
801
 
646
802
  @classmethod
647
803
  def _url_exists(cls, url: ParseResult) -> bool:
648
804
  try:
649
- try:
650
- get_object_for_url(url, existing=True, anonymous=True)
651
- except PermissionError:
652
- # If we can't look anonymously, log in
653
- get_object_for_url(url, existing=True)
805
+ get_object_for_url(url, existing=True)
654
806
  return True
655
807
  except FileNotFoundError:
656
808
  # Not a file
657
- # Might be a directory. Or we might not have access to know.
658
- # See if it's a directory.
809
+ # Might be a directory.
659
810
  return cls._get_is_directory(url)
660
811
 
661
- @classmethod
662
- def _get_size(cls, url: ParseResult) -> int:
663
- try:
664
- src_obj = get_object_for_url(url, existing=True, anonymous=True)
665
- except PermissionError:
666
- src_obj = get_object_for_url(url, existing=True)
667
- return src_obj.content_length
668
-
669
- @classmethod
670
- def _read_from_url(cls, url: ParseResult, writable):
671
- try:
672
- src_obj = get_object_for_url(url, existing=True, anonymous=True)
673
- src_obj.download_fileobj(writable)
674
- except Exception as e:
675
- if isinstance(e, PermissionError) or (isinstance(e, ClientError) and get_error_status(e) == 403):
676
- # The object setup or the download does not have permission. Try again with a login.
677
- src_obj = get_object_for_url(url, existing=True)
678
- src_obj.download_fileobj(writable)
679
- else:
680
- raise
681
- return (src_obj.content_length, False) # executable bit is always False
682
-
683
812
  @classmethod
684
813
  def _open_url(cls, url: ParseResult) -> IO[bytes]:
685
814
  try:
@@ -695,1415 +824,186 @@ class AWSJobStore(AbstractJobStore):
695
824
  # We should get back a response with a stream in 'Body'
696
825
  if "Body" not in response:
697
826
  raise RuntimeError(f"Could not fetch body stream for {url}")
698
- return response["Body"]
827
+ return response["Body"] # type: ignore
699
828
 
700
829
  @classmethod
701
- def _write_to_url(
702
- cls, readable, url: ParseResult, executable: bool = False
703
- ) -> None:
704
- # Don't try to do anonymous writes.
705
- dstObj = get_object_for_url(url)
706
-
707
- logger.debug("Uploading %s", dstObj.key)
708
- # uploadFile takes care of using multipart upload if the file is larger than partSize (default to 5MB)
709
- uploadFile(
710
- readable=readable,
711
- resource=s3_boto3_resource,
712
- bucketName=dstObj.bucket_name,
713
- fileID=dstObj.key,
714
- partSize=5 * 1000 * 1000,
715
- )
830
+ def _list_url(cls, url: ParseResult) -> list[str]:
831
+ return list_objects_for_url(url)
716
832
 
717
833
  @classmethod
718
- def _list_url(cls, url: ParseResult) -> list[str]:
834
+ def _supports_url(cls, url: ParseResult, export: bool = False) -> bool:
835
+ # TODO: export seems unused
836
+ return url.scheme.lower() == 's3'
837
+
838
+ def get_public_url(self, file_id: str) -> str:
839
+ """Turn s3:// into http:// and put a public-read ACL on it."""
719
840
  try:
720
- return list_objects_for_url(url, anonymous=True)
721
- except PermissionError:
722
- return list_objects_for_url(url)
723
-
841
+ return create_public_url(
842
+ self.s3_resource,
843
+ bucket=self.bucket_name,
844
+ key=self._key_in_bucket(
845
+ identifier=file_id,
846
+ prefix=self.content_key_prefix,
847
+ ),
848
+ )
849
+ except self.s3_client.exceptions.NoSuchKey:
850
+ raise NoSuchFileException(file_id)
851
+ except ClientError as e:
852
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
853
+ raise NoSuchFileException(file_id)
854
+ else:
855
+ raise
856
+
857
+ def get_shared_public_url(self, file_id: str) -> str:
858
+ """Turn s3:// into http:// and put a public-read ACL on it."""
859
+ # since this is only for a few files like "config.pickle"... why and what is this used for?
860
+ self._requireValidSharedFileName(file_id)
861
+ try:
862
+ return create_public_url(
863
+ self.s3_resource,
864
+ bucket=self.bucket_name,
865
+ key=self._key_in_bucket(
866
+ identifier=file_id,
867
+ prefix=self.shared_key_prefix,
868
+ ),
869
+ )
870
+ except self.s3_client.exceptions.NoSuchKey:
871
+ raise NoSuchFileException(file_id)
872
+ except ClientError as e:
873
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
874
+ raise NoSuchFileException(file_id)
875
+ else:
876
+ raise
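`create_public_url` is a helper in `toil.lib.aws.s3`, so its exact behavior is not shown in this diff; the general pattern it replaces (visible in the removed `get_public_url` further down) is to mark the object public-read and hand back a presigned GET URL. A rough, purely illustrative sketch of that pattern with plain boto3:

    import boto3


    def make_object_public(bucket: str, key: str, expires_seconds: int = 3600) -> str:
        """Put a public-read ACL on an object and return a presigned GET URL for it."""
        s3 = boto3.resource("s3")
        s3.Object(bucket, key).Acl().put(ACL="public-read")
        return s3.meta.client.generate_presigned_url(
            "get_object",
            Params={"Bucket": bucket, "Key": key},
            ExpiresIn=expires_seconds,
        )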
724
877
 
725
878
  @classmethod
726
879
  def _get_is_directory(cls, url: ParseResult) -> bool:
727
880
  # We consider it a directory if anything is in it.
728
881
  # TODO: Can we just get the first item and not the whole list?
729
- return len(cls._list_url(url)) > 0
730
-
731
- @classmethod
732
- def _supports_url(cls, url: ParseResult, export: bool = False) -> bool:
733
- return url.scheme.lower() == "s3"
882
+ return len(list_objects_for_url(url)) > 0
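The TODO above asks whether the directory check can avoid listing every object; with `list_objects_v2` it can, by capping the listing at a single key. A sketch, assuming the s3:// URL has already been split into a bucket and a prefix:

    import boto3


    def s3_prefix_is_nonempty(bucket: str, prefix: str) -> bool:
        """Return True if at least one key exists under the prefix."""
        client = boto3.client("s3")
        response = client.list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
        return response.get("KeyCount", 0) > 0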
734
883
 
735
- def write_file(
736
- self, local_path: FileID, job_id: Optional[FileID] = None, cleanup: bool = False
737
- ) -> FileID:
738
- info = self.FileInfo.create(job_id if cleanup else None)
739
- info.upload(local_path, not self.config.disableJobStoreChecksumVerification)
740
- info.save()
741
- logger.debug("Wrote %r of from %r", info, local_path)
742
- return info.fileID
743
-
744
- @contextmanager
745
- def write_file_stream(
884
+ def get_empty_file_store_id(
746
885
  self,
747
- job_id: Optional[FileID] = None,
886
+ job_id: Optional[str] = None,
748
887
  cleanup: bool = False,
749
- basename=None,
750
- encoding=None,
751
- errors=None,
752
- ):
753
- info = self.FileInfo.create(job_id if cleanup else None)
754
- with info.uploadStream(encoding=encoding, errors=errors) as writable:
755
- yield writable, info.fileID
756
- info.save()
757
- logger.debug("Wrote %r.", info)
758
-
759
- @contextmanager
760
- def write_shared_file_stream(
761
- self, shared_file_name, encrypted=None, encoding=None, errors=None
762
- ):
763
- self._requireValidSharedFileName(shared_file_name)
764
- info = self.FileInfo.loadOrCreate(
765
- jobStoreFileID=self._shared_file_id(shared_file_name),
766
- ownerID=str(self.sharedFileOwnerID),
767
- encrypted=encrypted,
768
- )
769
- with info.uploadStream(encoding=encoding, errors=errors) as writable:
770
- yield writable
771
- info.save()
772
- logger.debug("Wrote %r for shared file %r.", info, shared_file_name)
773
-
774
- def update_file(self, file_id, local_path):
775
- info = self.FileInfo.loadOrFail(file_id)
776
- info.upload(local_path, not self.config.disableJobStoreChecksumVerification)
777
- info.save()
778
- logger.debug("Wrote %r from path %r.", info, local_path)
779
-
780
- @contextmanager
781
- def update_file_stream(self, file_id, encoding=None, errors=None):
782
- info = self.FileInfo.loadOrFail(file_id)
783
- with info.uploadStream(encoding=encoding, errors=errors) as writable:
784
- yield writable
785
- info.save()
786
- logger.debug("Wrote %r from stream.", info)
787
-
788
- def file_exists(self, file_id):
789
- return self.FileInfo.exists(file_id)
790
-
791
- def get_file_size(self, file_id):
792
- if not self.file_exists(file_id):
793
- return 0
794
- info = self.FileInfo.loadOrFail(file_id)
795
- return info.getSize()
796
-
797
- def read_file(self, file_id, local_path, symlink=False):
798
- info = self.FileInfo.loadOrFail(file_id)
799
- logger.debug("Reading %r into %r.", info, local_path)
800
- info.download(local_path, not self.config.disableJobStoreChecksumVerification)
801
- if getattr(file_id, "executable", False):
802
- os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR)
803
-
804
- @contextmanager
805
- def read_file_stream(self, file_id, encoding=None, errors=None):
806
- info = self.FileInfo.loadOrFail(file_id)
807
- logger.debug("Reading %r into stream.", info)
808
- with info.downloadStream(encoding=encoding, errors=errors) as readable:
809
- yield readable
810
-
811
- @contextmanager
812
- def read_shared_file_stream(self, shared_file_name, encoding=None, errors=None):
813
- self._requireValidSharedFileName(shared_file_name)
814
- jobStoreFileID = self._shared_file_id(shared_file_name)
815
- info = self.FileInfo.loadOrFail(jobStoreFileID, customName=shared_file_name)
816
- logger.debug(
817
- "Reading %r for shared file %r into stream.", info, shared_file_name
818
- )
819
- with info.downloadStream(encoding=encoding, errors=errors) as readable:
820
- yield readable
821
-
822
- def delete_file(self, file_id):
823
- info = self.FileInfo.load(file_id)
824
- if info is None:
825
- logger.debug("File %s does not exist, skipping deletion.", file_id)
826
- else:
827
- info.delete()
828
-
829
- def write_logs(self, msg):
830
- info = self.FileInfo.create(str(self.statsFileOwnerID))
831
- with info.uploadStream(multipart=False) as writeable:
832
- if isinstance(msg, str):
833
- # This stream is for binary data, so encode any non-encoded things
834
- msg = msg.encode("utf-8", errors="ignore")
835
- writeable.write(msg)
836
- info.save()
837
-
838
- def read_logs(self, callback, read_all=False):
839
- itemsProcessed = 0
840
-
841
- for info in self._read_logs(callback, self.statsFileOwnerID):
842
- info._ownerID = str(self.readStatsFileOwnerID) # boto3 requires strings
843
- info.save()
844
- itemsProcessed += 1
845
-
846
- if read_all:
847
- for _ in self._read_logs(callback, self.readStatsFileOwnerID):
848
- itemsProcessed += 1
849
-
850
- return itemsProcessed
851
-
852
- def _read_logs(self, callback, ownerId):
853
- items = None
854
- for attempt in retry_sdb():
855
- with attempt:
856
- items = boto3_pager(
857
- self.db.select,
858
- "Items",
859
- ConsistentRead=True,
860
- SelectExpression=f"select * from `{self.files_domain_name}` where ownerID='{str(ownerId)}'",
861
- )
862
- assert items is not None
863
- for item in items:
864
- info = self.FileInfo.fromItem(item)
865
- with info.downloadStream() as readable:
866
- callback(readable)
867
- yield info
868
-
869
- # TODO: Make this retry more specific?
870
- # example: https://github.com/DataBiosphere/toil/issues/3378
871
- @retry()
872
- def get_public_url(self, jobStoreFileID):
873
- info = self.FileInfo.loadOrFail(jobStoreFileID)
874
- if info.content is not None:
875
- with info.uploadStream(allowInlining=False) as f:
876
- f.write(info.content)
877
-
878
- self.files_bucket.Object(compat_bytes(jobStoreFileID)).Acl().put(
879
- ACL="public-read"
888
+ basename: Optional[str] = None,
889
+ ) -> str:
890
+ """Create an empty file in s3 and return a bare string file ID."""
891
+ file_id = str(uuid.uuid4())
892
+ self.write_to_bucket(
893
+ identifier=f'{file_id}/0/{basename}',
894
+ prefix=self.content_key_prefix,
895
+ data=None,
896
+ bucket=self.bucket_name
880
897
  )
881
-
882
- url = self.s3_client.generate_presigned_url(
883
- "get_object",
884
- Params={
885
- "Bucket": self.files_bucket.name,
886
- "Key": compat_bytes(jobStoreFileID),
887
- "VersionId": info.version,
888
- },
889
- ExpiresIn=self.publicUrlExpiration.total_seconds(),
898
+ if job_id and cleanup:
899
+ self.associate_job_with_file(job_id, file_id)
900
+ return file_id
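`write_to_bucket` with `data=None` is part of the new job store's internal API; with plain boto3 the same effect, reserving a key for an empty file, is a `put_object` with an empty body. The bucket name and key layout below are illustrative only:

    import uuid

    import boto3

    client = boto3.client("s3")
    file_id = str(uuid.uuid4())
    # A zero-byte object is enough to reserve the file ID's key in the bucket.
    client.put_object(
        Bucket="example-jobstore--toil",
        Key=f"content/{file_id}/0/output.txt",
        Body=b"",
    )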
901
+
902
+ ###################################### LOGGING API ######################################
903
+
904
+ def write_logs(self, log_msg: Union[bytes, str]) -> None:
905
+ if isinstance(log_msg, str):
906
+ log_msg = log_msg.encode('utf-8', errors='ignore')
907
+ file_obj = BytesIO(log_msg)
908
+
909
+ key_name = self._key_in_bucket(
910
+ identifier=f'{datetime.datetime.now()}{str(uuid.uuid4())}'.replace(
911
+ ' ', '_'
912
+ ),
913
+ prefix=self.logs_key_prefix,
890
914
  )
915
+ self.s3_client.upload_fileobj(Bucket=self.bucket_name,
916
+ Key=key_name,
917
+ ExtraArgs=self._get_encryption_args(),
918
+ Fileobj=file_obj)
891
919
 
892
- # boto doesn't properly remove the x-amz-security-token parameter when
893
- # query_auth is False when using an IAM role (see issue #2043). Including the
894
- # x-amz-security-token parameter without the access key results in a 403,
895
- # even if the resource is public, so we need to remove it.
896
- scheme, netloc, path, query, fragment = urlsplit(url)
897
- params = parse_qs(query)
898
- if "x-amz-security-token" in params:
899
- del params["x-amz-security-token"]
900
- if "AWSAccessKeyId" in params:
901
- del params["AWSAccessKeyId"]
902
- if "Signature" in params:
903
- del params["Signature"]
904
- query = urlencode(params, doseq=True)
905
- url = urlunsplit((scheme, netloc, path, query, fragment))
906
- return url
907
-
908
- def get_shared_public_url(self, shared_file_name):
909
- self._requireValidSharedFileName(shared_file_name)
910
- return self.get_public_url(self._shared_file_id(shared_file_name))
911
-
912
- def _bindBucket(
913
- self,
914
- bucket_name: str,
915
- create: bool = False,
916
- block: bool = True,
917
- versioning: bool = False,
918
- check_versioning_consistency: bool = True,
919
- ):
920
+ def read_logs(self, callback: Callable[..., Any], read_all: bool = False) -> int:
920
921
  """
921
- Return the Boto Bucket object representing the S3 bucket with the given name. If the
922
- bucket does not exist and `create` is True, it will be created.
923
-
924
- :param str bucket_name: the name of the bucket to bind to
925
-
926
- :param bool create: Whether to create the bucket if it doesn't exist
927
-
928
- :param bool block: If False, return None if the bucket doesn't exist. If True, wait until
929
- bucket appears. Ignored if `create` is True.
930
-
931
- :rtype: Bucket|None
932
- :raises botocore.exceptions.ClientError: If `block` is True and the bucket still doesn't exist after the
933
- retry timeout expires.
922
+ This fetches all referenced logs in the database from s3 as readable objects
923
+ and runs "callback()" on them.
934
924
  """
935
- assert self.minBucketNameLen <= len(bucket_name) <= self.maxBucketNameLen
936
- assert self.bucketNameRe.match(bucket_name)
937
- logger.debug("Binding to job store bucket '%s'.", bucket_name)
938
-
939
- def bucket_retry_predicate(error):
940
- """
941
- Decide, given an error, whether we should retry binding the bucket.
942
- """
943
-
944
- if isinstance(error, ClientError) and get_error_status(error) in (404, 409):
945
- # Handle cases where the bucket creation is in a weird state that might let us proceed.
946
- # https://github.com/BD2KGenomics/toil/issues/955
947
- # https://github.com/BD2KGenomics/toil/issues/995
948
- # https://github.com/BD2KGenomics/toil/issues/1093
949
-
950
- # BucketAlreadyOwnedByYou == 409
951
- # OperationAborted == 409
952
- # NoSuchBucket == 404
953
- return True
954
- if get_error_code(error) == "SlowDown":
955
- # We may get told to SlowDown by AWS when we try to create our
956
- # bucket. In that case, we should retry and use the exponential
957
- # backoff.
958
- return True
959
- return False
960
-
961
- bucketExisted = True
962
- for attempt in retry_s3(predicate=bucket_retry_predicate):
963
- with attempt:
964
- try:
965
- # the head_bucket() call makes sure that the bucket exists and the user can access it
966
- self.s3_client.head_bucket(Bucket=bucket_name)
967
-
968
- bucket = self.s3_resource.Bucket(bucket_name)
969
- except ClientError as e:
970
- error_http_status = get_error_status(e)
971
- if error_http_status == 404:
972
- bucketExisted = False
973
- logger.debug("Bucket '%s' does not exist.", bucket_name)
974
- if create:
975
- bucket = create_s3_bucket(
976
- self.s3_resource, bucket_name, self.region
977
- )
978
- # Wait until the bucket exists before checking the region and adding tags
979
- bucket.wait_until_exists()
980
-
981
- # It is possible for create_bucket to return but
982
- # for an immediate request for the bucket region to
983
- # produce an S3ResponseError with code
984
- # NoSuchBucket. We let that kick us back up to the
985
- # main retry loop.
986
- assert (
987
- get_bucket_region(bucket_name) == self.region
988
- ), f"bucket_name: {bucket_name}, {get_bucket_region(bucket_name)} != {self.region}"
989
-
990
- tags = build_tag_dict_from_env()
991
-
992
- if tags:
993
- flat_tags = flatten_tags(tags)
994
- bucket_tagging = self.s3_resource.BucketTagging(
995
- bucket_name
996
- )
997
- bucket_tagging.put(Tagging={"TagSet": flat_tags})
998
-
999
- # Configure bucket so that we can make objects in
1000
- # it public, which was the historical default.
1001
- enable_public_objects(bucket_name)
1002
- elif block:
1003
- raise
1004
- else:
1005
- return None
1006
- elif error_http_status == 301:
1007
- # This is raised if the user attempts to get a bucket in a region outside
1008
- # the specified one, if the specified one is not `us-east-1`. The us-east-1
1009
- # server allows a user to use buckets from any region.
1010
- raise BucketLocationConflictException(
1011
- get_bucket_region(bucket_name)
1012
- )
1013
- else:
1014
- raise
1015
- else:
1016
- bucketRegion = get_bucket_region(bucket_name)
1017
- if bucketRegion != self.region:
1018
- raise BucketLocationConflictException(bucketRegion)
1019
-
1020
- if versioning and not bucketExisted:
1021
- # only call this method on bucket creation
1022
- bucket.Versioning().enable()
1023
- # Now wait until versioning is actually on. Some uploads
1024
- # would come back with no versions; maybe they were
1025
- # happening too fast and this setting isn't sufficiently
1026
- # consistent?
1027
- time.sleep(1)
1028
- while not self._getBucketVersioning(bucket_name):
1029
- logger.warning(
1030
- f"Waiting for versioning activation on bucket '{bucket_name}'..."
1031
- )
1032
- time.sleep(1)
1033
- elif check_versioning_consistency:
1034
- # now test for versioning consistency
1035
- # we should never see any of these errors since 'versioning' should always be true
1036
- bucket_versioning = self._getBucketVersioning(bucket_name)
1037
- if bucket_versioning != versioning:
1038
- assert False, "Cannot modify versioning on existing bucket"
1039
- elif bucket_versioning is None:
1040
- assert False, "Cannot use a bucket with versioning suspended"
1041
- if bucketExisted:
1042
- logger.debug(
1043
- f"Using pre-existing job store bucket '{bucket_name}'."
1044
- )
1045
- else:
1046
- logger.debug(
1047
- f"Created new job store bucket '{bucket_name}' with versioning state {versioning}."
1048
- )
1049
-
1050
- return bucket
1051
-
1052
- def _bindDomain(
1053
- self, domain_name: str, create: bool = False, block: bool = True
1054
- ) -> None:
925
+ items_processed = 0
926
+ LOG_MARKER = "most_recently_read_log.marker"
927
+ read_log_marker = "0"
928
+ if not read_all:
929
+ # We want to pick up reading where we left off
930
+ try:
931
+ read_log_marker = self.read_from_bucket(
932
+ identifier=LOG_MARKER,
933
+ prefix=self.shared_key_prefix
934
+ ).decode('utf-8')
935
+ except self.s3_client.exceptions.NoSuchKey:
936
+ # We haven't recorded that we've read anything yet.
937
+ # Leave read_log_marker at "0"
938
+ pass
939
+
940
+ startafter = None if read_log_marker == "0" else read_log_marker
941
+ for result in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=self.logs_key_prefix, startafter=startafter):
942
+ if result['Key'] > read_log_marker or read_all:
943
+ read_log_marker = result['Key']
944
+ with download_stream(self.s3_resource,
945
+ bucket=self.bucket_name,
946
+ key=result['Key'],
947
+ extra_args=self._get_encryption_args()) as readable:
948
+ callback(readable)
949
+ items_processed += 1
950
+
951
+ if items_processed > 0:
952
+ # We processed something, so we need to update the marker.
953
+ self.write_to_bucket(identifier=LOG_MARKER,
954
+ prefix=self.shared_key_prefix,
955
+ data=read_log_marker)
956
+ return items_processed
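The marker logic above works because S3 lists keys in ascending UTF-8 order and the log keys begin with a timestamp, so passing the last key read as `StartAfter` skips everything already processed. A sketch of that listing pattern with a paginator; the bucket, prefix, and marker values are illustrative:

    from typing import Optional

    import boto3


    def keys_after_marker(bucket: str, prefix: str, start_after: Optional[str] = None) -> list[str]:
        """List keys under a prefix, optionally skipping past a previously seen key."""
        client = boto3.client("s3")
        paginator = client.get_paginator("list_objects_v2")
        kwargs = {"Bucket": bucket, "Prefix": prefix}
        if start_after is not None:
            kwargs["StartAfter"] = start_after
        keys = []
        for page in paginator.paginate(**kwargs):
            for item in page.get("Contents", []):
                keys.append(item["Key"])
        return keys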
957
+
958
+ def _get_encryption_args(self) -> dict[str, Any]:
1055
959
  """
1056
- Return the Boto3 domain name representing the SDB domain. When create=True, it will
1057
- create the domain if it does not exist.
1058
- Return the Boto Domain object representing the SDB domain of the given name. If the
1059
- domain does not exist and `create` is True, it will be created.
1060
-
1061
- :param str domain_name: the name of the domain to bind to
960
+ Get the encryption arguments to pass to an AWS function.
1062
961
 
1063
- :param bool create: True if domain should be created if it doesn't exist
962
+ Reads live from the SSE key file referenced by the config.
1064
963
 
1065
- :param bool block: If False, raise DomainDoesNotExist if the domain doesn't exist. If True, wait until
1066
- domain appears. This parameter is ignored if create is True.
964
+ If the config is not available, returns an empty dict.
1067
965
 
1068
- :rtype: None
1069
- :raises ClientError: If `block` is True and the domain still doesn't exist after the
1070
- retry timeout expires.
1071
- """
1072
- logger.debug("Binding to job store domain '%s'.", domain_name)
1073
- retryargs = dict(
1074
- predicate=lambda e: no_such_sdb_domain(e) or sdb_unavailable(e)
1075
- )
1076
- if not block:
1077
- retryargs["timeout"] = 15
1078
- for attempt in retry_sdb(**retryargs):
1079
- with attempt:
1080
- try:
1081
- self.db.domain_metadata(DomainName=domain_name)
1082
- return
1083
- except ClientError as e:
1084
- if no_such_sdb_domain(e):
1085
- if create:
1086
- self.db.create_domain(DomainName=domain_name)
1087
- return
1088
- elif block:
1089
- raise
1090
- else:
1091
- raise DomainDoesNotExist(domain_name)
1092
- else:
1093
- raise
1094
-
1095
- def _new_job_id(self):
1096
- return str(uuid.uuid4())
1097
-
1098
- # A dummy job ID under which all shared files are stored
1099
- sharedFileOwnerID = uuid.UUID("891f7db6-e4d9-4221-a58e-ab6cc4395f94")
1100
-
1101
- # A dummy job ID under which all unread stats files are stored
1102
- statsFileOwnerID = uuid.UUID("bfcf5286-4bc7-41ef-a85d-9ab415b69d53")
1103
-
1104
- # A dummy job ID under which all read stats files are stored
1105
- readStatsFileOwnerID = uuid.UUID("e77fc3aa-d232-4255-ae04-f64ee8eb0bfa")
1106
-
1107
- def _shared_file_id(self, shared_file_name):
1108
- return str(uuid.uuid5(self.sharedFileOwnerID, shared_file_name))
1109
-
1110
- @InnerClass
1111
- class FileInfo(SDBHelper):
966
+ :raises ValueError: If the key data is not formatted correctly.
1112
967
  """
1113
- Represents a file in this job store.
1114
- """
1115
-
1116
- outer = None
1117
- """
1118
- :type: AWSJobStore
1119
- """
1120
-
1121
- def __init__(
1122
- self,
1123
- fileID,
1124
- ownerID,
1125
- encrypted,
1126
- version=None,
1127
- content=None,
1128
- numContentChunks=0,
1129
- checksum=None,
1130
- ):
1131
- """
1132
- :type fileID: str
1133
- :param fileID: the file's ID
1134
-
1135
- :type ownerID: str
1136
- :param ownerID: ID of the entity owning this file, typically a job ID aka jobStoreID
1137
-
1138
- :type encrypted: bool
1139
- :param encrypted: whether the file is stored in encrypted form
1140
-
1141
- :type version: str|None
1142
- :param version: a non-empty string containing the most recent version of the S3
1143
- object storing this file's content, None if the file is new, or empty string if the
1144
- file is inlined.
1145
-
1146
- :type content: str|None
1147
- :param content: this file's inlined content
1148
-
1149
- :type numContentChunks: int
1150
- :param numContentChunks: the number of SDB domain attributes occupied by this files
1151
-
1152
- :type checksum: str|None
1153
- :param checksum: the checksum of the file, if available. Formatted
1154
- as <algorithm>$<lowercase hex hash>.
1155
-
1156
- inlined content. Note that an inlined empty string still occupies one chunk.
1157
- """
1158
- super().__init__()
1159
- self._fileID = fileID
1160
- self._ownerID = ownerID
1161
- self.encrypted = encrypted
1162
- self._version = version
1163
- self._previousVersion = version
1164
- self._content = content
1165
- self._checksum = checksum
1166
- self._numContentChunks = numContentChunks
1167
-
1168
- @property
1169
- def fileID(self):
1170
- return self._fileID
1171
-
1172
- @property
1173
- def ownerID(self):
1174
- return self._ownerID
1175
-
1176
- @property
1177
- def version(self):
1178
- return self._version
1179
-
1180
- @version.setter
1181
- def version(self, version):
1182
- # Version should only change once
1183
- assert self._previousVersion == self._version
1184
- self._version = version
1185
- if version:
1186
- self.content = None
1187
-
1188
- @property
1189
- def previousVersion(self):
1190
- return self._previousVersion
1191
-
1192
- @property
1193
- def content(self):
1194
- return self._content
1195
-
1196
- @property
1197
- def checksum(self):
1198
- return self._checksum
1199
-
1200
- @checksum.setter
1201
- def checksum(self, checksum):
1202
- self._checksum = checksum
1203
-
1204
- @content.setter
1205
- def content(self, content):
1206
- assert content is None or isinstance(content, bytes)
1207
- self._content = content
1208
- if content is not None:
1209
- self.version = ""
1210
-
1211
- @classmethod
1212
- def create(cls, ownerID: str):
1213
- return cls(
1214
- str(uuid.uuid4()), ownerID, encrypted=cls.outer.sseKeyPath is not None
1215
- )
1216
-
1217
- @classmethod
1218
- def presenceIndicator(cls):
1219
- return "encrypted"
1220
-
1221
- @classmethod
1222
- def exists(cls, jobStoreFileID):
1223
- for attempt in retry_sdb():
1224
- with attempt:
1225
- return bool(
1226
- cls.outer.db.get_attributes(
1227
- DomainName=cls.outer.files_domain_name,
1228
- ItemName=compat_bytes(jobStoreFileID),
1229
- AttributeNames=[cls.presenceIndicator()],
1230
- ConsistentRead=True,
1231
- ).get("Attributes", [])
1232
- )
1233
-
1234
- @classmethod
1235
- def load(cls, jobStoreFileID):
1236
- for attempt in retry_sdb():
1237
- with attempt:
1238
- self = cls.fromItem(
1239
- {
1240
- "Name": compat_bytes(jobStoreFileID),
1241
- "Attributes": cls.outer.db.get_attributes(
1242
- DomainName=cls.outer.files_domain_name,
1243
- ItemName=compat_bytes(jobStoreFileID),
1244
- ConsistentRead=True,
1245
- ).get("Attributes", []),
1246
- }
1247
- )
1248
- return self
1249
-
1250
- @classmethod
1251
- def loadOrCreate(cls, jobStoreFileID, ownerID, encrypted):
1252
- self = cls.load(jobStoreFileID)
1253
- if encrypted is None:
1254
- encrypted = cls.outer.sseKeyPath is not None
1255
- if self is None:
1256
- self = cls(jobStoreFileID, ownerID, encrypted=encrypted)
1257
- else:
1258
- assert self.fileID == jobStoreFileID
1259
- assert self.ownerID == ownerID
1260
- self.encrypted = encrypted
1261
- return self
1262
-
1263
- @classmethod
1264
- def loadOrFail(cls, jobStoreFileID, customName=None):
1265
- """
1266
- :rtype: AWSJobStore.FileInfo
1267
- :return: an instance of this class representing the file with the given ID
1268
- :raises NoSuchFileException: if given file does not exist
1269
- """
1270
- self = cls.load(jobStoreFileID)
1271
- if self is None:
1272
- raise NoSuchFileException(jobStoreFileID, customName=customName)
1273
- else:
1274
- return self
1275
-
1276
- @classmethod
1277
- def fromItem(cls, item: "ItemTypeDef"):
1278
- """
1279
- Convert an SDB item to an instance of this class.
1280
-
1281
- :type item: Item
1282
- """
1283
- assert item is not None
1284
-
1285
- # Strings come back from SDB as unicode
1286
- def strOrNone(s):
1287
- return s if s is None else str(s)
1288
-
1289
- # ownerID and encrypted are the only mandatory attributes
1290
- ownerID, encrypted, version, checksum = SDBHelper.get_attributes_from_item(
1291
- item, ["ownerID", "encrypted", "version", "checksum"]
1292
- )
1293
- if ownerID is None:
1294
- assert encrypted is None
1295
- return None
1296
- else:
1297
- encrypted = strict_bool(encrypted)
1298
- content, numContentChunks = cls.attributesToBinary(item["Attributes"])
1299
- if encrypted:
1300
- sseKeyPath = cls.outer.sseKeyPath
1301
- if sseKeyPath is None:
1302
- raise AssertionError(
1303
- "Content is encrypted but no key was provided."
1304
- )
1305
- if content is not None:
1306
- content = encryption.decrypt(content, sseKeyPath)
1307
- self = cls(
1308
- fileID=item["Name"],
1309
- ownerID=ownerID,
1310
- encrypted=encrypted,
1311
- version=version,
1312
- content=content,
1313
- numContentChunks=numContentChunks,
1314
- checksum=checksum,
1315
- )
1316
- return self
1317
-
1318
- def toItem(self) -> tuple[dict[str, str], int]:
1319
- """
1320
- Convert this instance to a dictionary of attribute names to values
1321
-
1322
- :return: the attributes dict and an integer specifying the number of chunk
1323
- attributes in the dictionary that are used for storing inlined content.
1324
- """
1325
- content = self.content
1326
- assert content is None or isinstance(content, bytes)
1327
- if self.encrypted and content is not None:
1328
- sseKeyPath = self.outer.sseKeyPath
1329
- if sseKeyPath is None:
1330
- raise AssertionError(
1331
- "Encryption requested but no key was provided."
1332
- )
1333
- content = encryption.encrypt(content, sseKeyPath)
1334
- assert content is None or isinstance(content, bytes)
1335
- attributes = self.binaryToAttributes(content)
1336
- numChunks = int(attributes["numChunks"])
1337
- attributes.update(
1338
- dict(
1339
- ownerID=self.ownerID or "",
1340
- encrypted=str(self.encrypted),
1341
- version=self.version or "",
1342
- checksum=self.checksum or "",
1343
- )
1344
- )
1345
- return attributes, numChunks
1346
-
1347
- @classmethod
1348
- def _reservedAttributes(cls):
1349
- return 3 + super()._reservedAttributes()
1350
-
1351
- @staticmethod
1352
- def maxInlinedSize():
1353
- return 256
1354
-
1355
- def save(self):
1356
- attributes, numNewContentChunks = self.toItem()
1357
- attributes_boto3 = SDBHelper.attributeDictToList(attributes)
1358
- # False stands for absence
1359
- if self.previousVersion is None:
1360
- expected: "UpdateConditionTypeDef" = {
1361
- "Name": "version",
1362
- "Exists": False,
1363
- }
1364
- else:
1365
- expected = {"Name": "version", "Value": cast(str, self.previousVersion)}
1366
- try:
1367
- for attempt in retry_sdb():
1368
- with attempt:
1369
- self.outer.db.put_attributes(
1370
- DomainName=self.outer.files_domain_name,
1371
- ItemName=compat_bytes(self.fileID),
1372
- Attributes=[
1373
- {
1374
- "Name": attribute["Name"],
1375
- "Value": attribute["Value"],
1376
- "Replace": True,
1377
- }
1378
- for attribute in attributes_boto3
1379
- ],
1380
- Expected=expected,
1381
- )
1382
- # clean up the old version of the file if necessary and safe
1383
- if self.previousVersion and (self.previousVersion != self.version):
1384
- for attempt in retry_s3():
1385
- with attempt:
1386
- self.outer.s3_client.delete_object(
1387
- Bucket=self.outer.files_bucket.name,
1388
- Key=compat_bytes(self.fileID),
1389
- VersionId=self.previousVersion,
1390
- )
1391
- self._previousVersion = self._version
1392
- if numNewContentChunks < self._numContentChunks:
1393
- residualChunks = range(numNewContentChunks, self._numContentChunks)
1394
- residual_chunk_names = [self._chunkName(i) for i in residualChunks]
1395
- # boto3 requires providing the value as well as the name in the attribute, and we don't store it locally
1396
- # the php sdk resolves this issue by not requiring the Value key https://github.com/aws/aws-sdk-php/issues/185
1397
- # but this doesn't extend to boto3
1398
- delete_attributes = self.outer.db.get_attributes(
1399
- DomainName=self.outer.files_domain_name,
1400
- ItemName=compat_bytes(self.fileID),
1401
- AttributeNames=[chunk for chunk in residual_chunk_names],
1402
- ).get("Attributes")
1403
- for attempt in retry_sdb():
1404
- with attempt:
1405
- self.outer.db.delete_attributes(
1406
- DomainName=self.outer.files_domain_name,
1407
- ItemName=compat_bytes(self.fileID),
1408
- Attributes=delete_attributes,
1409
- )
1410
- self.outer.db.get_attributes(
1411
- DomainName=self.outer.files_domain_name,
1412
- ItemName=compat_bytes(self.fileID),
1413
- )
1414
-
1415
- self._numContentChunks = numNewContentChunks
1416
- except ClientError as e:
1417
- if get_error_code(e) == "ConditionalCheckFailed":
1418
- raise ConcurrentFileModificationException(self.fileID)
1419
- else:
1420
- raise
1421
-
1422
- def upload(self, localFilePath, calculateChecksum=True):
1423
- file_size, file_time = fileSizeAndTime(localFilePath)
1424
- if file_size <= self.maxInlinedSize():
1425
- with open(localFilePath, "rb") as f:
1426
- self.content = f.read()
1427
- # Clear out any old checksum in case of overwrite
1428
- self.checksum = ""
1429
- else:
1430
- headerArgs = self._s3EncryptionArgs()
1431
- # Create a new Resource in case it needs to be on its own thread
1432
- resource = boto3_session.resource("s3", region_name=self.outer.region)
1433
-
1434
- self.checksum = (
1435
- self._get_file_checksum(localFilePath)
1436
- if calculateChecksum
1437
- else None
1438
- )
1439
- self.version = uploadFromPath(
1440
- localFilePath,
1441
- resource=resource,
1442
- bucketName=self.outer.files_bucket.name,
1443
- fileID=compat_bytes(self.fileID),
1444
- headerArgs=headerArgs,
1445
- partSize=self.outer.part_size,
1446
- )
1447
-
1448
- def _start_checksum(self, to_match=None, algorithm="sha1"):
1449
- """
1450
- Get a hasher that can be used with _update_checksum and
1451
- _finish_checksum.
1452
-
1453
- If to_match is set, it is a precomputed checksum which we expect
1454
- the result to match.
1455
-
1456
- The right way to compare checksums is to feed in the checksum to be
1457
- matched, so we can see its algorithm, instead of getting a new one
1458
- and comparing. If a checksum to match is fed in, _finish_checksum()
1459
- will raise a ChecksumError if it isn't matched.
1460
- """
1461
-
1462
- # If we have an expected result it will go here.
1463
- expected = None
1464
-
1465
- if to_match is not None:
1466
- parts = to_match.split("$")
1467
- algorithm = parts[0]
1468
- expected = parts[1]
1469
-
1470
- wrapped = getattr(hashlib, algorithm)()
1471
- logger.debug(f"Starting {algorithm} checksum to match {expected}")
1472
- return algorithm, wrapped, expected
1473
-
1474
- def _update_checksum(self, checksum_in_progress, data):
1475
- """
1476
- Update a checksum in progress from _start_checksum with new data.
1477
- """
1478
- checksum_in_progress[1].update(data)
1479
-
1480
- def _finish_checksum(self, checksum_in_progress):
1481
- """
1482
- Complete a checksum in progress from _start_checksum and return the
1483
- checksum result string.
1484
- """
1485
-
1486
- result_hash = checksum_in_progress[1].hexdigest()
1487
-
1488
- logger.debug(
1489
- f"Completed checksum with hash {result_hash} vs. expected {checksum_in_progress[2]}"
1490
- )
1491
- if checksum_in_progress[2] is not None:
1492
- # We expected a particular hash
1493
- if result_hash != checksum_in_progress[2]:
1494
- raise ChecksumError(
1495
- "Checksum mismatch. Expected: %s Actual: %s"
1496
- % (checksum_in_progress[2], result_hash)
1497
- )
1498
-
1499
- return "$".join([checksum_in_progress[0], result_hash])
1500
-
1501
- def _get_file_checksum(self, localFilePath, to_match=None):
1502
- with open(localFilePath, "rb") as f:
1503
- hasher = self._start_checksum(to_match=to_match)
1504
- contents = f.read(1024 * 1024)
1505
- while contents != b"":
1506
- self._update_checksum(hasher, contents)
1507
- contents = f.read(1024 * 1024)
1508
- return self._finish_checksum(hasher)
1509
-
1510
- @contextmanager
1511
- def uploadStream(
1512
- self, multipart=True, allowInlining=True, encoding=None, errors=None
1513
- ):
1514
- """
1515
- Context manager that gives out a binary or text mode upload stream to upload data.
1516
- """
1517
-
1518
- # Note that we have to handle already having a content or a version
1519
- # if we are overwriting something.
1520
-
1521
- # But make sure we don't have both.
1522
- assert not (bool(self.version) and self.content is not None)
1523
-
1524
- info = self
1525
- store = self.outer
1526
-
1527
- class MultiPartPipe(WritablePipe):
1528
- def readFrom(self, readable):
1529
- # Get the first block of data we want to put
1530
- buf = readable.read(store.part_size)
1531
- assert isinstance(buf, bytes)
1532
-
1533
- if allowInlining and len(buf) <= info.maxInlinedSize():
1534
- logger.debug("Inlining content of %d bytes", len(buf))
1535
- info.content = buf
1536
- # There will be no checksum
1537
- info.checksum = ""
1538
- else:
1539
- # We will compute a checksum
1540
- hasher = info._start_checksum()
1541
- logger.debug("Updating checksum with %d bytes", len(buf))
1542
- info._update_checksum(hasher, buf)
1543
-
1544
- client = store.s3_client
1545
- bucket_name = store.files_bucket.name
1546
- headerArgs = info._s3EncryptionArgs()
1547
-
1548
- for attempt in retry_s3():
1549
- with attempt:
1550
- logger.debug("Starting multipart upload")
1551
- # low-level clients are thread safe
1552
- upload = client.create_multipart_upload(
1553
- Bucket=bucket_name,
1554
- Key=compat_bytes(info.fileID),
1555
- **headerArgs,
1556
- )
1557
- uploadId = upload["UploadId"]
1558
- parts = []
1559
- logger.debug("Multipart upload started as %s", uploadId)
1560
-
1561
- for attempt in retry_s3():
1562
- with attempt:
1563
- for i in range(CONSISTENCY_TICKS):
1564
- # Sometimes we can create a multipart upload and not see it. Wait around for it.
1565
- response = client.list_multipart_uploads(
1566
- Bucket=bucket_name,
1567
- MaxUploads=1,
1568
- Prefix=compat_bytes(info.fileID),
1569
- )
1570
- if (
1571
- "Uploads" in response
1572
- and len(response["Uploads"]) != 0
1573
- and response["Uploads"][0]["UploadId"]
1574
- == uploadId
1575
- ):
1576
-
1577
- logger.debug(
1578
- "Multipart upload visible as %s", uploadId
1579
- )
1580
- break
1581
- else:
1582
- logger.debug(
1583
- "Multipart upload %s is not visible; we see %s",
1584
- uploadId,
1585
- response.get("Uploads"),
1586
- )
1587
- time.sleep(CONSISTENCY_TIME * 2**i)
1588
-
1589
- try:
1590
- for part_num in itertools.count():
1591
- for attempt in retry_s3():
1592
- with attempt:
1593
- logger.debug(
1594
- "Uploading part %d of %d bytes to %s",
1595
- part_num + 1,
1596
- len(buf),
1597
- uploadId,
1598
- )
1599
- # TODO: include the Content-MD5 header:
1600
- # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload
1601
- part = client.upload_part(
1602
- Bucket=bucket_name,
1603
- Key=compat_bytes(info.fileID),
1604
- PartNumber=part_num + 1,
1605
- UploadId=uploadId,
1606
- Body=BytesIO(buf),
1607
- **headerArgs,
1608
- )
1609
-
1610
- parts.append(
1611
- {
1612
- "PartNumber": part_num + 1,
1613
- "ETag": part["ETag"],
1614
- }
1615
- )
1616
-
1617
- # Get the next block of data we want to put
1618
- buf = readable.read(info.outer.part_size)
1619
- assert isinstance(buf, bytes)
1620
- if len(buf) == 0:
1621
- # Don't allow any part other than the very first to be empty.
1622
- break
1623
- info._update_checksum(hasher, buf)
1624
- except:
1625
- with panic(log=logger):
1626
- for attempt in retry_s3():
1627
- with attempt:
1628
- client.abort_multipart_upload(
1629
- Bucket=bucket_name,
1630
- Key=compat_bytes(info.fileID),
1631
- UploadId=uploadId,
1632
- )
1633
-
1634
- else:
1635
-
1636
- while not store._getBucketVersioning(
1637
- store.files_bucket.name
1638
- ):
1639
- logger.warning(
1640
- "Versioning does not appear to be enabled yet. Deferring multipart "
1641
- "upload completion..."
1642
- )
1643
- time.sleep(1)
1644
-
1645
- # Save the checksum
1646
- info.checksum = info._finish_checksum(hasher)
1647
-
1648
- for attempt in retry_s3(timeout=600):
1649
- # Wait here for a bit longer if S3 breaks,
1650
- # because we have been known to flake out here
1651
- # in tests
1652
- # (https://github.com/DataBiosphere/toil/issues/3894)
1653
- with attempt:
1654
- logger.debug("Attempting to complete upload...")
1655
- completed = client.complete_multipart_upload(
1656
- Bucket=bucket_name,
1657
- Key=compat_bytes(info.fileID),
1658
- UploadId=uploadId,
1659
- MultipartUpload={"Parts": parts},
1660
- )
1661
-
1662
- logger.debug(
1663
- "Completed upload object of type %s: %s",
1664
- str(type(completed)),
1665
- repr(completed),
1666
- )
1667
- info.version = completed.get("VersionId")
1668
- logger.debug(
1669
- "Completed upload with version %s",
1670
- str(info.version),
1671
- )
1672
-
1673
- if info.version is None:
1674
- # Somehow we don't know the version. Try and get it.
1675
- for attempt in retry_s3(
1676
- predicate=lambda e: retryable_s3_errors(e)
1677
- or isinstance(e, AssertionError)
1678
- ):
1679
- with attempt:
1680
- version = client.head_object(
1681
- Bucket=bucket_name,
1682
- Key=compat_bytes(info.fileID),
1683
- **headerArgs,
1684
- ).get("VersionId", None)
1685
- logger.warning(
1686
- "Loaded key for upload with no version and got version %s",
1687
- str(version),
1688
- )
1689
- info.version = version
1690
- assert info.version is not None
1691
-
1692
- # Make sure we actually wrote something, even if an empty file
1693
- assert bool(info.version) or info.content is not None
1694
-
1695
- class SinglePartPipe(WritablePipe):
1696
- def readFrom(self, readable):
1697
- buf = readable.read()
1698
- assert isinstance(buf, bytes)
1699
- dataLength = len(buf)
1700
- if allowInlining and dataLength <= info.maxInlinedSize():
1701
- logger.debug("Inlining content of %d bytes", len(buf))
1702
- info.content = buf
1703
- # There will be no checksum
1704
- info.checksum = ""
1705
- else:
1706
- # We will compute a checksum
1707
- hasher = info._start_checksum()
1708
- info._update_checksum(hasher, buf)
1709
- info.checksum = info._finish_checksum(hasher)
1710
-
1711
- bucket_name = store.files_bucket.name
1712
- headerArgs = info._s3EncryptionArgs()
1713
- client = store.s3_client
1714
-
1715
- buf = BytesIO(buf)
1716
-
1717
- while not store._getBucketVersioning(bucket_name):
1718
- logger.warning(
1719
- "Versioning does not appear to be enabled yet. Deferring single part "
1720
- "upload..."
1721
- )
1722
- time.sleep(1)
1723
-
1724
- for attempt in retry_s3():
1725
- with attempt:
1726
- logger.debug(
1727
- "Uploading single part of %d bytes", dataLength
1728
- )
1729
- client.upload_fileobj(
1730
- Bucket=bucket_name,
1731
- Key=compat_bytes(info.fileID),
1732
- Fileobj=buf,
1733
- ExtraArgs=headerArgs,
1734
- )
1735
-
1736
- # use head_object with the SSE headers to access versionId and content_length attributes
1737
- headObj = client.head_object(
1738
- Bucket=bucket_name,
1739
- Key=compat_bytes(info.fileID),
1740
- **headerArgs,
1741
- )
1742
- assert dataLength == headObj.get("ContentLength", None)
1743
- info.version = headObj.get("VersionId", None)
1744
- logger.debug(
1745
- "Upload received version %s", str(info.version)
1746
- )
1747
-
1748
- if info.version is None:
1749
- # Somehow we don't know the version
1750
- for attempt in retry_s3(
1751
- predicate=lambda e: retryable_s3_errors(e)
1752
- or isinstance(e, AssertionError)
1753
- ):
1754
- with attempt:
1755
- headObj = client.head_object(
1756
- Bucket=bucket_name,
1757
- Key=compat_bytes(info.fileID),
1758
- **headerArgs,
1759
- )
1760
- info.version = headObj.get("VersionId", None)
1761
- logger.warning(
1762
- "Reloaded key with no version and got version %s",
1763
- str(info.version),
1764
- )
1765
- assert info.version is not None
1766
-
1767
- # Make sure we actually wrote something, even if an empty file
1768
- assert bool(info.version) or info.content is not None
1769
-
1770
- if multipart:
1771
- pipe = MultiPartPipe(encoding=encoding, errors=errors)
1772
- else:
1773
- pipe = SinglePartPipe(encoding=encoding, errors=errors)
968
+ # TODO: Maybe memoize the file read, subject to config field changes?
1774
969
 
1775
- with pipe as writable:
1776
- yield writable
1777
-
1778
- if not pipe.reader_done:
1779
- logger.debug(f"Version: {self.version} Content: {self.content}")
1780
- raise RuntimeError(
1781
- "Escaped context manager without written data being read!"
970
+ try:
971
+ config = self.config
972
+ except AttributeError:
973
+ # The config isn't set yet. This happens during resume(), when we
974
+ # need to get the encryption args to talk to the job store to
975
+ # download the config, before we have it.
976
+ return {}
977
+
978
+ if config is not None and config.sseKey:
979
+ with open(config.sseKey, 'r') as f:
980
+ sse_key = f.read()
981
+ if not len(sse_key) == 32: # TODO: regex
982
+ raise ValueError(
983
+ f'Check that {self.config.sseKey} '
984
+ f'is the path to a real SSE key. '
985
+ f'(Key length {len(sse_key)} != 32)'
1782
986
  )
987
+ return {'SSECustomerAlgorithm': 'AES256', 'SSECustomerKey': sse_key}
988
+ else:
989
+ return {}
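The dict built above is what boto3 expects for SSE-C: it can be unpacked as keyword arguments on low-level client calls or passed as `ExtraArgs` to the managed transfer helpers, and boto3 base64-encodes the key and computes its MD5 on the wire. A sketch of both uses; the key path, bucket, and key names are illustrative:

    import boto3


    def sse_c_args(key_path: str) -> dict[str, str]:
        """Build SSE-C arguments from a file holding a 32-character AES key."""
        with open(key_path) as f:
            sse_key = f.read()
        if len(sse_key) != 32:
            raise ValueError(f"{key_path} does not hold a 32-character SSE-C key")
        return {"SSECustomerAlgorithm": "AES256", "SSECustomerKey": sse_key}


    client = boto3.client("s3")
    args = sse_c_args("/run/secrets/toil-sse-key")
    client.head_object(Bucket="example-bucket", Key="files/some-id", **args)
    client.download_file(
        Bucket="example-bucket",
        Key="files/some-id",
        Filename="/tmp/some-id",
        ExtraArgs=args,
    )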
1783
990
 
1784
- # We check our work to make sure we have exactly one of embedded
1785
- # content or a real object version.
1786
-
1787
- if self.content is None:
1788
- if not bool(self.version):
1789
- logger.debug(f"Version: {self.version} Content: {self.content}")
1790
- raise RuntimeError("No content added and no version created")
1791
- else:
1792
- if bool(self.version):
1793
- logger.debug(f"Version: {self.version} Content: {self.content}")
1794
- raise RuntimeError("Content added and version created")
1795
-
1796
- def copyFrom(self, srcObj):
1797
- """
1798
- Copies contents of source key into this file.
1799
-
1800
- :param S3.Object srcObj: The key (object) that will be copied from
1801
- """
1802
- assert srcObj.content_length is not None
1803
- if srcObj.content_length <= self.maxInlinedSize():
1804
- self.content = srcObj.get().get("Body").read()
1805
- else:
1806
- # Create a new Resource in case it needs to be on its own thread
1807
- resource = boto3_session.resource("s3", region_name=self.outer.region)
1808
- self.version = copyKeyMultipart(
1809
- resource,
1810
- srcBucketName=compat_bytes(srcObj.bucket_name),
1811
- srcKeyName=compat_bytes(srcObj.key),
1812
- srcKeyVersion=compat_bytes(srcObj.version_id),
1813
- dstBucketName=compat_bytes(self.outer.files_bucket.name),
1814
- dstKeyName=compat_bytes(self._fileID),
1815
- sseAlgorithm="AES256",
1816
- sseKey=self._getSSEKey(),
1817
- )
991
+ def parse_jobstore_identifier(jobstore_identifier: str) -> Tuple[str, str]:
992
+ region, jobstore_name = jobstore_identifier.split(':')
993
+ bucket_name = f'{jobstore_name}--toil'
1818
994
 
1819
- def copyTo(self, dstObj):
1820
- """
1821
- Copies contents of this file to the given key.
1822
-
1823
- :param S3.Object dstObj: The key (object) to copy this file's content to
1824
- """
1825
- if self.content is not None:
1826
- for attempt in retry_s3():
1827
- with attempt:
1828
- dstObj.put(Body=self.content)
1829
- elif self.version:
1830
- # Create a new Resource in case it needs to be on its own thread
1831
- resource = boto3_session.resource("s3", region_name=self.outer.region)
1832
-
1833
- for attempt in retry_s3():
1834
- # encrypted = True if self.outer.sseKeyPath else False
1835
- with attempt:
1836
- copyKeyMultipart(
1837
- resource,
1838
- srcBucketName=compat_bytes(self.outer.files_bucket.name),
1839
- srcKeyName=compat_bytes(self.fileID),
1840
- srcKeyVersion=compat_bytes(self.version),
1841
- dstBucketName=compat_bytes(dstObj.bucket_name),
1842
- dstKeyName=compat_bytes(dstObj.key),
1843
- copySourceSseAlgorithm="AES256",
1844
- copySourceSseKey=self._getSSEKey(),
1845
- )
1846
- else:
1847
- assert False
1848
-
1849
- def download(self, localFilePath, verifyChecksum=True):
1850
- if self.content is not None:
1851
- with AtomicFileCreate(localFilePath) as tmpPath:
1852
- with open(tmpPath, "wb") as f:
1853
- f.write(self.content)
1854
- elif self.version:
1855
- headerArgs = self._s3EncryptionArgs()
1856
- obj = self.outer.files_bucket.Object(compat_bytes(self.fileID))
1857
-
1858
- for attempt in retry_s3(
1859
- predicate=lambda e: retryable_s3_errors(e)
1860
- or isinstance(e, ChecksumError)
1861
- ):
1862
- with attempt:
1863
- with AtomicFileCreate(localFilePath) as tmpPath:
1864
- obj.download_file(
1865
- Filename=tmpPath,
1866
- ExtraArgs={"VersionId": self.version, **headerArgs},
1867
- )
1868
-
1869
- if verifyChecksum and self.checksum:
1870
- try:
1871
- # This automatically compares the result and matches the algorithm.
1872
- self._get_file_checksum(localFilePath, self.checksum)
1873
- except ChecksumError as e:
1874
- # Annotate checksum mismatches with file name
1875
- raise ChecksumError(
1876
- "Checksums do not match for file %s."
1877
- % localFilePath
1878
- ) from e
1879
- # The error will get caught and result in a retry of the download until we run out of retries.
1880
- # TODO: handle obviously truncated downloads by resuming instead.
1881
- else:
1882
- assert False
1883
-
1884
- @contextmanager
1885
- def downloadStream(self, verifyChecksum=True, encoding=None, errors=None):
1886
- """
1887
- Context manager that gives out a download stream to download data.
1888
- """
1889
- info = self
1890
-
1891
- class DownloadPipe(ReadablePipe):
1892
- def writeTo(self, writable):
1893
- if info.content is not None:
1894
- writable.write(info.content)
1895
- elif info.version:
1896
- headerArgs = info._s3EncryptionArgs()
1897
- obj = info.outer.files_bucket.Object(compat_bytes(info.fileID))
1898
- for attempt in retry_s3():
1899
- with attempt:
1900
- obj.download_fileobj(
1901
- writable,
1902
- ExtraArgs={"VersionId": info.version, **headerArgs},
1903
- )
1904
- else:
1905
- assert False
1906
-
1907
- class HashingPipe(ReadableTransformingPipe):
1908
- """
1909
- Class which checksums all the data read through it. If it
1910
- reaches EOF and the checksum isn't correct, raises
1911
- ChecksumError.
1912
-
1913
- Assumes info actually has a checksum.
1914
- """
1915
-
1916
- def transform(self, readable, writable):
1917
- hasher = info._start_checksum(to_match=info.checksum)
1918
- contents = readable.read(1024 * 1024)
1919
- while contents != b"":
1920
- info._update_checksum(hasher, contents)
1921
- try:
1922
- writable.write(contents)
1923
- except BrokenPipeError:
1924
- # Read was stopped early by user code.
1925
- # Can't check the checksum.
1926
- return
1927
- contents = readable.read(1024 * 1024)
1928
- # We reached EOF in the input.
1929
- # Finish checksumming and verify.
1930
- info._finish_checksum(hasher)
1931
- # Now stop so EOF happens in the output.
1932
-
1933
- if verifyChecksum and self.checksum:
1934
- with DownloadPipe() as readable:
1935
- # Interpose a pipe to check the hash
1936
- with HashingPipe(
1937
- readable, encoding=encoding, errors=errors
1938
- ) as verified:
1939
- yield verified
1940
- else:
1941
- # Readable end of pipe produces text mode output if encoding specified
1942
- with DownloadPipe(encoding=encoding, errors=errors) as readable:
1943
- # No true checksum available, so don't hash
1944
- yield readable
1945
-
1946
- def delete(self):
1947
- store = self.outer
1948
- if self.previousVersion is not None:
1949
- expected: "UpdateConditionTypeDef" = {
1950
- "Name": "version",
1951
- "Value": cast(str, self.previousVersion),
1952
- }
1953
- for attempt in retry_sdb():
1954
- with attempt:
1955
- store.db.delete_attributes(
1956
- DomainName=store.files_domain_name,
1957
- ItemName=compat_bytes(self.fileID),
1958
- Expected=expected,
1959
- )
1960
- if self.previousVersion:
1961
- for attempt in retry_s3():
1962
- with attempt:
1963
- store.s3_client.delete_object(
1964
- Bucket=store.files_bucket.name,
1965
- Key=compat_bytes(self.fileID),
1966
- VersionId=self.previousVersion,
1967
- )
1968
-
1969
- def getSize(self):
1970
- """
1971
- Return the size of the referenced item in bytes.
1972
- """
1973
- if self.content is not None:
1974
- return len(self.content)
1975
- elif self.version:
1976
- for attempt in retry_s3():
1977
- with attempt:
1978
- obj = self.outer.files_bucket.Object(compat_bytes(self.fileID))
1979
- return obj.content_length
1980
- else:
1981
- return 0
1982
-
1983
- def _getSSEKey(self) -> Optional[bytes]:
1984
- sseKeyPath = self.outer.sseKeyPath
1985
- if sseKeyPath:
1986
- with open(sseKeyPath, "rb") as f:
1987
- sseKey = f.read()
1988
- return sseKey
1989
-
1990
- def _s3EncryptionArgs(self):
1991
- # the keys of the returned dictionary are unpacked to the corresponding boto3 optional
1992
- # parameters and will be used to set the http headers
1993
- if self.encrypted:
1994
- sseKey = self._getSSEKey()
1995
- assert (
1996
- sseKey is not None
1997
- ), "Content is encrypted but no key was provided."
1998
- assert len(sseKey) == 32
1999
- # boto3 encodes the key and calculates the MD5 for us
2000
- return {"SSECustomerAlgorithm": "AES256", "SSECustomerKey": sseKey}
2001
- else:
2002
- return {}
2003
-
2004
- def __repr__(self):
2005
- r = custom_repr
2006
- d = (
2007
- ("fileID", r(self.fileID)),
2008
- ("ownerID", r(self.ownerID)),
2009
- ("encrypted", r(self.encrypted)),
2010
- ("version", r(self.version)),
2011
- ("previousVersion", r(self.previousVersion)),
2012
- ("content", r(self.content)),
2013
- ("checksum", r(self.checksum)),
2014
- ("_numContentChunks", r(self._numContentChunks)),
2015
- )
2016
- return "{}({})".format(
2017
- type(self).__name__, ", ".join(f"{k}={v}" for k, v in d)
2018
- )
995
+ regions = EC2Regions.keys()
996
+ if region not in regions:
997
+ raise ValueError(f'AWS Region "{region}" is not one of: {regions}')
2019
998
 
2020
- versionings = dict(Enabled=True, Disabled=False, Suspended=None)
999
+ if not 3 <= len(jobstore_name) <= 56:
1000
+ raise ValueError(f'AWS jobstore name must be between 3 and 56 chars: '
1001
+ f'{jobstore_name} (len: {len(jobstore_name)})')
2021
1002
 
2022
- def _getBucketVersioning(self, bucket_name):
2023
- """
2024
- The status attribute of BucketVersioning can be 'Enabled', 'Suspended' or None (Disabled)
2025
- which we map to True, None and False respectively. Note that we've never seen a versioning
2026
- status of 'Disabled', only the None return value. Calling BucketVersioning.suspend() will
2027
- cause BucketVersioning.status to then return 'Suspended' even on a new bucket that never
2028
- had versioning enabled.
1003
+ if not re.compile(r'^[a-z0-9][a-z0-9-]+[a-z0-9]$').match(jobstore_name):
1004
+ raise ValueError(f"Invalid AWS jobstore name: '{jobstore_name}'. Must contain only digits, "
1005
+ f"lower-case letters, and hyphens. Must also not start or end in a hyphen.")
2029
1006
 
2030
- :param bucket_name: str
2031
- """
2032
- for attempt in retry_s3():
2033
- with attempt:
2034
- status = self.s3_resource.BucketVersioning(bucket_name).status
2035
- return self.versionings.get(status) if status else False
2036
-
2037
- # TODO: Make this retry more specific?
2038
- # example: https://github.com/DataBiosphere/toil/issues/3378
2039
- @retry()
2040
- def destroy(self):
2041
- # FIXME: Destruction of encrypted stores only works after initialize() or .resume()
2042
- # See https://github.com/BD2KGenomics/toil/issues/1041
2043
- try:
2044
- self._bind(create=False, block=False, check_versioning_consistency=False)
2045
- except BucketLocationConflictException:
2046
- # If the unique jobstore bucket name existed, _bind would have raised a
2047
- # BucketLocationConflictException before calling destroy. Calling _bind here again
2048
- # would reraise the same exception so we need to catch and ignore that exception.
2049
- pass
2050
- # TODO: Add other failure cases to be ignored here.
2051
- self._registered = None
2052
- if self.files_bucket is not None:
2053
- self._delete_bucket(self.files_bucket)
2054
- self.files_bucket = None
2055
- for name in "files_domain_name", "jobs_domain_name":
2056
- domainName = getattr(self, name)
2057
- if domainName is not None:
2058
- self._delete_domain(domainName)
2059
- setattr(self, name, None)
2060
- self._registered = False
2061
-
2062
- def _delete_domain(self, domainName):
2063
- for attempt in retry_sdb():
2064
- with attempt:
2065
- try:
2066
- self.db.delete_domain(DomainName=domainName)
2067
- except ClientError as e:
2068
- if not no_such_sdb_domain(e):
2069
- raise
2070
-
2071
- @staticmethod
2072
- def _delete_bucket(bucket):
2073
- """
2074
- :param bucket: S3.Bucket
2075
- """
2076
- for attempt in retry_s3():
2077
- with attempt:
2078
- try:
2079
- uploads = s3_boto3_client.list_multipart_uploads(
2080
- Bucket=bucket.name
2081
- ).get("Uploads")
2082
- if uploads:
2083
- for u in uploads:
2084
- s3_boto3_client.abort_multipart_upload(
2085
- Bucket=bucket.name, Key=u["Key"], UploadId=u["UploadId"]
2086
- )
2087
-
2088
- bucket.objects.all().delete()
2089
- bucket.object_versions.delete()
2090
- bucket.delete()
2091
- except s3_boto3_client.exceptions.NoSuchBucket:
2092
- pass
2093
- except ClientError as e:
2094
- if get_error_status(e) != 404:
2095
- raise
2096
-
2097
-
2098
- aRepr = reprlib.Repr()
2099
- aRepr.maxstring = 38 # so UUIDs don't get truncated (36 for UUID plus 2 for quotes)
2100
- custom_repr = aRepr.repr
2101
-
2102
-
2103
- class BucketLocationConflictException(LocatorException):
2104
- def __init__(self, bucketRegion):
2105
- super().__init__(
2106
- "A bucket with the same name as the jobstore was found in another region (%s). "
2107
- "Cannot proceed as the unique bucket name is already in use.",
2108
- locator=bucketRegion,
2109
- )
1007
+ if '--' in jobstore_name:
1008
+ raise ValueError(f"AWS jobstore names may not contain '--': {jobstore_name}")
1009
+ return region, bucket_name
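In practice the identifier handed to this parser is the `<region>:<name>` portion of an `aws:<region>:<name>` job store locator, and the returned bucket name is the store name with a `--toil` suffix. A hypothetical example, assuming the function is reachable on `AWSJobStore` as laid out in this diff:

    # Locator "aws:us-west-2:my-analysis" hands "us-west-2:my-analysis" to the parser.
    region, bucket_name = AWSJobStore.parse_jobstore_identifier("us-west-2:my-analysis")
    assert region == "us-west-2"
    assert bucket_name == "my-analysis--toil"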