toil 9.0.0__py3-none-any.whl → 9.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. toil/batchSystems/abstractBatchSystem.py +13 -5
  2. toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
  3. toil/batchSystems/kubernetes.py +13 -2
  4. toil/batchSystems/mesos/batchSystem.py +33 -2
  5. toil/batchSystems/slurm.py +191 -16
  6. toil/cwl/cwltoil.py +17 -82
  7. toil/fileStores/__init__.py +1 -1
  8. toil/fileStores/abstractFileStore.py +5 -2
  9. toil/fileStores/cachingFileStore.py +1 -1
  10. toil/job.py +30 -14
  11. toil/jobStores/abstractJobStore.py +24 -19
  12. toil/jobStores/aws/jobStore.py +862 -1963
  13. toil/jobStores/aws/utils.py +24 -270
  14. toil/jobStores/googleJobStore.py +25 -9
  15. toil/jobStores/utils.py +0 -327
  16. toil/leader.py +27 -22
  17. toil/lib/aws/config.py +22 -0
  18. toil/lib/aws/s3.py +477 -9
  19. toil/lib/aws/utils.py +22 -33
  20. toil/lib/checksum.py +88 -0
  21. toil/lib/conversions.py +33 -31
  22. toil/lib/directory.py +217 -0
  23. toil/lib/ec2.py +97 -29
  24. toil/lib/exceptions.py +2 -1
  25. toil/lib/expando.py +2 -2
  26. toil/lib/generatedEC2Lists.py +73 -16
  27. toil/lib/io.py +33 -2
  28. toil/lib/memoize.py +21 -7
  29. toil/lib/pipes.py +385 -0
  30. toil/lib/retry.py +1 -1
  31. toil/lib/threading.py +1 -1
  32. toil/lib/web.py +4 -5
  33. toil/provisioners/__init__.py +5 -2
  34. toil/provisioners/aws/__init__.py +43 -36
  35. toil/provisioners/aws/awsProvisioner.py +22 -13
  36. toil/provisioners/node.py +60 -12
  37. toil/resource.py +3 -13
  38. toil/test/__init__.py +14 -16
  39. toil/test/batchSystems/test_slurm.py +103 -14
  40. toil/test/cwl/staging_cat.cwl +27 -0
  41. toil/test/cwl/staging_make_file.cwl +25 -0
  42. toil/test/cwl/staging_workflow.cwl +43 -0
  43. toil/test/cwl/zero_default.cwl +61 -0
  44. toil/test/docs/scripts/tutorial_staging.py +17 -8
  45. toil/test/jobStores/jobStoreTest.py +23 -133
  46. toil/test/lib/aws/test_iam.py +7 -7
  47. toil/test/lib/aws/test_s3.py +30 -33
  48. toil/test/lib/aws/test_utils.py +9 -9
  49. toil/test/provisioners/aws/awsProvisionerTest.py +59 -6
  50. toil/test/src/autoDeploymentTest.py +2 -3
  51. toil/test/src/fileStoreTest.py +89 -87
  52. toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
  53. toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
  54. toil/test/utils/toilKillTest.py +35 -28
  55. toil/test/wdl/md5sum/md5sum.json +1 -1
  56. toil/test/wdl/wdltoil_test.py +98 -38
  57. toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
  58. toil/utils/toilDebugFile.py +6 -3
  59. toil/utils/toilStats.py +17 -2
  60. toil/version.py +6 -6
  61. toil/wdl/wdltoil.py +1032 -546
  62. toil/worker.py +5 -2
  63. {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/METADATA +12 -12
  64. {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/RECORD +68 -61
  65. toil/lib/iterables.py +0 -112
  66. toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
  67. {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/WHEEL +0 -0
  68. {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/entry_points.txt +0 -0
  69. {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/licenses/LICENSE +0 -0
  70. {toil-9.0.0.dist-info → toil-9.1.0.dist-info}/top_level.txt +0 -0
@@ -11,676 +11,804 @@
11
11
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
12
  # See the License for the specific language governing permissions and
13
13
  # limitations under the License.
14
- import hashlib
15
- import itertools
16
- import logging
14
+ """
15
+ This file contains the AWS jobstore, which has its own docstring defining its use.
16
+
17
+ This docstring is about the organization of the file.
18
+
19
+ All direct AWS boto calls should live in toil.lib.aws, except for creating the
20
+ session instance and the resource/client (which should only be made ONCE in the jobstore).
21
+
22
+ Reasons for this
23
+ - DRY.
24
+ - All retries are on their individual boto functions, instead of here.
25
+ - Simple clear functions => simple clear unit tests (ideally).
26
+
27
+ Variables defining part size, parallelization, and other constants should live in toil.lib.aws.config.
28
+ """
17
29
  import os
30
+ import json
31
+ import logging
18
32
  import pickle
19
33
  import re
20
- import reprlib
21
34
  import stat
22
- import time
23
35
  import uuid
24
- from collections.abc import Generator
25
- from contextlib import contextmanager
36
+ import datetime
37
+
26
38
  from io import BytesIO
27
- from typing import IO, TYPE_CHECKING, Optional, Union, cast
28
- from urllib.parse import ParseResult, parse_qs, urlencode, urlsplit, urlunsplit
39
+ from contextlib import contextmanager
40
+ from urllib.parse import ParseResult, urlparse
41
+ from typing import (
42
+ ContextManager,
43
+ IO,
44
+ TYPE_CHECKING,
45
+ Optional,
46
+ Union,
47
+ cast,
48
+ Tuple,
49
+ Callable,
50
+ Dict,
51
+ Any,
52
+ Iterator,
53
+ Literal,
54
+ overload
55
+ )
29
56
 
57
+ # This file can't be imported if the AWS modules are not available.
30
58
  from botocore.exceptions import ClientError
31
59
 
32
- import toil.lib.encryption as encryption
33
60
  from toil.fileStores import FileID
34
- from toil.job import Job, JobDescription
35
- from toil.jobStores.abstractJobStore import (
36
- AbstractJobStore,
37
- ConcurrentFileModificationException,
38
- JobStoreExistsException,
39
- LocatorException,
40
- NoSuchFileException,
41
- NoSuchJobException,
42
- NoSuchJobStoreException,
43
- )
44
- from toil.jobStores.aws.utils import (
45
- SDBHelper,
46
- ServerSideCopyProhibitedError,
47
- copyKeyMultipart,
48
- fileSizeAndTime,
49
- no_such_sdb_domain,
50
- retry_sdb,
51
- sdb_unavailable,
52
- uploadFile,
53
- uploadFromPath,
54
- )
55
- from toil.jobStores.utils import ReadablePipe, ReadableTransformingPipe, WritablePipe
56
- from toil.lib.aws import build_tag_dict_from_env
57
- from toil.lib.aws.session import establish_boto3_session
58
- from toil.lib.aws.utils import (
59
- NoBucketLocationError,
60
- boto3_pager,
61
+ from toil.jobStores.abstractJobStore import (AbstractJobStore,
62
+ JobStoreExistsException,
63
+ NoSuchJobException,
64
+ NoSuchJobStoreException)
65
+ from toil.lib.aws.s3 import (
61
66
  create_s3_bucket,
62
- enable_public_objects,
63
- flatten_tags,
64
- get_bucket_region,
65
- get_item_from_attributes,
66
- get_object_for_url,
67
- list_objects_for_url,
68
- retry_s3,
69
- retryable_s3_errors,
67
+ delete_s3_bucket,
68
+ bucket_exists,
69
+ copy_s3_to_s3,
70
+ copy_local_to_s3,
71
+ copy_s3_to_local,
72
+ parse_s3_uri,
73
+ MultiPartPipe,
74
+ list_s3_items,
75
+ upload_to_s3,
76
+ download_stream,
77
+ s3_key_exists,
78
+ head_s3_object,
79
+ get_s3_object,
80
+ put_s3_object,
81
+ create_public_url,
82
+ AWSKeyNotFoundError,
70
83
  )
71
- from toil.lib.compatibility import compat_bytes
84
+ from toil.lib.aws.utils import get_object_for_url, list_objects_for_url
85
+ from toil.common import Config
86
+ from toil.jobStores.abstractJobStore import NoSuchFileException
72
87
  from toil.lib.ec2nodes import EC2Regions
73
- from toil.lib.exceptions import panic
74
- from toil.lib.io import AtomicFileCreate
75
- from toil.lib.memoize import strict_bool
76
- from toil.lib.objects import InnerClass
77
- from toil.lib.retry import get_error_code, get_error_status, retry
88
+ from toil.lib.retry import get_error_status
89
+ from toil.version import version
90
+ from toil.lib.aws.session import establish_boto3_session
91
+ from toil.job import JobDescription, Job
78
92
  from toil.lib.url import URLAccess
79
93
 
80
- if TYPE_CHECKING:
81
- from mypy_boto3_sdb.type_defs import (
82
- AttributeTypeDef,
83
- DeletableItemTypeDef,
84
- ItemTypeDef,
85
- ReplaceableAttributeTypeDef,
86
- ReplaceableItemTypeDef,
87
- UpdateConditionTypeDef,
88
- )
89
-
90
- from toil import Config
91
-
92
- boto3_session = establish_boto3_session()
93
- s3_boto3_resource = boto3_session.resource("s3")
94
- s3_boto3_client = boto3_session.client("s3")
94
+
95
+ DEFAULT_AWS_PART_SIZE = 52428800
95
96
  logger = logging.getLogger(__name__)
96
97
 
97
- # Sometimes we have to wait for multipart uploads to become real. How long
98
- # should we wait?
99
- CONSISTENCY_TICKS = 5
100
- CONSISTENCY_TIME = 1
101
98
 
99
+ class AWSJobStore(AbstractJobStore, URLAccess):
100
+ """
101
+ The AWS jobstore can be thought of as an AWS s3 bucket, with functions to
102
+ centralize, store, and track files for the workflow.
103
+
104
+ The AWS jobstore stores 4 things:
102
105
 
103
- class ChecksumError(Exception):
104
- """Raised when a download from AWS does not contain the correct data."""
106
+ 1. Jobs: These are pickled as files, and contain the information necessary to run a job when unpickled.
107
+ A job's file is deleted when finished, and its absence means it completed.
105
108
 
109
+ 2. Files: The inputs and outputs of jobs. Each file is written in s3 with the file pattern:
110
+ "files/{uuid4}/{original_filename}", where the file prefix
111
+ "files/{uuid4}" should only point to one file.
112
+ 3. Logs: The written log files of jobs that have run, plus the log file for the main Toil process.
106
113
 
107
- class DomainDoesNotExist(Exception):
108
- """Raised when a domain that is expected to exist does not exist."""
114
+ 4. Shared Files: Files with human-readable names, used by Toil itself or Python workflows.
115
+ These include:
109
116
 
110
- def __init__(self, domain_name):
111
- super().__init__(f"Expected domain {domain_name} to exist!")
117
+ * environment.pickle (environment variables)
112
118
 
119
+ * config.pickle (user options)
113
120
 
114
- class AWSJobStore(AbstractJobStore, URLAccess):
115
- """
116
- A job store that uses Amazon's S3 for file storage and SimpleDB for storing job info and
117
- enforcing strong consistency on the S3 file storage. There will be SDB domains for jobs and
118
- files and a versioned S3 bucket for file contents. Job objects are pickled, compressed,
119
- partitioned into chunks of 1024 bytes and each chunk is stored as an attribute of the SDB
120
- item representing the job. UUIDs are used to identify jobs and files.
121
+ * pid.log (process ID of the workflow; once the process exits, the workflow has either succeeded or failed)
122
+ * userScript (hot deployment; this is the job module)
123
+
124
+ * rootJobReturnValue (workflow succeeded or not)
125
+
126
+ NOTES
127
+ - The AWS jobstore does not use a database (directly, at least) currently. We can get away with this because:
128
+
129
+ 1. AWS s3 has strong consistency.
130
+
131
+ 2. s3's filter/query speed is pretty good.
132
+
133
+ However, there may be reasons in the future to provide users with a database:
134
+
135
+ * s3 throttling has limits (3,500/5,000 requests (TODO: per
136
+ second?); something like dynamodb supports 100,000+ requests).
137
+
138
+ * Access and filtering would be sped up, though how much faster this would be needs testing.
139
+
140
+ ALSO NOTE: The caching filestore uses a local (per node) database with a very similar structure that maybe
141
+ could be synced up with this.
142
+
143
+ - TODO: Etags are s3's native checksum, so use that for file integrity checking since it's free when fetching
144
+ object headers from s3. Using an md5sum in addition to this would work well with the current filestore.
145
+ WARNING: Etag values differ for the same file when the part size changes, so part size should always
146
+ be Set In Stone, unless we hit s3's 10,000 part limit, and we need to account for that.
147
+
148
+ - This class fills in self.config only when initialized/restarted; it is None upon class instantiation. These
149
+ are the options/config set by the user. When jobs are loaded/unpickled, they must re-incorporate this.
150
+
151
+ - The config.sseKey field is the single source of truth for bucket encryption
152
+ status. The key is never stored inside this class; it is always read
153
+ from the file referenced by the config when needed. Modifying the config
154
+ at runtime will modify whether encryption is used. Note that files
155
+ written *without* encryption (i.e. config.pickle) can't be read when
156
+ encryption is enabled!
157
+
158
+ - TODO: In general, job stores should log the version of Toil they were
159
+ initialized with and warn the user if restarting with a different
160
+ version.
121
161
  """
162
+ def __init__(self, locator: str, partSize: int = DEFAULT_AWS_PART_SIZE) -> None:
163
+ super(AWSJobStore, self).__init__(locator)
164
+ # TODO: parsing of user options seems like it should be done outside of this class;
165
+ # pass in only the bucket name and region?
166
+ self.region, self.bucket_name = parse_jobstore_identifier(locator)
167
+ boto3_session = establish_boto3_session(region_name=self.region)
168
+ self.s3_resource = boto3_session.resource("s3")
169
+ self.s3_client = boto3_session.client("s3")
170
+ logger.info(f"Instantiating {self.__class__} with region: {self.region}")
171
+ self.part_size = DEFAULT_AWS_PART_SIZE # don't let users set the part size; it will throw off etag values
172
+
173
+ # created anew during self.initialize() or loaded using self.resume()
174
+ self.bucket = None
175
+
176
+ # pickled job files named with uuid4
177
+ self.job_key_prefix = 'jobs/'
178
+ # job-file associations; these are empty files mimicking a db w/naming convention: job_uuid4.file_uuid4
179
+ #
180
+ # TODO: a many-to-many system is implemented, but a simpler one-to-many
181
+ # system could be used, because each file should belong to at most one
182
+ # job. This should be changed to a hierarchical layout.
183
+ self.job_associations_key_prefix = 'job-associations/'
184
+ # input/output files named with uuid4
185
+ self.content_key_prefix = 'files/'
186
+ # these are special files, like 'environment.pickle'; place them in root
187
+ self.shared_key_prefix = ''
188
+ # read and unread; named with uuid4
189
+ self.logs_key_prefix = 'logs/'
190
+
191
+ ###################################### CREATE/DESTROY JOBSTORE ######################################
192
+
193
+ def initialize(self, config: Config) -> None:
194
+ """
195
+ Called when starting a new jobstore with a non-existent bucket.
122
196
 
123
- # Dots in bucket names should be avoided because bucket names are used in HTTPS bucket
124
- # URLs where they may interfere with the certificate common name. We use a double
125
- # underscore as a separator instead.
126
- #
127
- bucketNameRe = re.compile(r"^[a-z0-9][a-z0-9-]+[a-z0-9]$")
197
+ Create bucket, raise if it already exists.
198
+ Set options from config.
199
+ """
200
+ logger.debug(f"Instantiating {self.__class__} for region {self.region} with bucket: '{self.bucket_name}'")
201
+ if bucket_exists(self.s3_resource, self.bucket_name):
202
+ raise JobStoreExistsException(self.locator, 'aws')
203
+ self.bucket = create_s3_bucket(self.s3_resource, self.bucket_name, region=self.region) # type: ignore
204
+ super(AWSJobStore, self).initialize(config)
128
205
 
129
- # See http://docs.aws.amazon.com/AmazonS3/latest/dev/BucketRestrictions.html
130
- #
131
- minBucketNameLen = 3
132
- maxBucketNameLen = 63
133
- maxNameLen = 10
134
- nameSeparator = "--"
206
+ def resume(self) -> None:
207
+ """
208
+ Called when reusing an old jobstore with an existing bucket.
135
209
 
136
- def __init__(self, locator: str, partSize: int = 50 << 20) -> None:
210
+ :raise NoSuchJobStoreException: if the bucket doesn't exist.
137
211
  """
138
- Create a new job store in AWS or load an existing one from there.
212
+ if not bucket_exists(self.s3_resource, self.bucket_name):
213
+ raise NoSuchJobStoreException(self.locator, 'aws')
214
+ # This sets self.config to not be None and loads the encryption key
215
+ # path from the unencrypted config. So it needs the bucket to exist to
216
+ # read from.
217
+ super(AWSJobStore, self).resume()
218
+
219
+ def destroy(self) -> None:
220
+ delete_s3_bucket(self.s3_resource, self.bucket_name)
139
221
 
140
- :param int partSize: The size of each individual part used for multipart operations like
141
- upload and copy, must be >= 5 MiB but large enough to not exceed 10k parts for the
142
- whole file
222
+ ###################################### BUCKET UTIL API ######################################
223
+
224
+ def _key_in_bucket(
225
+ self,
226
+ identifier: str,
227
+ prefix: str,
228
+ ) -> str:
143
229
  """
144
- super().__init__(locator)
145
- region, namePrefix = locator.split(":")
146
- regions = EC2Regions.keys()
147
- if region not in regions:
148
- raise ValueError(f'Region "{region}" is not one of: {regions}')
149
- if not self.bucketNameRe.match(namePrefix):
150
- raise ValueError(
151
- "Invalid name prefix '%s'. Name prefixes must contain only digits, "
152
- "hyphens or lower-case letters and must not start or end in a "
153
- "hyphen." % namePrefix
154
- )
155
- # Reserve 13 for separator and suffix
156
- if len(namePrefix) > self.maxBucketNameLen - self.maxNameLen - len(
157
- self.nameSeparator
158
- ):
159
- raise ValueError(
160
- "Invalid name prefix '%s'. Name prefixes may not be longer than 50 "
161
- "characters." % namePrefix
162
- )
163
- if "--" in namePrefix:
164
- raise ValueError(
165
- "Invalid name prefix '%s'. Name prefixes may not contain "
166
- "%s." % (namePrefix, self.nameSeparator)
167
- )
168
- logger.debug(
169
- "Instantiating %s for region %s and name prefix '%s'",
170
- self.__class__,
171
- region,
172
- namePrefix,
230
+ Get the key in the bucket for the given identifier and prefix.
231
+
232
+ We have this so higher-level code doesn't need to worry about the
233
+ pasting together of prefixes and identifiers, so it never has to be
234
+ mixed with the identifier=/prefix= calling convention.
235
+ """
236
+ return f'{prefix}{identifier}'
237
+
238
+ def is_in_bucket(
239
+ self,
240
+ identifier: str,
241
+ prefix: str,
242
+ bucket: Optional[str] = None,
243
+ ) -> bool:
244
+ """
245
+ Check if the key for the given identifier and prefix is in the bucket.
246
+ """
247
+ bucket = bucket or self.bucket_name
248
+
249
+ return s3_key_exists(
250
+ s3_resource=self.s3_resource,
251
+ bucket=bucket,
252
+ key=self._key_in_bucket(identifier=identifier, prefix=prefix),
253
+ extra_args=self._get_encryption_args()
173
254
  )
174
- self.region = region
175
- self.name_prefix = namePrefix
176
- self.part_size = partSize
177
- self.jobs_domain_name: Optional[str] = None
178
- self.files_domain_name: Optional[str] = None
179
- self.files_bucket = None
180
- self.db = boto3_session.client(service_name="sdb", region_name=region)
181
-
182
- self.s3_resource = boto3_session.resource("s3", region_name=self.region)
183
- self.s3_client = self.s3_resource.meta.client
184
-
185
- def initialize(self, config: "Config") -> None:
186
- if self._registered:
187
- raise JobStoreExistsException(self.locator, "aws")
188
- self._registered = None
189
- try:
190
- self._bind(create=True)
191
- except:
192
- with panic(logger):
193
- self.destroy()
194
- else:
195
- super().initialize(config)
196
- # Only register after job store has been fully initialized
197
- self._registered = True
198
255
 
199
- @property
200
- def sseKeyPath(self) -> Optional[str]:
201
- return self.config.sseKey
202
256
 
203
- def resume(self) -> None:
204
- if not self._registered:
205
- raise NoSuchJobStoreException(self.locator, "aws")
206
- self._bind(create=False)
207
- super().resume()
257
+ def write_to_bucket(
258
+ self,
259
+ identifier: str,
260
+ prefix: str,
261
+ data: Optional[Union[bytes, str, Dict[str, Any]]],
262
+ bucket: Optional[str] = None,
263
+ encrypted: Optional[bool] = None,
264
+ ) -> None:
265
+ """
266
+ Write something directly to a bucket.
267
+
268
+ Use for small files. Does not parallelize or use multipart.
269
+
270
+ :param encrypted: Can be set to False to disable encryption.
271
+ """
272
+ # only used if exporting to a URL
273
+ encryption_args = {} if encrypted is False else self._get_encryption_args()
274
+ bucket = bucket or self.bucket_name
275
+
276
+ if isinstance(data, dict):
277
+ data = json.dumps(data).encode('utf-8')
278
+ elif isinstance(data, str):
279
+ data = data.encode('utf-8')
280
+ elif data is None:
281
+ data = b''
282
+
283
+ assert isinstance(data, bytes)
284
+ put_s3_object(
285
+ s3_resource=self.s3_resource,
286
+ bucket=bucket,
287
+ key=self._key_in_bucket(identifier=identifier, prefix=prefix),
288
+ body=data,
289
+ extra_args=encryption_args,
290
+ )
208
291
 
209
- def _bind(
292
+ def read_from_bucket(
210
293
  self,
211
- create: bool = False,
212
- block: bool = True,
213
- check_versioning_consistency: bool = True,
214
- ) -> None:
215
- def qualify(name):
216
- assert len(name) <= self.maxNameLen
217
- return self.name_prefix + self.nameSeparator + name
218
-
219
- # The order in which this sequence of events happens is important. We can easily handle the
220
- # inability to bind a domain, but it is a little harder to handle some cases of binding the
221
- # jobstore bucket. Maintaining this order allows for an easier `destroy` method.
222
- if self.jobs_domain_name is None:
223
- self.jobs_domain_name = qualify("jobs")
224
- self._bindDomain(self.jobs_domain_name, create=create, block=block)
225
- if self.files_domain_name is None:
226
- self.files_domain_name = qualify("files")
227
- self._bindDomain(self.files_domain_name, create=create, block=block)
228
- if self.files_bucket is None:
229
- self.files_bucket = self._bindBucket(
230
- qualify("files"),
231
- create=create,
232
- block=block,
233
- versioning=True,
234
- check_versioning_consistency=check_versioning_consistency,
235
- )
294
+ identifier: str,
295
+ prefix: str,
296
+ bucket: Optional[str] = None,
297
+ ) -> bytes:
298
+ """
299
+ Read something directly from a bucket.
236
300
 
237
- @property
238
- def _registered(self) -> Optional[bool]:
301
+ Use for small files. Does not parallelize or use multipart.
302
+
303
+ :raises NoSuchJobException: if the prefix is the job prefix and the
304
+ identifier is not found.
305
+ :raises NoSuchFileException: if the prefix is the content prefix and
306
+ the identifier is not found.
307
+ :raises self.s3_client.exceptions.NoSuchKey: in other cases where the
308
+ identifier is not found.
309
+ """
310
+ bucket = bucket or self.bucket_name
311
+
312
+ try:
313
+ return get_s3_object(
314
+ s3_resource=self.s3_resource,
315
+ bucket=bucket,
316
+ key=self._key_in_bucket(identifier=identifier, prefix=prefix),
317
+ extra_args=self._get_encryption_args(),
318
+ )['Body'].read()
319
+ except self.s3_client.exceptions.NoSuchKey:
320
+ if prefix == self.job_key_prefix:
321
+ raise NoSuchJobException(identifier)
322
+ elif prefix == self.content_key_prefix:
323
+ raise NoSuchFileException(identifier)
324
+ else:
325
+ raise
326
+ except ClientError as e:
327
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
328
+ if prefix == self.job_key_prefix:
329
+ raise NoSuchJobException(identifier)
330
+ elif prefix == self.content_key_prefix:
331
+ raise NoSuchFileException(identifier)
332
+ else:
333
+ raise
334
+ else:
335
+ raise
336
+
337
+ ###################################### JOBS API ######################################
338
+
339
+ def assign_job_id(self, jobDescription: JobDescription) -> None:
340
+ jobDescription.jobStoreID = str(uuid.uuid4())
341
+ logger.debug("Assigning Job ID %s", jobDescription.jobStoreID)
342
+
343
+ def create_job(self, jobDescription: JobDescription) -> JobDescription:
344
+ """
345
+ Pickle a jobDescription object and write it to the jobstore as a file.
346
+
347
+ Responsible for calling :meth:`toil.job.JobDescription.pre_update_hook`
348
+ on the job description.
239
349
  """
240
- An optional boolean property indicating whether this job store is registered. The
241
- registry is the authority on deciding if a job store exists or not. If True, this job
242
- store exists, if None the job store is transitioning from True to False or vice versa,
243
- if False the job store doesn't exist.
244
350
 
245
- :type: bool|None
351
+ jobDescription.pre_update_hook()
352
+
353
+ self.write_to_bucket(identifier=str(jobDescription.jobStoreID),
354
+ prefix=self.job_key_prefix,
355
+ data=pickle.dumps(jobDescription, protocol=pickle.HIGHEST_PROTOCOL))
356
+ return jobDescription
357
+
358
+ def job_exists(self, job_id: str, check: bool = False) -> bool:
359
+ """
360
+ Checks if the job_id is found in s3.
361
+
362
+ :param check: If True, raise an exception instead of returning false
363
+ when a job does not exist.
246
364
  """
247
- # The weird mapping of the SDB item attribute value to the property value is due to
248
- # backwards compatibility. 'True' becomes True, that's easy. Toil < 3.3.0 writes this at
249
- # the end of job store creation. Absence of either the registry, the item or the
250
- # attribute becomes False, representing a truly absent, non-existing job store. An
251
- # attribute value of 'False', which is what Toil < 3.3.0 writes at the *beginning* of job
252
- # store destruction, indicates a job store in transition, reflecting the fact that 3.3.0
253
- # may leak buckets or domains even though the registry reports 'False' for them. We
254
- # can't handle job stores that were partially created by 3.3.0, though.
255
- registry_domain_name = "toil-registry"
256
365
  try:
257
- self._bindDomain(
258
- domain_name=registry_domain_name, create=False, block=False
366
+ self.s3_client.head_object(
367
+ Bucket=self.bucket_name,
368
+ Key=self._key_in_bucket(
369
+ identifier=job_id,
370
+ prefix=self.job_key_prefix,
371
+ ),
372
+ **self._get_encryption_args()
259
373
  )
260
- except DomainDoesNotExist:
261
- return False
262
-
263
- for attempt in retry_sdb():
264
- with attempt:
265
- get_result = self.db.get_attributes(
266
- DomainName=registry_domain_name,
267
- ItemName=self.name_prefix,
268
- AttributeNames=["exists"],
269
- ConsistentRead=True,
270
- )
271
- attributes: list["AttributeTypeDef"] = get_result.get(
272
- "Attributes", []
273
- ) # the documentation says 'Attributes' should always exist, but this is not true
274
- exists: Optional[str] = get_item_from_attributes(
275
- attributes=attributes, name="exists"
276
- )
277
- if exists is None:
278
- return False
279
- elif exists == "True":
280
- return True
281
- elif exists == "False":
282
- return None
283
- else:
284
- assert False
374
+ return True
375
+ except ClientError as e:
376
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
377
+ if check:
378
+ raise NoSuchJobException(job_id)
379
+ else:
380
+ raise
381
+ except self.s3_client.exceptions.NoSuchKey:
382
+ if check:
383
+ raise NoSuchJobException(job_id)
384
+ else:
385
+ raise
386
+ return False
285
387
 
286
- @_registered.setter
287
- def _registered(self, value: bool) -> None:
288
- registry_domain_name = "toil-registry"
388
+ def jobs(self) -> Iterator[JobDescription]:
389
+ for result in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=self.job_key_prefix):
390
+ try:
391
+ job_id = result['Key'][len(self.job_key_prefix):] # strip self.job_key_prefix
392
+ yield self.load_job(job_id)
393
+ except NoSuchJobException:
394
+ # job may have been deleted between showing up in the list and getting loaded
395
+ pass
396
+
397
+ def load_job(self, job_id: str) -> JobDescription:
398
+ """Use a job_id to get a job from the jobstore's s3 bucket, unpickle, and return it."""
289
399
  try:
290
- self._bindDomain(
291
- domain_name=registry_domain_name,
292
- # Only create registry domain when registering or
293
- # transitioning a store
294
- create=value is not False,
295
- block=False,
400
+ job = pickle.loads(self.read_from_bucket(identifier=job_id, prefix=self.job_key_prefix))
401
+ except NoSuchJobException:
402
+ raise
403
+
404
+ if not isinstance(job, JobDescription):
405
+ raise RuntimeError(
406
+ f"While trying to load a JobDescription for {job_id}, got a {type(job)} instead!",
296
407
  )
297
- except DomainDoesNotExist:
298
- pass
299
- else:
300
- for attempt in retry_sdb():
301
- with attempt:
302
- if value is False:
303
- self.db.delete_attributes(
304
- DomainName=registry_domain_name, ItemName=self.name_prefix
305
- )
306
- else:
307
- if value is True:
308
- attributes: list["ReplaceableAttributeTypeDef"] = [
309
- {"Name": "exists", "Value": "True", "Replace": True}
310
- ]
311
- elif value is None:
312
- attributes = [
313
- {"Name": "exists", "Value": "False", "Replace": True}
314
- ]
315
- else:
316
- assert False
317
- self.db.put_attributes(
318
- DomainName=registry_domain_name,
319
- ItemName=self.name_prefix,
320
- Attributes=attributes,
321
- )
322
-
323
- def _checkItem(self, item: "ItemTypeDef", enforce: bool = True) -> None:
324
- """
325
- Make sure that the given SimpleDB item actually has the attributes we think it should.
326
408
 
327
- Throw otherwise.
409
+ # Now we know it's the right type
410
+ job.assignConfig(self.config)
411
+ return job
328
412
 
329
- If enforce is false, log but don't throw.
330
- """
331
- self._checkAttributes(item["Attributes"], enforce)
413
+ def update_job(self, jobDescription: JobDescription) -> None:
414
+ self.create_job(jobDescription)
332
415
 
333
- def _checkAttributes(
334
- self, attributes: list["AttributeTypeDef"], enforce: bool = True
335
- ) -> None:
336
- if get_item_from_attributes(attributes=attributes, name="overlargeID") is None:
337
- logger.error(
338
- "overlargeID attribute isn't present: either SimpleDB entry is "
339
- "corrupt or jobstore is from an extremely old Toil: %s",
340
- attributes,
416
+ def delete_job(self, job_id: str) -> None:
417
+ logger.debug("Deleting job %s", job_id)
418
+
419
+ # delete the actual job file
420
+ self.s3_client.delete_object(
421
+ Bucket=self.bucket_name,
422
+ Key=self._key_in_bucket(
423
+ identifier=job_id,
424
+ prefix=self.job_key_prefix,
341
425
  )
342
- if enforce:
343
- raise RuntimeError(
344
- "encountered SimpleDB entry missing required attribute "
345
- "'overlargeID'; is your job store ancient?"
346
- )
426
+ )
347
427
 
348
- def _awsJobFromAttributes(self, attributes: list["AttributeTypeDef"]) -> Job:
349
- """
350
- Get a Toil Job object from attributes that are defined in an item from the DB
351
- :param attributes: List of attributes
352
- :return: Toil job
353
- """
354
- self._checkAttributes(attributes)
355
- overlarge_id_value = get_item_from_attributes(
356
- attributes=attributes, name="overlargeID"
428
+ # delete any files marked as associated with the job
429
+ job_file_associations_to_delete = []
430
+ root_key = self._key_in_bucket(
431
+ identifier=job_id,
432
+ prefix=self.job_associations_key_prefix,
357
433
  )
358
- if overlarge_id_value:
359
- assert self.file_exists(overlarge_id_value)
360
- # This is an overlarge job, download the actual attributes
361
- # from the file store
362
- logger.debug("Loading overlarge job from S3.")
363
- with self.read_file_stream(overlarge_id_value) as fh:
364
- binary = fh.read()
365
- else:
366
- binary, _ = SDBHelper.attributesToBinary(attributes)
367
- assert binary is not None
368
- job = pickle.loads(binary)
369
- if job is not None:
370
- job.assignConfig(self.config)
371
- return job
434
+ for associated_job_file in list_s3_items(self.s3_resource,
435
+ bucket=self.bucket_name,
436
+ prefix=root_key):
437
+ job_file_associations_to_delete.append(associated_job_file['Key'])
438
+ file_id = associated_job_file['Key'].split('.')[-1]
439
+ self.delete_file(file_id)
440
+
441
+ # delete the job-file association references (these are empty files that simply connect jobs to files)
442
+ for job_file_association in job_file_associations_to_delete:
443
+ self.s3_client.delete_object(Bucket=self.bucket_name, Key=f'{job_file_association}')
444
+
445
+ def associate_job_with_file(self, job_id: str, file_id: str) -> None:
446
+ # associate this job with this file; the file will be deleted when the job is deleted
447
+ self.write_to_bucket(identifier=f'{job_id}.{file_id}', prefix=self.job_associations_key_prefix, data=None)
372
448
 
373
- def _awsJobFromItem(self, item: "ItemTypeDef") -> Job:
449
+ ###################################### FILES API ######################################
450
+
451
+ def write_file(self, local_path: str, job_id: Optional[str] = None, cleanup: bool = False) -> FileID:
374
452
  """
375
- Get a Toil Job object from an item from the DB
376
- :return: Toil Job
453
+ Write a local file into the jobstore and return a file_id referencing it.
454
+
455
+ :param job_id:
456
+ If job_id AND cleanup are supplied, associate this file with that job. When the job is deleted, the
457
+ file will be deleted as well.
458
+
459
+ :param cleanup:
460
+ If job_id AND cleanup are supplied, associate this file with that job. When the job is deleted, the
461
+ file will be deleted as well.
462
+ TODO: we don't need cleanup; remove it and only use job_id
377
463
  """
378
- return self._awsJobFromAttributes(item["Attributes"])
379
-
380
- def _awsJobToAttributes(self, job: JobDescription) -> list["AttributeTypeDef"]:
381
- binary = pickle.dumps(job, protocol=pickle.HIGHEST_PROTOCOL)
382
- if len(binary) > SDBHelper.maxBinarySize(extraReservedChunks=1):
383
- # Store as an overlarge job in S3
384
- with self.write_file_stream() as (writable, fileID):
385
- writable.write(binary)
386
- item = SDBHelper.binaryToAttributes(None)
387
- item["overlargeID"] = fileID
388
- else:
389
- item = SDBHelper.binaryToAttributes(binary)
390
- item["overlargeID"] = ""
391
- return SDBHelper.attributeDictToList(item)
464
+ # TODO: etag = compute_checksum_for_file(local_path, algorithm='etag')[len('etag$'):]
465
+ file_id = str(uuid.uuid4()) # mint a new file_id
466
+ file_attributes = os.stat(local_path)
467
+ size = file_attributes.st_size
468
+ executable = file_attributes.st_mode & stat.S_IXUSR != 0
469
+
470
+ if job_id and cleanup:
471
+ # associate this job with this file; then the file reference will be deleted when the job is deleted
472
+ self.associate_job_with_file(job_id, file_id)
473
+
474
+ # Each file gets a prefix under which we put exactly one key, to hide
475
+ # metadata in the key.
476
+ prefix = self._key_in_bucket(
477
+ identifier=file_id,
478
+ prefix=self.content_key_prefix
479
+ )
392
480
 
393
- def _awsJobToItem(self, job: JobDescription, name: str) -> "ItemTypeDef":
394
- return {"Name": name, "Attributes": self._awsJobToAttributes(job)}
481
+ copy_local_to_s3(
482
+ s3_resource=self.s3_resource,
483
+ local_file_path=local_path,
484
+ dst_bucket=self.bucket_name,
485
+ dst_key=f'{prefix}/{os.path.basename(local_path)}',
486
+ extra_args=self._get_encryption_args()
487
+ )
488
+ return FileID(file_id, size, executable)
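A usage sketch for write_file and associate_job_with_file above (the store variable, IDs, and path are hypothetical):

    #   file_id = store.write_file("/tmp/out.bam", job_id=job_desc.jobStoreID, cleanup=True)
    # creates two keys in the jobstore bucket:
    #   files/<file_id>/out.bam                   the uploaded content
    #   job-associations/<jobStoreID>.<file_id>   empty marker removed along with the job by delete_job()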
395
489
 
396
- jobsPerBatchInsert = 25
490
+ def find_s3_key_from_file_id(self, file_id: str) -> str:
491
+ """This finds an s3 key for which file_id is the prefix, and which already exists."""
492
+ prefix = self._key_in_bucket(
493
+ identifier=file_id,
494
+ prefix=self.content_key_prefix
495
+ )
496
+ s3_keys = [s3_item for s3_item in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=prefix)]
497
+ if len(s3_keys) == 0:
498
+ raise NoSuchFileException(file_id)
499
+ if len(s3_keys) > 1:
500
+ # There can be only one.
501
+ raise RuntimeError(f'File ID: {file_id} should be unique, but includes: {s3_keys}')
502
+ return s3_keys[0]['Key']
397
503
 
398
504
  @contextmanager
399
- def batch(self) -> None:
400
- self._batchedUpdates = []
401
- yield
402
- batches = [
403
- self._batchedUpdates[i : i + self.jobsPerBatchInsert]
404
- for i in range(0, len(self._batchedUpdates), self.jobsPerBatchInsert)
405
- ]
406
-
407
- for batch in batches:
408
- items: list["ReplaceableItemTypeDef"] = []
409
- for jobDescription in batch:
410
- item_attributes: list["ReplaceableAttributeTypeDef"] = []
411
- jobDescription.pre_update_hook()
412
- item_name = compat_bytes(jobDescription.jobStoreID)
413
- got_job_attributes: list["AttributeTypeDef"] = self._awsJobToAttributes(
414
- jobDescription
415
- )
416
- for each_attribute in got_job_attributes:
417
- new_attribute: "ReplaceableAttributeTypeDef" = {
418
- "Name": each_attribute["Name"],
419
- "Value": each_attribute["Value"],
420
- "Replace": True,
421
- }
422
- item_attributes.append(new_attribute)
423
- items.append({"Name": item_name, "Attributes": item_attributes})
424
-
425
- for attempt in retry_sdb():
426
- with attempt:
427
- self.db.batch_put_attributes(
428
- DomainName=self.jobs_domain_name, Items=items
429
- )
430
- self._batchedUpdates = None
431
-
432
- def assign_job_id(self, job_description: JobDescription) -> None:
433
- jobStoreID = self._new_job_id()
434
- logger.debug("Assigning ID to job %s", jobStoreID)
435
- job_description.jobStoreID = jobStoreID
436
-
437
- def create_job(self, job_description: JobDescription) -> JobDescription:
438
- if hasattr(self, "_batchedUpdates") and self._batchedUpdates is not None:
439
- self._batchedUpdates.append(job_description)
440
- else:
441
- self.update_job(job_description)
442
- return job_description
443
-
444
- def job_exists(self, job_id: Union[bytes, str]) -> bool:
445
- for attempt in retry_sdb():
446
- with attempt:
447
- return (
448
- len(
449
- self.db.get_attributes(
450
- DomainName=self.jobs_domain_name,
451
- ItemName=compat_bytes(job_id),
452
- AttributeNames=[SDBHelper.presenceIndicator()],
453
- ConsistentRead=True,
454
- ).get("Attributes", [])
455
- )
456
- > 0
457
- )
458
-
459
- def jobs(self) -> Generator[Job, None, None]:
460
- job_items: Optional[list["ItemTypeDef"]] = None
461
- for attempt in retry_sdb():
462
- with attempt:
463
- job_items = boto3_pager(
464
- self.db.select,
465
- "Items",
466
- ConsistentRead=True,
467
- SelectExpression="select * from `%s`" % self.jobs_domain_name,
468
- )
469
- assert job_items is not None
470
- for jobItem in job_items:
471
- yield self._awsJobFromItem(jobItem)
472
-
473
- def load_job(self, job_id: FileID) -> Job:
474
- item_attributes = None
475
- for attempt in retry_sdb():
476
- with attempt:
477
- item_attributes = self.db.get_attributes(
478
- DomainName=self.jobs_domain_name,
479
- ItemName=compat_bytes(job_id),
480
- ConsistentRead=True,
481
- ).get("Attributes", [])
482
- if not item_attributes:
483
- raise NoSuchJobException(job_id)
484
- job = self._awsJobFromAttributes(item_attributes)
485
- if job is None:
486
- raise NoSuchJobException(job_id)
487
- logger.debug("Loaded job %s", job_id)
488
- return job
505
+ def write_file_stream(
506
+ self,
507
+ job_id: Optional[str] = None,
508
+ cleanup: bool = False,
509
+ basename: Optional[str] = None,
510
+ encoding: Optional[str] = None,
511
+ errors: Optional[str] = None,
512
+ ) -> Iterator[tuple[IO[bytes], str]]:
513
+ file_id = str(uuid.uuid4())
514
+ if job_id and cleanup:
515
+ self.associate_job_with_file(job_id, file_id)
516
+ prefix = self._key_in_bucket(
517
+ identifier=file_id,
518
+ prefix=self.content_key_prefix
519
+ )
489
520
 
490
- def update_job(self, job_description):
491
- logger.debug("Updating job %s", job_description.jobStoreID)
492
- job_description.pre_update_hook()
493
- job_attributes = self._awsJobToAttributes(job_description)
494
- update_attributes: list["ReplaceableAttributeTypeDef"] = [
495
- {"Name": attribute["Name"], "Value": attribute["Value"], "Replace": True}
496
- for attribute in job_attributes
497
- ]
498
- for attempt in retry_sdb():
499
- with attempt:
500
- self.db.put_attributes(
501
- DomainName=self.jobs_domain_name,
502
- ItemName=compat_bytes(job_description.jobStoreID),
503
- Attributes=update_attributes,
504
- )
521
+ pipe = MultiPartPipe(part_size=self.part_size,
522
+ s3_resource=self.s3_resource,
523
+ bucket_name=self.bucket_name,
524
+ file_id=f'{prefix}/{str(basename)}',
525
+ encryption_args=self._get_encryption_args(),
526
+ encoding=encoding,
527
+ errors=errors)
528
+ with pipe as writable:
529
+ yield writable, file_id
505
530
 
506
- itemsPerBatchDelete = 25
531
+ @contextmanager
532
+ def update_file_stream(
533
+ self,
534
+ file_id: str,
535
+ encoding: Optional[str] = None,
536
+ errors: Optional[str] = None
537
+ ) -> Iterator[IO[Any]]:
538
+ logger.debug("Replacing file %s via multipart upload", file_id)
539
+ pipe = MultiPartPipe(
540
+ part_size=self.part_size,
541
+ s3_resource=self.s3_resource,
542
+ bucket_name=self.bucket_name,
543
+ file_id=self.find_s3_key_from_file_id(file_id),
544
+ encryption_args=self._get_encryption_args(),
545
+ encoding=encoding,
546
+ errors=errors,
547
+ )
548
+ with pipe as writable:
549
+ yield writable
507
550
 
508
- def delete_job(self, job_id):
509
- # remove job and replace with jobStoreId.
510
- logger.debug("Deleting job %s", job_id)
551
+ @contextmanager
552
+ def write_shared_file_stream(
553
+ self,
554
+ shared_file_name: str,
555
+ encrypted: Optional[bool] = None,
556
+ encoding: Optional[str] = None,
557
+ errors: Optional[str] = None,
558
+ ) -> Iterator[IO[bytes]]:
559
+ encryption_args = {} if encrypted is False else self._get_encryption_args()
560
+ pipe = MultiPartPipe(
561
+ part_size=self.part_size,
562
+ s3_resource=self.s3_resource,
563
+ bucket_name=self.bucket_name,
564
+ file_id=self._key_in_bucket(
565
+ identifier=shared_file_name,
566
+ prefix=self.shared_key_prefix,
567
+ ),
568
+ encryption_args=encryption_args,
569
+ encoding=encoding,
570
+ errors=errors,
571
+ )
572
+ with pipe as writable:
573
+ yield writable
511
574
 
512
- # If the job is overlarge, delete its file from the filestore
513
- for attempt in retry_sdb():
514
- with attempt:
515
- attributes = self.db.get_attributes(
516
- DomainName=self.jobs_domain_name,
517
- ItemName=compat_bytes(job_id),
518
- ConsistentRead=True,
519
- ).get("Attributes", [])
520
- # If the overlargeID has fallen off, maybe we partially deleted the
521
- # attributes of the item? Or raced on it? Or hit SimpleDB being merely
522
- # eventually consistent? We should still be able to get rid of it.
523
- self._checkAttributes(attributes, enforce=False)
524
- overlarge_id_value = get_item_from_attributes(
525
- attributes=attributes, name="overlargeID"
575
+ def update_file(self, file_id: str, local_path: str) -> None:
576
+ copy_local_to_s3(
577
+ s3_resource=self.s3_resource,
578
+ local_file_path=local_path,
579
+ dst_bucket=self.bucket_name,
580
+ dst_key=self.find_s3_key_from_file_id(file_id),
581
+ extra_args=self._get_encryption_args()
526
582
  )
527
- if overlarge_id_value:
528
- logger.debug("Deleting job from filestore")
529
- self.delete_file(overlarge_id_value)
530
- for attempt in retry_sdb():
531
- with attempt:
532
- self.db.delete_attributes(
533
- DomainName=self.jobs_domain_name, ItemName=compat_bytes(job_id)
534
- )
535
- items: Optional[list["ItemTypeDef"]] = None
536
- for attempt in retry_sdb():
537
- with attempt:
538
- items = list(
539
- boto3_pager(
540
- self.db.select,
541
- "Items",
542
- ConsistentRead=True,
543
- SelectExpression=f"select version from `{self.files_domain_name}` where ownerID='{job_id}'",
544
- )
545
- )
546
- assert items is not None
547
- if items:
548
- logger.debug(
549
- "Deleting %d file(s) associated with job %s", len(items), job_id
583
+
584
+ def file_exists(self, file_id: str) -> bool:
585
+ try:
586
+ # This throws if the file doesn't exist.
587
+ self.find_s3_key_from_file_id(file_id)
588
+ except NoSuchFileException:
589
+ # It didn't exist
590
+ return False
591
+ return True
592
+
593
+ def get_file_size(self, file_id: str) -> int:
594
+ """Do we need both get_file_size and _get_size???"""
595
+ full_s3_key = self.find_s3_key_from_file_id(file_id)
596
+ return self._get_size(url=urlparse(f's3://{self.bucket_name}/{full_s3_key}')) or 0
597
+
598
+ @classmethod
599
+ def _get_size(cls, url: ParseResult) -> Optional[int]:
600
+ """Do we need both get_file_size and _get_size???"""
601
+ try:
602
+ return get_object_for_url(url, existing=True).content_length
603
+ except (AWSKeyNotFoundError, NoSuchFileException):
604
+ return 0
605
+
606
+ def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> None:
607
+ full_s3_key = self.find_s3_key_from_file_id(file_id)
608
+ executable = getattr(file_id, "executable", False)
609
+ try:
610
+ copy_s3_to_local(
611
+ s3_resource=self.s3_resource,
612
+ local_file_path=local_path,
613
+ src_bucket=self.bucket_name,
614
+ src_key=full_s3_key,
615
+ extra_args=self._get_encryption_args()
550
616
  )
551
- n = self.itemsPerBatchDelete
552
- batches = [items[i : i + n] for i in range(0, len(items), n)]
553
- for batch in batches:
554
- delete_items: list["DeletableItemTypeDef"] = [
555
- {"Name": item["Name"]} for item in batch
556
- ]
557
- for attempt in retry_sdb():
558
- with attempt:
559
- self.db.batch_delete_attributes(
560
- DomainName=self.files_domain_name, Items=delete_items
561
- )
562
- for item in items:
563
- item: "ItemTypeDef"
564
- version = get_item_from_attributes(
565
- attributes=item["Attributes"], name="version"
566
- )
567
- for attempt in retry_s3():
568
- with attempt:
569
- if version:
570
- self.s3_client.delete_object(
571
- Bucket=self.files_bucket.name,
572
- Key=compat_bytes(item["Name"]),
573
- VersionId=version,
574
- )
575
- else:
576
- self.s3_client.delete_object(
577
- Bucket=self.files_bucket.name,
578
- Key=compat_bytes(item["Name"]),
579
- )
617
+ if executable:
618
+ os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR)
619
+ except self.s3_client.exceptions.NoSuchKey:
620
+ raise NoSuchFileException(file_id)
621
+ except ClientError as e:
622
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
623
+ raise NoSuchFileException(file_id)
624
+ else:
625
+ raise
580
626
 
581
- def get_empty_file_store_id(
582
- self, jobStoreID=None, cleanup=False, basename=None
583
- ) -> FileID:
584
- info = self.FileInfo.create(jobStoreID if cleanup else None)
585
- with info.uploadStream() as _:
586
- # Empty
587
- pass
588
- info.save()
589
- logger.debug("Created %r.", info)
590
- return info.fileID
627
+ @contextmanager # type: ignore
628
+ def read_file_stream( # type: ignore
629
+ self,
630
+ file_id: Union[FileID, str],
631
+ encoding: Optional[str] = None,
632
+ errors: Optional[str] = None,
633
+ ) -> Union[ContextManager[IO[bytes]], ContextManager[IO[str]]]:
634
+ full_s3_key = self.find_s3_key_from_file_id(file_id)
635
+ try:
636
+ with download_stream(self.s3_resource,
637
+ bucket=self.bucket_name,
638
+ key=full_s3_key,
639
+ extra_args=self._get_encryption_args(),
640
+ encoding=encoding,
641
+ errors=errors) as readable:
642
+ yield readable
643
+ except self.s3_client.exceptions.NoSuchKey:
644
+ raise NoSuchFileException(file_id)
645
+ except ClientError as e:
646
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
647
+ raise NoSuchFileException(file_id)
648
+ else:
649
+ raise
650
+
651
+ @overload
652
+ @contextmanager
653
+ def read_shared_file_stream(
654
+ self,
655
+ shared_file_name: str,
656
+ encoding: str,
657
+ errors: Optional[str] = None,
658
+ ) -> Iterator[IO[str]]: ...
659
+
660
+ @overload
661
+ @contextmanager
662
+ def read_shared_file_stream(
663
+ self,
664
+ shared_file_name: str,
665
+ encoding: Literal[None] = None,
666
+ errors: Optional[str] = None,
667
+ ) -> Iterator[IO[bytes]]: ...
668
+
669
+ @contextmanager
670
+ def read_shared_file_stream(
671
+ self,
672
+ shared_file_name: str,
673
+ encoding: Optional[str] = None,
674
+ errors: Optional[str] = None,
675
+ ) -> Iterator[Union[IO[bytes], IO[str]]]:
676
+ self._requireValidSharedFileName(shared_file_name)
677
+ key = self._key_in_bucket(identifier=shared_file_name, prefix=self.shared_key_prefix)
678
+ if not self.is_in_bucket(
679
+ identifier=shared_file_name,
680
+ prefix=self.shared_key_prefix,
681
+ ):
682
+ # TRAVIS=true TOIL_OWNER_TAG="shared" /home/quokka/git/toil/v3nv/bin/python -m pytest --durations=0 --log-level DEBUG --log-cli-level INFO -r s /home/quokka/git/toil/src/toil/test/jobStores/jobStoreTest.py::EncryptedAWSJobStoreTest::testJobDeletions
683
+ # throw NoSuchFileException in download_stream
684
+ raise NoSuchFileException(f's3://{self.bucket_name}/{key}')
685
+
686
+ try:
687
+ with download_stream(self.s3_resource,
688
+ bucket=self.bucket_name,
689
+ key=key,
690
+ encoding=encoding,
691
+ errors=errors,
692
+ extra_args=self._get_encryption_args()) as readable:
693
+ yield readable
694
+ except self.s3_client.exceptions.NoSuchKey:
695
+ raise NoSuchFileException(shared_file_name)
696
+ except ClientError as e:
697
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
698
+ raise NoSuchFileException(shared_file_name)
699
+ else:
700
+ raise
701
+
702
+ def delete_file(self, file_id: str) -> None:
703
+ try:
704
+ full_s3_key = self.find_s3_key_from_file_id(file_id)
705
+ except NoSuchFileException:
706
+ # The file is gone. That's great, we're idempotent.
707
+ return
708
+ self.s3_client.delete_object(Bucket=self.bucket_name, Key=full_s3_key)
709
+
710
+ ###################################### URI API ######################################
591
711
 
592
712
  def _import_file(
593
713
  self,
594
- otherCls,
714
+ otherCls: type[URLAccess],
595
715
  uri: ParseResult,
596
716
  shared_file_name: Optional[str] = None,
597
717
  hardlink: bool = False,
598
718
  symlink: bool = True,
599
719
  ) -> Optional[FileID]:
600
- try:
601
- if issubclass(otherCls, AWSJobStore):
602
- srcObj = get_object_for_url(uri, existing=True)
603
- size = srcObj.content_length
604
- if shared_file_name is None:
605
- info = self.FileInfo.create(srcObj.key)
606
- else:
607
- self._requireValidSharedFileName(shared_file_name)
608
- jobStoreFileID = self._shared_file_id(shared_file_name)
609
- info = self.FileInfo.loadOrCreate(
610
- jobStoreFileID=jobStoreFileID,
611
- ownerID=str(self.sharedFileOwnerID),
612
- encrypted=None,
613
- )
614
- info.copyFrom(srcObj)
615
- info.save()
616
- return FileID(info.fileID, size) if shared_file_name is None else None
617
- except (NoBucketLocationError, ServerSideCopyProhibitedError):
618
- # AWS refuses to tell us where the bucket is or do this copy for us
619
- logger.warning(
620
- "Falling back to copying via the local machine. This could get expensive!"
720
+ """
721
+ Upload a file into the s3 bucket jobstore from the source uri.
722
+
723
+ This db entry's existence should always be in sync with the file's existence (when one exists,
724
+ so must the other).
725
+ """
726
+ # we are copying from s3 to s3
727
+ if isinstance(otherCls, AWSJobStore):
728
+ src_bucket_name, src_key_name = parse_s3_uri(uri.geturl())
729
+ response = head_s3_object(self.s3_resource, bucket=src_bucket_name, key=src_key_name, check=True)
730
+ content_length = response['ContentLength'] # e.g. 65536
731
+
732
+ file_id = str(uuid.uuid4())
733
+ if shared_file_name:
734
+ dst_key = self._key_in_bucket(identifier=shared_file_name, prefix=self.shared_key_prefix)
735
+ else:
736
+ # cannot determine exec bit from foreign s3 so default to False
737
+ dst_key = "/".join([
738
+ self._key_in_bucket(identifier=file_id, prefix=self.content_key_prefix),
739
+ src_key_name.split("/")[-1],
740
+ ])
741
+
742
+ copy_s3_to_s3(
743
+ s3_resource=self.s3_resource,
744
+ src_bucket=src_bucket_name,
745
+ src_key=src_key_name,
746
+ dst_bucket=self.bucket_name,
747
+ dst_key=dst_key,
748
+ extra_args=self._get_encryption_args()
621
749
  )
750
+ # TODO: verify etag after copying here?
622
751
 
623
- # copy if exception
624
- return super()._import_file(otherCls, uri, shared_file_name=shared_file_name)
752
+ return FileID(file_id, content_length) if not shared_file_name else None
753
+ else:
754
+ return super(AWSJobStore, self)._import_file(
755
+ otherCls=otherCls,
756
+ uri=uri,
757
+ shared_file_name=shared_file_name,
758
+ hardlink=hardlink,
759
+ symlink=symlink
760
+ )
625
761
 
626
- def _export_file(self, otherCls, file_id: FileID, uri: ParseResult) -> None:
627
- try:
628
- if issubclass(otherCls, AWSJobStore):
629
- dstObj = get_object_for_url(uri)
630
- info = self.FileInfo.loadOrFail(file_id)
631
- info.copyTo(dstObj)
632
- return
633
- except (NoBucketLocationError, ServerSideCopyProhibitedError):
634
- # AWS refuses to tell us where the bucket is or do this copy for us
635
- logger.warning(
636
- "Falling back to copying via the local machine. This could get expensive!"
762
+ def _export_file(
763
+ self, otherCls: type[URLAccess], jobStoreFileID: FileID, url: ParseResult
764
+ ) -> None:
765
+ """Export a file_id in the jobstore to the url."""
766
+ if isinstance(otherCls, AWSJobStore):
767
+ src_full_s3_key = self.find_s3_key_from_file_id(jobStoreFileID)
768
+ dst_bucket_name, dst_key_name = parse_s3_uri(url.geturl())
769
+ copy_s3_to_s3(
770
+ s3_resource=self.s3_resource,
771
+ src_bucket=self.bucket_name,
772
+ src_key=src_full_s3_key,
773
+ dst_bucket=dst_bucket_name,
774
+ dst_key=dst_key_name,
775
+ extra_args=self._get_encryption_args()
637
776
  )
638
777
  else:
639
- super()._default_export_file(otherCls, file_id, uri)
778
+ super(AWSJobStore, self)._default_export_file(otherCls, jobStoreFileID, url)
640
779
 
641
- ###
642
- # URL access implementation
643
- ###
780
+ @classmethod
781
+ def _read_from_url(
782
+ cls, url: ParseResult, writable: Union[IO[bytes], IO[str]]
783
+ ) -> tuple[int, bool]:
784
+ src_obj = get_object_for_url(url, existing=True)
785
+ src_obj.download_fileobj(writable)
786
+ executable = False
787
+ return src_obj.content_length, executable
644
788
 
645
- # URL access methods aren't used by the rest of the job store methods.
789
+ @classmethod
790
+ def _write_to_url(
791
+ cls,
792
+ readable: Union[IO[bytes], IO[str]],
793
+ url: ParseResult,
794
+ executable: bool = False,
795
+ ) -> None:
796
+ dst_obj = get_object_for_url(url)
797
+ upload_to_s3(readable=readable,
798
+ s3_resource=establish_boto3_session().resource("s3"),
799
+ bucket=dst_obj.bucket_name,
800
+ key=dst_obj.key)
646
801
 
647
802
  @classmethod
648
803
  def _url_exists(cls, url: ParseResult) -> bool:
649
804
  try:
650
- try:
651
- get_object_for_url(url, existing=True, anonymous=True)
652
- except PermissionError:
653
- # If we can't look anonymously, log in
654
- get_object_for_url(url, existing=True)
805
+ get_object_for_url(url, existing=True)
655
806
  return True
656
807
  except FileNotFoundError:
657
808
  # Not a file
658
- # Might be a directory. Or we might not have access to know.
659
- # See if it's a directory.
809
+ # Might be a directory.
660
810
  return cls._get_is_directory(url)
661
811
 
662
- @classmethod
663
- def _get_size(cls, url: ParseResult) -> int:
664
- try:
665
- src_obj = get_object_for_url(url, existing=True, anonymous=True)
666
- except PermissionError:
667
- src_obj = get_object_for_url(url, existing=True)
668
- return src_obj.content_length
669
-
670
- @classmethod
671
- def _read_from_url(cls, url: ParseResult, writable):
672
- try:
673
- src_obj = get_object_for_url(url, existing=True, anonymous=True)
674
- src_obj.download_fileobj(writable)
675
- except Exception as e:
676
- if isinstance(e, PermissionError) or (isinstance(e, ClientError) and get_error_status(e) == 403):
677
- # The object setup or the download does not have permission. Try again with a login.
678
- src_obj = get_object_for_url(url, existing=True)
679
- src_obj.download_fileobj(writable)
680
- else:
681
- raise
682
- return (src_obj.content_length, False) # executable bit is always False
683
-
684
812
  @classmethod
685
813
  def _open_url(cls, url: ParseResult) -> IO[bytes]:
686
814
  try:
@@ -696,1415 +824,186 @@ class AWSJobStore(AbstractJobStore, URLAccess):
696
824
  # We should get back a response with a stream in 'Body'
697
825
  if "Body" not in response:
698
826
  raise RuntimeError(f"Could not fetch body stream for {url}")
699
- return response["Body"]
827
+ return response["Body"] # type: ignore
700
828
 
701
829
  @classmethod
702
- def _write_to_url(
703
- cls, readable, url: ParseResult, executable: bool = False
704
- ) -> None:
705
- # Don't try to do anonymous writes.
706
- dstObj = get_object_for_url(url)
707
-
708
- logger.debug("Uploading %s", dstObj.key)
709
- # uploadFile takes care of using multipart upload if the file is larger than partSize (default to 5MB)
710
- uploadFile(
711
- readable=readable,
712
- resource=s3_boto3_resource,
713
- bucketName=dstObj.bucket_name,
714
- fileID=dstObj.key,
715
- partSize=5 * 1000 * 1000,
716
- )
830
+ def _list_url(cls, url: ParseResult) -> list[str]:
831
+ return list_objects_for_url(url)
717
832
 
718
833
  @classmethod
719
- def _list_url(cls, url: ParseResult) -> list[str]:
834
+ def _supports_url(cls, url: ParseResult, export: bool = False) -> bool:
835
+ # TODO: export seems unused
836
+ return url.scheme.lower() == 's3'
837
+
838
+ def get_public_url(self, file_id: str) -> str:
839
+ """Turn s3:// into http:// and put a public-read ACL on it."""
720
840
  try:
721
- return list_objects_for_url(url, anonymous=True)
722
- except PermissionError:
723
- return list_objects_for_url(url)
724
-
841
+ return create_public_url(
842
+ self.s3_resource,
843
+ bucket=self.bucket_name,
844
+ key=self._key_in_bucket(
845
+ identifier=file_id,
846
+ prefix=self.content_key_prefix,
847
+ ),
848
+ )
849
+ except self.s3_client.exceptions.NoSuchKey:
850
+ raise NoSuchFileException(file_id)
851
+ except ClientError as e:
852
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
853
+ raise NoSuchFileException(file_id)
854
+ else:
855
+ raise
856
+
857
+ def get_shared_public_url(self, file_id: str) -> str:
858
+ """Turn s3:// into http:// and put a public-read ACL on it."""
859
+ # since this is only for a few files like "config.pickle"... why and what is this used for?
860
+ self._requireValidSharedFileName(file_id)
861
+ try:
862
+ return create_public_url(
863
+ self.s3_resource,
864
+ bucket=self.bucket_name,
865
+ key=self._key_in_bucket(
866
+ identifier=file_id,
867
+ prefix=self.shared_key_prefix,
868
+ ),
869
+ )
870
+ except self.s3_client.exceptions.NoSuchKey:
871
+ raise NoSuchFileException(file_id)
872
+ except ClientError as e:
873
+ if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
874
+ raise NoSuchFileException(file_id)
875
+ else:
876
+ raise
725
877
 
726
878
  @classmethod
727
879
  def _get_is_directory(cls, url: ParseResult) -> bool:
728
880
  # We consider it a directory if anything is in it.
729
881
  # TODO: Can we just get the first item and not the whole list?
730
- return len(cls._list_url(url)) > 0
731
-
732
- @classmethod
733
- def _supports_url(cls, url: ParseResult, export: bool = False) -> bool:
734
- return url.scheme.lower() == "s3"
882
+ return len(list_objects_for_url(url)) > 0
735
883
 
736
- def write_file(
737
- self, local_path: FileID, job_id: Optional[FileID] = None, cleanup: bool = False
738
- ) -> FileID:
739
- info = self.FileInfo.create(job_id if cleanup else None)
740
- info.upload(local_path, not self.config.disableJobStoreChecksumVerification)
741
- info.save()
742
- logger.debug("Wrote %r of from %r", info, local_path)
743
- return info.fileID
744
-
745
- @contextmanager
746
- def write_file_stream(
884
+ def get_empty_file_store_id(
747
885
  self,
748
- job_id: Optional[FileID] = None,
886
+ job_id: Optional[str] = None,
749
887
  cleanup: bool = False,
750
- basename=None,
751
- encoding=None,
752
- errors=None,
753
- ):
754
- info = self.FileInfo.create(job_id if cleanup else None)
755
- with info.uploadStream(encoding=encoding, errors=errors) as writable:
756
- yield writable, info.fileID
757
- info.save()
758
- logger.debug("Wrote %r.", info)
759
-
760
- @contextmanager
761
- def write_shared_file_stream(
762
- self, shared_file_name, encrypted=None, encoding=None, errors=None
763
- ):
764
- self._requireValidSharedFileName(shared_file_name)
765
- info = self.FileInfo.loadOrCreate(
766
- jobStoreFileID=self._shared_file_id(shared_file_name),
767
- ownerID=str(self.sharedFileOwnerID),
768
- encrypted=encrypted,
769
- )
770
- with info.uploadStream(encoding=encoding, errors=errors) as writable:
771
- yield writable
772
- info.save()
773
- logger.debug("Wrote %r for shared file %r.", info, shared_file_name)
774
-
775
- def update_file(self, file_id, local_path):
776
- info = self.FileInfo.loadOrFail(file_id)
777
- info.upload(local_path, not self.config.disableJobStoreChecksumVerification)
778
- info.save()
779
- logger.debug("Wrote %r from path %r.", info, local_path)
780
-
781
- @contextmanager
782
- def update_file_stream(self, file_id, encoding=None, errors=None):
783
- info = self.FileInfo.loadOrFail(file_id)
784
- with info.uploadStream(encoding=encoding, errors=errors) as writable:
785
- yield writable
786
- info.save()
787
- logger.debug("Wrote %r from stream.", info)
788
-
789
- def file_exists(self, file_id):
790
- return self.FileInfo.exists(file_id)
791
-
792
- def get_file_size(self, file_id):
793
- if not self.file_exists(file_id):
794
- return 0
795
- info = self.FileInfo.loadOrFail(file_id)
796
- return info.getSize()
797
-
798
- def read_file(self, file_id, local_path, symlink=False):
799
- info = self.FileInfo.loadOrFail(file_id)
800
- logger.debug("Reading %r into %r.", info, local_path)
801
- info.download(local_path, not self.config.disableJobStoreChecksumVerification)
802
- if getattr(file_id, "executable", False):
803
- os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR)
804
-
805
- @contextmanager
806
- def read_file_stream(self, file_id, encoding=None, errors=None):
807
- info = self.FileInfo.loadOrFail(file_id)
808
- logger.debug("Reading %r into stream.", info)
809
- with info.downloadStream(encoding=encoding, errors=errors) as readable:
810
- yield readable
811
-
812
- @contextmanager
813
- def read_shared_file_stream(self, shared_file_name, encoding=None, errors=None):
814
- self._requireValidSharedFileName(shared_file_name)
815
- jobStoreFileID = self._shared_file_id(shared_file_name)
816
- info = self.FileInfo.loadOrFail(jobStoreFileID, customName=shared_file_name)
817
- logger.debug(
818
- "Reading %r for shared file %r into stream.", info, shared_file_name
819
- )
820
- with info.downloadStream(encoding=encoding, errors=errors) as readable:
821
- yield readable
822
-
823
- def delete_file(self, file_id):
824
- info = self.FileInfo.load(file_id)
825
- if info is None:
826
- logger.debug("File %s does not exist, skipping deletion.", file_id)
827
- else:
828
- info.delete()
829
-
830
- def write_logs(self, msg):
831
- info = self.FileInfo.create(str(self.statsFileOwnerID))
832
- with info.uploadStream(multipart=False) as writeable:
833
- if isinstance(msg, str):
834
- # This stream is for binary data, so encode any non-encoded things
835
- msg = msg.encode("utf-8", errors="ignore")
836
- writeable.write(msg)
837
- info.save()
838
-
839
- def read_logs(self, callback, read_all=False):
840
- itemsProcessed = 0
841
-
842
- for info in self._read_logs(callback, self.statsFileOwnerID):
843
- info._ownerID = str(self.readStatsFileOwnerID) # boto3 requires strings
844
- info.save()
845
- itemsProcessed += 1
846
-
847
- if read_all:
848
- for _ in self._read_logs(callback, self.readStatsFileOwnerID):
849
- itemsProcessed += 1
850
-
851
- return itemsProcessed
852
-
853
- def _read_logs(self, callback, ownerId):
854
- items = None
855
- for attempt in retry_sdb():
856
- with attempt:
857
- items = boto3_pager(
858
- self.db.select,
859
- "Items",
860
- ConsistentRead=True,
861
- SelectExpression=f"select * from `{self.files_domain_name}` where ownerID='{str(ownerId)}'",
862
- )
863
- assert items is not None
864
- for item in items:
865
- info = self.FileInfo.fromItem(item)
866
- with info.downloadStream() as readable:
867
- callback(readable)
868
- yield info
869
-
870
- # TODO: Make this retry more specific?
871
- # example: https://github.com/DataBiosphere/toil/issues/3378
872
- @retry()
873
- def get_public_url(self, jobStoreFileID):
874
- info = self.FileInfo.loadOrFail(jobStoreFileID)
875
- if info.content is not None:
876
- with info.uploadStream(allowInlining=False) as f:
877
- f.write(info.content)
878
-
879
- self.files_bucket.Object(compat_bytes(jobStoreFileID)).Acl().put(
880
- ACL="public-read"
888
+ basename: Optional[str] = None,
889
+ ) -> str:
890
+ """Create an empty file in s3 and return a bare string file ID."""
891
+ file_id = str(uuid.uuid4())
892
+ self.write_to_bucket(
893
+ identifier=f'{file_id}/0/{basename}',
894
+ prefix=self.content_key_prefix,
895
+ data=None,
896
+ bucket=self.bucket_name
881
897
  )
882
-
883
- url = self.s3_client.generate_presigned_url(
884
- "get_object",
885
- Params={
886
- "Bucket": self.files_bucket.name,
887
- "Key": compat_bytes(jobStoreFileID),
888
- "VersionId": info.version,
889
- },
890
- ExpiresIn=self.publicUrlExpiration.total_seconds(),
898
+ if job_id and cleanup:
899
+ self.associate_job_with_file(job_id, file_id)
900
+ return file_id
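A small sketch of the object that lands in the bucket for an empty file, using the same key shape as the call above (file ID, then a 0 component, then the basename); the prefix, bucket, and basename are assumptions, and write_to_bucket presumably layers encryption arguments on top of a put like this.

import uuid
import boto3

client = boto3.client("s3")
content_key_prefix = "files/"                        # assumed value of content_key_prefix
file_id = str(uuid.uuid4())
key = f"{content_key_prefix}{file_id}/0/output.txt"  # mirrors f'{file_id}/0/{basename}'
client.put_object(Bucket="my-toil-jobstore--toil", Key=key, Body=b"")  # zero-byte object
print(file_id)  # the bare string file ID handed back to the caller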
901
+
902
+ ###################################### LOGGING API ######################################
903
+
904
+ def write_logs(self, log_msg: Union[bytes, str]) -> None:
905
+ if isinstance(log_msg, str):
906
+ log_msg = log_msg.encode('utf-8', errors='ignore')
907
+ file_obj = BytesIO(log_msg)
908
+
909
+ key_name = self._key_in_bucket(
910
+ identifier=f'{datetime.datetime.now()}{str(uuid.uuid4())}'.replace(
911
+ ' ', '_'
912
+ ),
913
+ prefix=self.logs_key_prefix,
891
914
  )
915
+ self.s3_client.upload_fileobj(Bucket=self.bucket_name,
916
+ Key=key_name,
917
+ ExtraArgs=self._get_encryption_args(),
918
+ Fileobj=file_obj)
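The log key written above is just a timestamp (with spaces replaced by underscores) followed by a UUID under the logs prefix; a quick illustration, with "logs/" standing in for whatever logs_key_prefix actually is.

import datetime
import uuid

logs_key_prefix = "logs/"    # assumed prefix
key_name = logs_key_prefix + f"{datetime.datetime.now()}{uuid.uuid4()}".replace(" ", "_")
print(key_name)   # e.g. logs/2025-06-01_12:34:56.789012<uuid>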
892
919
 
893
- # boto doesn't properly remove the x-amz-security-token parameter when
894
- # query_auth is False when using an IAM role (see issue #2043). Including the
895
- # x-amz-security-token parameter without the access key results in a 403,
896
- # even if the resource is public, so we need to remove it.
897
- scheme, netloc, path, query, fragment = urlsplit(url)
898
- params = parse_qs(query)
899
- if "x-amz-security-token" in params:
900
- del params["x-amz-security-token"]
901
- if "AWSAccessKeyId" in params:
902
- del params["AWSAccessKeyId"]
903
- if "Signature" in params:
904
- del params["Signature"]
905
- query = urlencode(params, doseq=True)
906
- url = urlunsplit((scheme, netloc, path, query, fragment))
907
- return url
908
-
909
- def get_shared_public_url(self, shared_file_name):
910
- self._requireValidSharedFileName(shared_file_name)
911
- return self.get_public_url(self._shared_file_id(shared_file_name))
912
-
913
- def _bindBucket(
914
- self,
915
- bucket_name: str,
916
- create: bool = False,
917
- block: bool = True,
918
- versioning: bool = False,
919
- check_versioning_consistency: bool = True,
920
- ):
920
+ def read_logs(self, callback: Callable[..., Any], read_all: bool = False) -> int:
921
921
  """
922
- Return the Boto Bucket object representing the S3 bucket with the given name. If the
923
- bucket does not exist and `create` is True, it will be created.
924
-
925
- :param str bucket_name: the name of the bucket to bind to
926
-
927
- :param bool create: Whether to create bucket the if it doesn't exist
928
-
929
- :param bool block: If False, return None if the bucket doesn't exist. If True, wait until
930
- bucket appears. Ignored if `create` is True.
931
-
932
- :rtype: Bucket|None
933
- :raises botocore.exceptions.ClientError: If `block` is True and the bucket still doesn't exist after the
934
- retry timeout expires.
922
+ This fetches all referenced logs in the database from s3 as readable objects
923
+ and runs "callback()" on them.
935
924
  """
936
- assert self.minBucketNameLen <= len(bucket_name) <= self.maxBucketNameLen
937
- assert self.bucketNameRe.match(bucket_name)
938
- logger.debug("Binding to job store bucket '%s'.", bucket_name)
939
-
940
- def bucket_retry_predicate(error):
941
- """
942
- Decide, given an error, whether we should retry binding the bucket.
943
- """
944
-
945
- if isinstance(error, ClientError) and get_error_status(error) in (404, 409):
946
- # Handle cases where the bucket creation is in a weird state that might let us proceed.
947
- # https://github.com/BD2KGenomics/toil/issues/955
948
- # https://github.com/BD2KGenomics/toil/issues/995
949
- # https://github.com/BD2KGenomics/toil/issues/1093
950
-
951
- # BucketAlreadyOwnedByYou == 409
952
- # OperationAborted == 409
953
- # NoSuchBucket == 404
954
- return True
955
- if get_error_code(error) == "SlowDown":
956
- # We may get told to SlowDown by AWS when we try to create our
957
- # bucket. In that case, we should retry and use the exponential
958
- # backoff.
959
- return True
960
- return False
961
-
962
- bucketExisted = True
963
- for attempt in retry_s3(predicate=bucket_retry_predicate):
964
- with attempt:
965
- try:
966
- # the head_bucket() call makes sure that the bucket exists and the user can access it
967
- self.s3_client.head_bucket(Bucket=bucket_name)
968
-
969
- bucket = self.s3_resource.Bucket(bucket_name)
970
- except ClientError as e:
971
- error_http_status = get_error_status(e)
972
- if error_http_status == 404:
973
- bucketExisted = False
974
- logger.debug("Bucket '%s' does not exist.", bucket_name)
975
- if create:
976
- bucket = create_s3_bucket(
977
- self.s3_resource, bucket_name, self.region
978
- )
979
- # Wait until the bucket exists before checking the region and adding tags
980
- bucket.wait_until_exists()
981
-
982
- # It is possible for create_bucket to return but
983
- # for an immediate request for the bucket region to
984
- # produce an S3ResponseError with code
985
- # NoSuchBucket. We let that kick us back up to the
986
- # main retry loop.
987
- assert (
988
- get_bucket_region(bucket_name) == self.region
989
- ), f"bucket_name: {bucket_name}, {get_bucket_region(bucket_name)} != {self.region}"
990
-
991
- tags = build_tag_dict_from_env()
992
-
993
- if tags:
994
- flat_tags = flatten_tags(tags)
995
- bucket_tagging = self.s3_resource.BucketTagging(
996
- bucket_name
997
- )
998
- bucket_tagging.put(Tagging={"TagSet": flat_tags})
999
-
1000
- # Configure bucket so that we can make objects in
1001
- # it public, which was the historical default.
1002
- enable_public_objects(bucket_name)
1003
- elif block:
1004
- raise
1005
- else:
1006
- return None
1007
- elif error_http_status == 301:
1008
- # This is raised if the user attempts to get a bucket in a region outside
1009
- # the specified one, if the specified one is not `us-east-1`. The us-east-1
1010
- # server allows a user to use buckets from any region.
1011
- raise BucketLocationConflictException(
1012
- get_bucket_region(bucket_name)
1013
- )
1014
- else:
1015
- raise
1016
- else:
1017
- bucketRegion = get_bucket_region(bucket_name)
1018
- if bucketRegion != self.region:
1019
- raise BucketLocationConflictException(bucketRegion)
1020
-
1021
- if versioning and not bucketExisted:
1022
- # only call this method on bucket creation
1023
- bucket.Versioning().enable()
1024
- # Now wait until versioning is actually on. Some uploads
1025
- # would come back with no versions; maybe they were
1026
- # happening too fast and this setting isn't sufficiently
1027
- # consistent?
1028
- time.sleep(1)
1029
- while not self._getBucketVersioning(bucket_name):
1030
- logger.warning(
1031
- f"Waiting for versioning activation on bucket '{bucket_name}'..."
1032
- )
1033
- time.sleep(1)
1034
- elif check_versioning_consistency:
1035
- # now test for versioning consistency
1036
- # we should never see any of these errors since 'versioning' should always be true
1037
- bucket_versioning = self._getBucketVersioning(bucket_name)
1038
- if bucket_versioning != versioning:
1039
- assert False, "Cannot modify versioning on existing bucket"
1040
- elif bucket_versioning is None:
1041
- assert False, "Cannot use a bucket with versioning suspended"
1042
- if bucketExisted:
1043
- logger.debug(
1044
- f"Using pre-existing job store bucket '{bucket_name}'."
1045
- )
1046
- else:
1047
- logger.debug(
1048
- f"Created new job store bucket '{bucket_name}' with versioning state {versioning}."
1049
- )
1050
-
1051
- return bucket
1052
-
1053
- def _bindDomain(
1054
- self, domain_name: str, create: bool = False, block: bool = True
1055
- ) -> None:
925
+ items_processed = 0
926
+ LOG_MARKER = "most_recently_read_log.marker"
927
+ read_log_marker = "0"
928
+ if not read_all:
929
+ # We want to pick up reading where we left off
930
+ try:
931
+ read_log_marker = self.read_from_bucket(
932
+ identifier=LOG_MARKER,
933
+ prefix=self.shared_key_prefix
934
+ ).decode('utf-8')
935
+ except self.s3_client.exceptions.NoSuchKey:
936
+ # We haven't recorded that we've read anything yet.
937
+ # Leave read_log_marker at "0"
938
+ pass
939
+
940
+ startafter = None if read_log_marker == "0" else read_log_marker
941
+ for result in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=self.logs_key_prefix, startafter=startafter):
942
+ if result['Key'] > read_log_marker or read_all:
943
+ read_log_marker = result['Key']
944
+ with download_stream(self.s3_resource,
945
+ bucket=self.bucket_name,
946
+ key=result['Key'],
947
+ extra_args=self._get_encryption_args()) as readable:
948
+ callback(readable)
949
+ items_processed += 1
950
+
951
+ if items_processed > 0:
952
+ # We processed something, so we need to update the marker.
953
+ self.write_to_bucket(identifier=LOG_MARKER,
954
+ prefix=self.shared_key_prefix,
955
+ data=read_log_marker)
956
+ return items_processed
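A minimal standalone sketch of the marker scheme read_logs() relies on, written with plain boto3 paginators instead of Toil's list_s3_items/download_stream helpers; the bucket name and prefixes are hypothetical and encryption arguments are omitted.

import boto3

client = boto3.client("s3")
BUCKET = "my-toil-jobstore--toil"
MARKER_KEY = "shared/most_recently_read_log.marker"   # assumed shared_key_prefix
LOG_PREFIX = "logs/"                                   # assumed logs_key_prefix

def read_new_logs(callback) -> int:
    try:
        marker = client.get_object(Bucket=BUCKET, Key=MARKER_KEY)["Body"].read().decode()
    except client.exceptions.NoSuchKey:
        marker = ""                                    # nothing has been read yet
    processed = 0
    list_kwargs = {"Bucket": BUCKET, "Prefix": LOG_PREFIX}
    if marker:
        list_kwargs["StartAfter"] = marker             # skip keys already handled
    for page in client.get_paginator("list_objects_v2").paginate(**list_kwargs):
        for item in page.get("Contents", []):
            callback(client.get_object(Bucket=BUCKET, Key=item["Key"])["Body"])
            marker = item["Key"]
            processed += 1
    if processed:
        # Remember how far we got so the next call starts after this key.
        client.put_object(Bucket=BUCKET, Key=MARKER_KEY, Body=marker.encode())
    return processed

read_new_logs(lambda stream: print(stream.read()))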
957
+
958
+ def _get_encryption_args(self) -> dict[str, Any]:
1056
959
  """
1057
- Return the Boto3 domain name representing the SDB domain. When create=True, it will
1058
- create the domain if it does not exist.
1059
- Return the Boto Domain object representing the SDB domain of the given name. If the
1060
- domain does not exist and `create` is True, it will be created.
1061
-
1062
- :param str domain_name: the name of the domain to bind to
960
+ Get the encryption arguments to pass to an AWS function.
1063
961
 
1064
- :param bool create: True if domain should be created if it doesn't exist
962
+ Reads live from the SSE key file referenced by the config.
1065
963
 
1066
- :param bool block: If False, raise DomainDoesNotExist if the domain doesn't exist. If True, wait until
1067
- domain appears. This parameter is ignored if create is True.
964
+ If the config is not available, returns an empty dict.
1068
965
 
1069
- :rtype: None
1070
- :raises ClientError: If `block` is True and the domain still doesn't exist after the
1071
- retry timeout expires.
1072
- """
1073
- logger.debug("Binding to job store domain '%s'.", domain_name)
1074
- retryargs = dict(
1075
- predicate=lambda e: no_such_sdb_domain(e) or sdb_unavailable(e)
1076
- )
1077
- if not block:
1078
- retryargs["timeout"] = 15
1079
- for attempt in retry_sdb(**retryargs):
1080
- with attempt:
1081
- try:
1082
- self.db.domain_metadata(DomainName=domain_name)
1083
- return
1084
- except ClientError as e:
1085
- if no_such_sdb_domain(e):
1086
- if create:
1087
- self.db.create_domain(DomainName=domain_name)
1088
- return
1089
- elif block:
1090
- raise
1091
- else:
1092
- raise DomainDoesNotExist(domain_name)
1093
- else:
1094
- raise
1095
-
1096
- def _new_job_id(self):
1097
- return str(uuid.uuid4())
1098
-
1099
- # A dummy job ID under which all shared files are stored
1100
- sharedFileOwnerID = uuid.UUID("891f7db6-e4d9-4221-a58e-ab6cc4395f94")
1101
-
1102
- # A dummy job ID under which all unread stats files are stored
1103
- statsFileOwnerID = uuid.UUID("bfcf5286-4bc7-41ef-a85d-9ab415b69d53")
1104
-
1105
- # A dummy job ID under which all read stats files are stored
1106
- readStatsFileOwnerID = uuid.UUID("e77fc3aa-d232-4255-ae04-f64ee8eb0bfa")
1107
-
1108
- def _shared_file_id(self, shared_file_name):
1109
- return str(uuid.uuid5(self.sharedFileOwnerID, shared_file_name))
1110
-
1111
- @InnerClass
1112
- class FileInfo(SDBHelper):
966
+ :raises ValueError: If the key data is not formatted correctly.
1113
967
  """
1114
- Represents a file in this job store.
1115
- """
1116
-
1117
- outer = None
1118
- """
1119
- :type: AWSJobStore
1120
- """
1121
-
1122
- def __init__(
1123
- self,
1124
- fileID,
1125
- ownerID,
1126
- encrypted,
1127
- version=None,
1128
- content=None,
1129
- numContentChunks=0,
1130
- checksum=None,
1131
- ):
1132
- """
1133
- :type fileID: str
1134
- :param fileID: the file's ID
1135
-
1136
- :type ownerID: str
1137
- :param ownerID: ID of the entity owning this file, typically a job ID aka jobStoreID
1138
-
1139
- :type encrypted: bool
1140
- :param encrypted: whether the file is stored in encrypted form
1141
-
1142
- :type version: str|None
1143
- :param version: a non-empty string containing the most recent version of the S3
1144
- object storing this file's content, None if the file is new, or empty string if the
1145
- file is inlined.
1146
-
1147
- :type content: str|None
1148
- :param content: this file's inlined content
1149
-
1150
- :type numContentChunks: int
1151
- :param numContentChunks: the number of SDB domain attributes occupied by this files
1152
-
1153
- :type checksum: str|None
1154
- :param checksum: the checksum of the file, if available. Formatted
1155
- as <algorithm>$<lowercase hex hash>.
1156
-
1157
- inlined content. Note that an inlined empty string still occupies one chunk.
1158
- """
1159
- super().__init__()
1160
- self._fileID = fileID
1161
- self._ownerID = ownerID
1162
- self.encrypted = encrypted
1163
- self._version = version
1164
- self._previousVersion = version
1165
- self._content = content
1166
- self._checksum = checksum
1167
- self._numContentChunks = numContentChunks
1168
-
1169
- @property
1170
- def fileID(self):
1171
- return self._fileID
1172
-
1173
- @property
1174
- def ownerID(self):
1175
- return self._ownerID
1176
-
1177
- @property
1178
- def version(self):
1179
- return self._version
1180
-
1181
- @version.setter
1182
- def version(self, version):
1183
- # Version should only change once
1184
- assert self._previousVersion == self._version
1185
- self._version = version
1186
- if version:
1187
- self.content = None
1188
-
1189
- @property
1190
- def previousVersion(self):
1191
- return self._previousVersion
1192
-
1193
- @property
1194
- def content(self):
1195
- return self._content
1196
-
1197
- @property
1198
- def checksum(self):
1199
- return self._checksum
1200
-
1201
- @checksum.setter
1202
- def checksum(self, checksum):
1203
- self._checksum = checksum
1204
-
1205
- @content.setter
1206
- def content(self, content):
1207
- assert content is None or isinstance(content, bytes)
1208
- self._content = content
1209
- if content is not None:
1210
- self.version = ""
1211
-
1212
- @classmethod
1213
- def create(cls, ownerID: str):
1214
- return cls(
1215
- str(uuid.uuid4()), ownerID, encrypted=cls.outer.sseKeyPath is not None
1216
- )
1217
-
1218
- @classmethod
1219
- def presenceIndicator(cls):
1220
- return "encrypted"
1221
-
1222
- @classmethod
1223
- def exists(cls, jobStoreFileID):
1224
- for attempt in retry_sdb():
1225
- with attempt:
1226
- return bool(
1227
- cls.outer.db.get_attributes(
1228
- DomainName=cls.outer.files_domain_name,
1229
- ItemName=compat_bytes(jobStoreFileID),
1230
- AttributeNames=[cls.presenceIndicator()],
1231
- ConsistentRead=True,
1232
- ).get("Attributes", [])
1233
- )
1234
-
1235
- @classmethod
1236
- def load(cls, jobStoreFileID):
1237
- for attempt in retry_sdb():
1238
- with attempt:
1239
- self = cls.fromItem(
1240
- {
1241
- "Name": compat_bytes(jobStoreFileID),
1242
- "Attributes": cls.outer.db.get_attributes(
1243
- DomainName=cls.outer.files_domain_name,
1244
- ItemName=compat_bytes(jobStoreFileID),
1245
- ConsistentRead=True,
1246
- ).get("Attributes", []),
1247
- }
1248
- )
1249
- return self
1250
-
1251
- @classmethod
1252
- def loadOrCreate(cls, jobStoreFileID, ownerID, encrypted):
1253
- self = cls.load(jobStoreFileID)
1254
- if encrypted is None:
1255
- encrypted = cls.outer.sseKeyPath is not None
1256
- if self is None:
1257
- self = cls(jobStoreFileID, ownerID, encrypted=encrypted)
1258
- else:
1259
- assert self.fileID == jobStoreFileID
1260
- assert self.ownerID == ownerID
1261
- self.encrypted = encrypted
1262
- return self
1263
-
1264
- @classmethod
1265
- def loadOrFail(cls, jobStoreFileID, customName=None):
1266
- """
1267
- :rtype: AWSJobStore.FileInfo
1268
- :return: an instance of this class representing the file with the given ID
1269
- :raises NoSuchFileException: if given file does not exist
1270
- """
1271
- self = cls.load(jobStoreFileID)
1272
- if self is None:
1273
- raise NoSuchFileException(jobStoreFileID, customName=customName)
1274
- else:
1275
- return self
1276
-
1277
- @classmethod
1278
- def fromItem(cls, item: "ItemTypeDef"):
1279
- """
1280
- Convert an SDB item to an instance of this class.
1281
-
1282
- :type item: Item
1283
- """
1284
- assert item is not None
1285
-
1286
- # Strings come back from SDB as unicode
1287
- def strOrNone(s):
1288
- return s if s is None else str(s)
1289
-
1290
- # ownerID and encrypted are the only mandatory attributes
1291
- ownerID, encrypted, version, checksum = SDBHelper.get_attributes_from_item(
1292
- item, ["ownerID", "encrypted", "version", "checksum"]
1293
- )
1294
- if ownerID is None:
1295
- assert encrypted is None
1296
- return None
1297
- else:
1298
- encrypted = strict_bool(encrypted)
1299
- content, numContentChunks = cls.attributesToBinary(item["Attributes"])
1300
- if encrypted:
1301
- sseKeyPath = cls.outer.sseKeyPath
1302
- if sseKeyPath is None:
1303
- raise AssertionError(
1304
- "Content is encrypted but no key was provided."
1305
- )
1306
- if content is not None:
1307
- content = encryption.decrypt(content, sseKeyPath)
1308
- self = cls(
1309
- fileID=item["Name"],
1310
- ownerID=ownerID,
1311
- encrypted=encrypted,
1312
- version=version,
1313
- content=content,
1314
- numContentChunks=numContentChunks,
1315
- checksum=checksum,
1316
- )
1317
- return self
1318
-
1319
- def toItem(self) -> tuple[dict[str, str], int]:
1320
- """
1321
- Convert this instance to a dictionary of attribute names to values
1322
-
1323
- :return: the attributes dict and an integer specifying the the number of chunk
1324
- attributes in the dictionary that are used for storing inlined content.
1325
- """
1326
- content = self.content
1327
- assert content is None or isinstance(content, bytes)
1328
- if self.encrypted and content is not None:
1329
- sseKeyPath = self.outer.sseKeyPath
1330
- if sseKeyPath is None:
1331
- raise AssertionError(
1332
- "Encryption requested but no key was provided."
1333
- )
1334
- content = encryption.encrypt(content, sseKeyPath)
1335
- assert content is None or isinstance(content, bytes)
1336
- attributes = self.binaryToAttributes(content)
1337
- numChunks = int(attributes["numChunks"])
1338
- attributes.update(
1339
- dict(
1340
- ownerID=self.ownerID or "",
1341
- encrypted=str(self.encrypted),
1342
- version=self.version or "",
1343
- checksum=self.checksum or "",
1344
- )
1345
- )
1346
- return attributes, numChunks
1347
-
1348
- @classmethod
1349
- def _reservedAttributes(cls):
1350
- return 3 + super()._reservedAttributes()
1351
-
1352
- @staticmethod
1353
- def maxInlinedSize():
1354
- return 256
1355
-
1356
- def save(self):
1357
- attributes, numNewContentChunks = self.toItem()
1358
- attributes_boto3 = SDBHelper.attributeDictToList(attributes)
1359
- # False stands for absence
1360
- if self.previousVersion is None:
1361
- expected: "UpdateConditionTypeDef" = {
1362
- "Name": "version",
1363
- "Exists": False,
1364
- }
1365
- else:
1366
- expected = {"Name": "version", "Value": cast(str, self.previousVersion)}
1367
- try:
1368
- for attempt in retry_sdb():
1369
- with attempt:
1370
- self.outer.db.put_attributes(
1371
- DomainName=self.outer.files_domain_name,
1372
- ItemName=compat_bytes(self.fileID),
1373
- Attributes=[
1374
- {
1375
- "Name": attribute["Name"],
1376
- "Value": attribute["Value"],
1377
- "Replace": True,
1378
- }
1379
- for attribute in attributes_boto3
1380
- ],
1381
- Expected=expected,
1382
- )
1383
- # clean up the old version of the file if necessary and safe
1384
- if self.previousVersion and (self.previousVersion != self.version):
1385
- for attempt in retry_s3():
1386
- with attempt:
1387
- self.outer.s3_client.delete_object(
1388
- Bucket=self.outer.files_bucket.name,
1389
- Key=compat_bytes(self.fileID),
1390
- VersionId=self.previousVersion,
1391
- )
1392
- self._previousVersion = self._version
1393
- if numNewContentChunks < self._numContentChunks:
1394
- residualChunks = range(numNewContentChunks, self._numContentChunks)
1395
- residual_chunk_names = [self._chunkName(i) for i in residualChunks]
1396
- # boto3 requires providing the value as well as the name in the attribute, and we don't store it locally
1397
- # the php sdk resolves this issue by not requiring the Value key https://github.com/aws/aws-sdk-php/issues/185
1398
- # but this doesnt extend to boto3
1399
- delete_attributes = self.outer.db.get_attributes(
1400
- DomainName=self.outer.files_domain_name,
1401
- ItemName=compat_bytes(self.fileID),
1402
- AttributeNames=[chunk for chunk in residual_chunk_names],
1403
- ).get("Attributes")
1404
- for attempt in retry_sdb():
1405
- with attempt:
1406
- self.outer.db.delete_attributes(
1407
- DomainName=self.outer.files_domain_name,
1408
- ItemName=compat_bytes(self.fileID),
1409
- Attributes=delete_attributes,
1410
- )
1411
- self.outer.db.get_attributes(
1412
- DomainName=self.outer.files_domain_name,
1413
- ItemName=compat_bytes(self.fileID),
1414
- )
1415
-
1416
- self._numContentChunks = numNewContentChunks
1417
- except ClientError as e:
1418
- if get_error_code(e) == "ConditionalCheckFailed":
1419
- raise ConcurrentFileModificationException(self.fileID)
1420
- else:
1421
- raise
1422
-
1423
- def upload(self, localFilePath, calculateChecksum=True):
1424
- file_size, file_time = fileSizeAndTime(localFilePath)
1425
- if file_size <= self.maxInlinedSize():
1426
- with open(localFilePath, "rb") as f:
1427
- self.content = f.read()
1428
- # Clear out any old checksum in case of overwrite
1429
- self.checksum = ""
1430
- else:
1431
- headerArgs = self._s3EncryptionArgs()
1432
- # Create a new Resource in case it needs to be on its own thread
1433
- resource = boto3_session.resource("s3", region_name=self.outer.region)
1434
-
1435
- self.checksum = (
1436
- self._get_file_checksum(localFilePath)
1437
- if calculateChecksum
1438
- else None
1439
- )
1440
- self.version = uploadFromPath(
1441
- localFilePath,
1442
- resource=resource,
1443
- bucketName=self.outer.files_bucket.name,
1444
- fileID=compat_bytes(self.fileID),
1445
- headerArgs=headerArgs,
1446
- partSize=self.outer.part_size,
1447
- )
1448
-
1449
- def _start_checksum(self, to_match=None, algorithm="sha1"):
1450
- """
1451
- Get a hasher that can be used with _update_checksum and
1452
- _finish_checksum.
1453
-
1454
- If to_match is set, it is a precomputed checksum which we expect
1455
- the result to match.
1456
-
1457
- The right way to compare checksums is to feed in the checksum to be
1458
- matched, so we can see its algorithm, instead of getting a new one
1459
- and comparing. If a checksum to match is fed in, _finish_checksum()
1460
- will raise a ChecksumError if it isn't matched.
1461
- """
1462
-
1463
- # If we have an expexted result it will go here.
1464
- expected = None
1465
-
1466
- if to_match is not None:
1467
- parts = to_match.split("$")
1468
- algorithm = parts[0]
1469
- expected = parts[1]
1470
-
1471
- wrapped = getattr(hashlib, algorithm)()
1472
- logger.debug(f"Starting {algorithm} checksum to match {expected}")
1473
- return algorithm, wrapped, expected
1474
-
1475
- def _update_checksum(self, checksum_in_progress, data):
1476
- """
1477
- Update a checksum in progress from _start_checksum with new data.
1478
- """
1479
- checksum_in_progress[1].update(data)
1480
-
1481
- def _finish_checksum(self, checksum_in_progress):
1482
- """
1483
- Complete a checksum in progress from _start_checksum and return the
1484
- checksum result string.
1485
- """
1486
-
1487
- result_hash = checksum_in_progress[1].hexdigest()
1488
-
1489
- logger.debug(
1490
- f"Completed checksum with hash {result_hash} vs. expected {checksum_in_progress[2]}"
1491
- )
1492
- if checksum_in_progress[2] is not None:
1493
- # We expected a particular hash
1494
- if result_hash != checksum_in_progress[2]:
1495
- raise ChecksumError(
1496
- "Checksum mismatch. Expected: %s Actual: %s"
1497
- % (checksum_in_progress[2], result_hash)
1498
- )
1499
-
1500
- return "$".join([checksum_in_progress[0], result_hash])
1501
-
1502
- def _get_file_checksum(self, localFilePath, to_match=None):
1503
- with open(localFilePath, "rb") as f:
1504
- hasher = self._start_checksum(to_match=to_match)
1505
- contents = f.read(1024 * 1024)
1506
- while contents != b"":
1507
- self._update_checksum(hasher, contents)
1508
- contents = f.read(1024 * 1024)
1509
- return self._finish_checksum(hasher)
1510
-
1511
- @contextmanager
1512
- def uploadStream(
1513
- self, multipart=True, allowInlining=True, encoding=None, errors=None
1514
- ):
1515
- """
1516
- Context manager that gives out a binary or text mode upload stream to upload data.
1517
- """
1518
-
1519
- # Note that we have to handle already having a content or a version
1520
- # if we are overwriting something.
1521
-
1522
- # But make sure we don't have both.
1523
- assert not (bool(self.version) and self.content is not None)
1524
-
1525
- info = self
1526
- store = self.outer
1527
-
1528
- class MultiPartPipe(WritablePipe):
1529
- def readFrom(self, readable):
1530
- # Get the first block of data we want to put
1531
- buf = readable.read(store.part_size)
1532
- assert isinstance(buf, bytes)
1533
-
1534
- if allowInlining and len(buf) <= info.maxInlinedSize():
1535
- logger.debug("Inlining content of %d bytes", len(buf))
1536
- info.content = buf
1537
- # There will be no checksum
1538
- info.checksum = ""
1539
- else:
1540
- # We will compute a checksum
1541
- hasher = info._start_checksum()
1542
- logger.debug("Updating checksum with %d bytes", len(buf))
1543
- info._update_checksum(hasher, buf)
1544
-
1545
- client = store.s3_client
1546
- bucket_name = store.files_bucket.name
1547
- headerArgs = info._s3EncryptionArgs()
1548
-
1549
- for attempt in retry_s3():
1550
- with attempt:
1551
- logger.debug("Starting multipart upload")
1552
- # low-level clients are thread safe
1553
- upload = client.create_multipart_upload(
1554
- Bucket=bucket_name,
1555
- Key=compat_bytes(info.fileID),
1556
- **headerArgs,
1557
- )
1558
- uploadId = upload["UploadId"]
1559
- parts = []
1560
- logger.debug("Multipart upload started as %s", uploadId)
1561
-
1562
- for attempt in retry_s3():
1563
- with attempt:
1564
- for i in range(CONSISTENCY_TICKS):
1565
- # Sometimes we can create a multipart upload and not see it. Wait around for it.
1566
- response = client.list_multipart_uploads(
1567
- Bucket=bucket_name,
1568
- MaxUploads=1,
1569
- Prefix=compat_bytes(info.fileID),
1570
- )
1571
- if (
1572
- "Uploads" in response
1573
- and len(response["Uploads"]) != 0
1574
- and response["Uploads"][0]["UploadId"]
1575
- == uploadId
1576
- ):
1577
-
1578
- logger.debug(
1579
- "Multipart upload visible as %s", uploadId
1580
- )
1581
- break
1582
- else:
1583
- logger.debug(
1584
- "Multipart upload %s is not visible; we see %s",
1585
- uploadId,
1586
- response.get("Uploads"),
1587
- )
1588
- time.sleep(CONSISTENCY_TIME * 2**i)
1589
-
1590
- try:
1591
- for part_num in itertools.count():
1592
- for attempt in retry_s3():
1593
- with attempt:
1594
- logger.debug(
1595
- "Uploading part %d of %d bytes to %s",
1596
- part_num + 1,
1597
- len(buf),
1598
- uploadId,
1599
- )
1600
- # TODO: include the Content-MD5 header:
1601
- # https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload
1602
- part = client.upload_part(
1603
- Bucket=bucket_name,
1604
- Key=compat_bytes(info.fileID),
1605
- PartNumber=part_num + 1,
1606
- UploadId=uploadId,
1607
- Body=BytesIO(buf),
1608
- **headerArgs,
1609
- )
1610
-
1611
- parts.append(
1612
- {
1613
- "PartNumber": part_num + 1,
1614
- "ETag": part["ETag"],
1615
- }
1616
- )
1617
-
1618
- # Get the next block of data we want to put
1619
- buf = readable.read(info.outer.part_size)
1620
- assert isinstance(buf, bytes)
1621
- if len(buf) == 0:
1622
- # Don't allow any part other than the very first to be empty.
1623
- break
1624
- info._update_checksum(hasher, buf)
1625
- except:
1626
- with panic(log=logger):
1627
- for attempt in retry_s3():
1628
- with attempt:
1629
- client.abort_multipart_upload(
1630
- Bucket=bucket_name,
1631
- Key=compat_bytes(info.fileID),
1632
- UploadId=uploadId,
1633
- )
1634
-
1635
- else:
1636
-
1637
- while not store._getBucketVersioning(
1638
- store.files_bucket.name
1639
- ):
1640
- logger.warning(
1641
- "Versioning does not appear to be enabled yet. Deferring multipart "
1642
- "upload completion..."
1643
- )
1644
- time.sleep(1)
1645
-
1646
- # Save the checksum
1647
- info.checksum = info._finish_checksum(hasher)
1648
-
1649
- for attempt in retry_s3(timeout=600):
1650
- # Wait here for a bit longer if S3 breaks,
1651
- # because we have been known to flake out here
1652
- # in tests
1653
- # (https://github.com/DataBiosphere/toil/issues/3894)
1654
- with attempt:
1655
- logger.debug("Attempting to complete upload...")
1656
- completed = client.complete_multipart_upload(
1657
- Bucket=bucket_name,
1658
- Key=compat_bytes(info.fileID),
1659
- UploadId=uploadId,
1660
- MultipartUpload={"Parts": parts},
1661
- )
1662
-
1663
- logger.debug(
1664
- "Completed upload object of type %s: %s",
1665
- str(type(completed)),
1666
- repr(completed),
1667
- )
1668
- info.version = completed.get("VersionId")
1669
- logger.debug(
1670
- "Completed upload with version %s",
1671
- str(info.version),
1672
- )
1673
-
1674
- if info.version is None:
1675
- # Somehow we don't know the version. Try and get it.
1676
- for attempt in retry_s3(
1677
- predicate=lambda e: retryable_s3_errors(e)
1678
- or isinstance(e, AssertionError)
1679
- ):
1680
- with attempt:
1681
- version = client.head_object(
1682
- Bucket=bucket_name,
1683
- Key=compat_bytes(info.fileID),
1684
- **headerArgs,
1685
- ).get("VersionId", None)
1686
- logger.warning(
1687
- "Loaded key for upload with no version and got version %s",
1688
- str(version),
1689
- )
1690
- info.version = version
1691
- assert info.version is not None
1692
-
1693
- # Make sure we actually wrote something, even if an empty file
1694
- assert bool(info.version) or info.content is not None
1695
-
1696
- class SinglePartPipe(WritablePipe):
1697
- def readFrom(self, readable):
1698
- buf = readable.read()
1699
- assert isinstance(buf, bytes)
1700
- dataLength = len(buf)
1701
- if allowInlining and dataLength <= info.maxInlinedSize():
1702
- logger.debug("Inlining content of %d bytes", len(buf))
1703
- info.content = buf
1704
- # There will be no checksum
1705
- info.checksum = ""
1706
- else:
1707
- # We will compute a checksum
1708
- hasher = info._start_checksum()
1709
- info._update_checksum(hasher, buf)
1710
- info.checksum = info._finish_checksum(hasher)
1711
-
1712
- bucket_name = store.files_bucket.name
1713
- headerArgs = info._s3EncryptionArgs()
1714
- client = store.s3_client
1715
-
1716
- buf = BytesIO(buf)
1717
-
1718
- while not store._getBucketVersioning(bucket_name):
1719
- logger.warning(
1720
- "Versioning does not appear to be enabled yet. Deferring single part "
1721
- "upload..."
1722
- )
1723
- time.sleep(1)
1724
-
1725
- for attempt in retry_s3():
1726
- with attempt:
1727
- logger.debug(
1728
- "Uploading single part of %d bytes", dataLength
1729
- )
1730
- client.upload_fileobj(
1731
- Bucket=bucket_name,
1732
- Key=compat_bytes(info.fileID),
1733
- Fileobj=buf,
1734
- ExtraArgs=headerArgs,
1735
- )
1736
-
1737
- # use head_object with the SSE headers to access versionId and content_length attributes
1738
- headObj = client.head_object(
1739
- Bucket=bucket_name,
1740
- Key=compat_bytes(info.fileID),
1741
- **headerArgs,
1742
- )
1743
- assert dataLength == headObj.get("ContentLength", None)
1744
- info.version = headObj.get("VersionId", None)
1745
- logger.debug(
1746
- "Upload received version %s", str(info.version)
1747
- )
1748
-
1749
- if info.version is None:
1750
- # Somehow we don't know the version
1751
- for attempt in retry_s3(
1752
- predicate=lambda e: retryable_s3_errors(e)
1753
- or isinstance(e, AssertionError)
1754
- ):
1755
- with attempt:
1756
- headObj = client.head_object(
1757
- Bucket=bucket_name,
1758
- Key=compat_bytes(info.fileID),
1759
- **headerArgs,
1760
- )
1761
- info.version = headObj.get("VersionId", None)
1762
- logger.warning(
1763
- "Reloaded key with no version and got version %s",
1764
- str(info.version),
1765
- )
1766
- assert info.version is not None
1767
-
1768
- # Make sure we actually wrote something, even if an empty file
1769
- assert bool(info.version) or info.content is not None
1770
-
1771
- if multipart:
1772
- pipe = MultiPartPipe(encoding=encoding, errors=errors)
1773
- else:
1774
- pipe = SinglePartPipe(encoding=encoding, errors=errors)
968
+ # TODO: Maybe memoize the file read, subject to config field changes?
1775
969
 
1776
- with pipe as writable:
1777
- yield writable
1778
-
1779
- if not pipe.reader_done:
1780
- logger.debug(f"Version: {self.version} Content: {self.content}")
1781
- raise RuntimeError(
1782
- "Escaped context manager without written data being read!"
970
+ try:
971
+ config = self.config
972
+ except AttributeError:
973
+ # The config isn't set yet. This happens during resume(), when we
974
+ # need to get the encryption args to talk to the job store to
975
+ # download the config, before we have it.
976
+ return {}
977
+
978
+ if config is not None and config.sseKey:
979
+ with open(config.sseKey, 'r') as f:
980
+ sse_key = f.read()
981
+ if not len(sse_key) == 32: # TODO: regex
982
+ raise ValueError(
983
+ f'Check that {self.config.sseKey} '
984
+ f'is the path to a real SSE key. '
985
+ f'(Key length {len(sse_key)} != 32)'
1783
986
  )
987
+ return {'SSECustomerAlgorithm': 'AES256', 'SSECustomerKey': sse_key}
988
+ else:
989
+ return {}
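To make the shape of those arguments concrete, a hedged sketch of how SSE-C ExtraArgs like the ones returned above are consumed by boto3; the key-file path, bucket, and object key are invented, and the real code reads the key path from the Toil config instead.

from io import BytesIO
import boto3

client = boto3.client("s3")
with open("/run/secrets/toil-sse-key") as f:     # hypothetical 32-character key file
    sse_args = {"SSECustomerAlgorithm": "AES256", "SSECustomerKey": f.read()}

# The same customer-provided key must accompany the write and every later read.
client.upload_fileobj(
    Fileobj=BytesIO(b"hello"),
    Bucket="my-toil-jobstore--toil",
    Key="files/example",
    ExtraArgs=sse_args,
)
obj = client.get_object(Bucket="my-toil-jobstore--toil", Key="files/example", **sse_args)
print(obj["Body"].read())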
1784
990
 
1785
- # We check our work to make sure we have exactly one of embedded
1786
- # content or a real object version.
1787
-
1788
- if self.content is None:
1789
- if not bool(self.version):
1790
- logger.debug(f"Version: {self.version} Content: {self.content}")
1791
- raise RuntimeError("No content added and no version created")
1792
- else:
1793
- if bool(self.version):
1794
- logger.debug(f"Version: {self.version} Content: {self.content}")
1795
- raise RuntimeError("Content added and version created")
1796
-
1797
- def copyFrom(self, srcObj):
1798
- """
1799
- Copies contents of source key into this file.
1800
-
1801
- :param S3.Object srcObj: The key (object) that will be copied from
1802
- """
1803
- assert srcObj.content_length is not None
1804
- if srcObj.content_length <= self.maxInlinedSize():
1805
- self.content = srcObj.get().get("Body").read()
1806
- else:
1807
- # Create a new Resource in case it needs to be on its own thread
1808
- resource = boto3_session.resource("s3", region_name=self.outer.region)
1809
- self.version = copyKeyMultipart(
1810
- resource,
1811
- srcBucketName=compat_bytes(srcObj.bucket_name),
1812
- srcKeyName=compat_bytes(srcObj.key),
1813
- srcKeyVersion=compat_bytes(srcObj.version_id),
1814
- dstBucketName=compat_bytes(self.outer.files_bucket.name),
1815
- dstKeyName=compat_bytes(self._fileID),
1816
- sseAlgorithm="AES256",
1817
- sseKey=self._getSSEKey(),
1818
- )
991
+ def parse_jobstore_identifier(jobstore_identifier: str) -> Tuple[str, str]:
992
+ region, jobstore_name = jobstore_identifier.split(':')
993
+ bucket_name = f'{jobstore_name}--toil'
1819
994
 
1820
- def copyTo(self, dstObj):
1821
- """
1822
- Copies contents of this file to the given key.
1823
-
1824
- :param S3.Object dstObj: The key (object) to copy this file's content to
1825
- """
1826
- if self.content is not None:
1827
- for attempt in retry_s3():
1828
- with attempt:
1829
- dstObj.put(Body=self.content)
1830
- elif self.version:
1831
- # Create a new Resource in case it needs to be on its own thread
1832
- resource = boto3_session.resource("s3", region_name=self.outer.region)
1833
-
1834
- for attempt in retry_s3():
1835
- # encrypted = True if self.outer.sseKeyPath else False
1836
- with attempt:
1837
- copyKeyMultipart(
1838
- resource,
1839
- srcBucketName=compat_bytes(self.outer.files_bucket.name),
1840
- srcKeyName=compat_bytes(self.fileID),
1841
- srcKeyVersion=compat_bytes(self.version),
1842
- dstBucketName=compat_bytes(dstObj.bucket_name),
1843
- dstKeyName=compat_bytes(dstObj.key),
1844
- copySourceSseAlgorithm="AES256",
1845
- copySourceSseKey=self._getSSEKey(),
1846
- )
1847
- else:
1848
- assert False
1849
-
1850
- def download(self, localFilePath, verifyChecksum=True):
1851
- if self.content is not None:
1852
- with AtomicFileCreate(localFilePath) as tmpPath:
1853
- with open(tmpPath, "wb") as f:
1854
- f.write(self.content)
1855
- elif self.version:
1856
- headerArgs = self._s3EncryptionArgs()
1857
- obj = self.outer.files_bucket.Object(compat_bytes(self.fileID))
1858
-
1859
- for attempt in retry_s3(
1860
- predicate=lambda e: retryable_s3_errors(e)
1861
- or isinstance(e, ChecksumError)
1862
- ):
1863
- with attempt:
1864
- with AtomicFileCreate(localFilePath) as tmpPath:
1865
- obj.download_file(
1866
- Filename=tmpPath,
1867
- ExtraArgs={"VersionId": self.version, **headerArgs},
1868
- )
1869
-
1870
- if verifyChecksum and self.checksum:
1871
- try:
1872
- # This automatically compares the result and matches the algorithm.
1873
- self._get_file_checksum(localFilePath, self.checksum)
1874
- except ChecksumError as e:
1875
- # Annotate checksum mismatches with file name
1876
- raise ChecksumError(
1877
- "Checksums do not match for file %s."
1878
- % localFilePath
1879
- ) from e
1880
- # The error will get caught and result in a retry of the download until we run out of retries.
1881
- # TODO: handle obviously truncated downloads by resuming instead.
1882
- else:
1883
- assert False
1884
-
1885
- @contextmanager
1886
- def downloadStream(self, verifyChecksum=True, encoding=None, errors=None):
1887
- """
1888
- Context manager that gives out a download stream to download data.
1889
- """
1890
- info = self
1891
-
1892
- class DownloadPipe(ReadablePipe):
1893
- def writeTo(self, writable):
1894
- if info.content is not None:
1895
- writable.write(info.content)
1896
- elif info.version:
1897
- headerArgs = info._s3EncryptionArgs()
1898
- obj = info.outer.files_bucket.Object(compat_bytes(info.fileID))
1899
- for attempt in retry_s3():
1900
- with attempt:
1901
- obj.download_fileobj(
1902
- writable,
1903
- ExtraArgs={"VersionId": info.version, **headerArgs},
1904
- )
1905
- else:
1906
- assert False
1907
-
1908
- class HashingPipe(ReadableTransformingPipe):
1909
- """
1910
- Class which checksums all the data read through it. If it
1911
- reaches EOF and the checksum isn't correct, raises
1912
- ChecksumError.
1913
-
1914
- Assumes info actually has a checksum.
1915
- """
1916
-
1917
- def transform(self, readable, writable):
1918
- hasher = info._start_checksum(to_match=info.checksum)
1919
- contents = readable.read(1024 * 1024)
1920
- while contents != b"":
1921
- info._update_checksum(hasher, contents)
1922
- try:
1923
- writable.write(contents)
1924
- except BrokenPipeError:
1925
- # Read was stopped early by user code.
1926
- # Can't check the checksum.
1927
- return
1928
- contents = readable.read(1024 * 1024)
1929
- # We reached EOF in the input.
1930
- # Finish checksumming and verify.
1931
- info._finish_checksum(hasher)
1932
- # Now stop so EOF happens in the output.
1933
-
1934
- if verifyChecksum and self.checksum:
1935
- with DownloadPipe() as readable:
1936
- # Interpose a pipe to check the hash
1937
- with HashingPipe(
1938
- readable, encoding=encoding, errors=errors
1939
- ) as verified:
1940
- yield verified
1941
- else:
1942
- # Readable end of pipe produces text mode output if encoding specified
1943
- with DownloadPipe(encoding=encoding, errors=errors) as readable:
1944
- # No true checksum available, so don't hash
1945
- yield readable
1946
-
1947
- def delete(self):
1948
- store = self.outer
1949
- if self.previousVersion is not None:
1950
- expected: "UpdateConditionTypeDef" = {
1951
- "Name": "version",
1952
- "Value": cast(str, self.previousVersion),
1953
- }
1954
- for attempt in retry_sdb():
1955
- with attempt:
1956
- store.db.delete_attributes(
1957
- DomainName=store.files_domain_name,
1958
- ItemName=compat_bytes(self.fileID),
1959
- Expected=expected,
1960
- )
1961
- if self.previousVersion:
1962
- for attempt in retry_s3():
1963
- with attempt:
1964
- store.s3_client.delete_object(
1965
- Bucket=store.files_bucket.name,
1966
- Key=compat_bytes(self.fileID),
1967
- VersionId=self.previousVersion,
1968
- )
1969
-
1970
- def getSize(self):
1971
- """
1972
- Return the size of the referenced item in bytes.
1973
- """
1974
- if self.content is not None:
1975
- return len(self.content)
1976
- elif self.version:
1977
- for attempt in retry_s3():
1978
- with attempt:
1979
- obj = self.outer.files_bucket.Object(compat_bytes(self.fileID))
1980
- return obj.content_length
1981
- else:
1982
- return 0
1983
-
1984
- def _getSSEKey(self) -> Optional[bytes]:
1985
- sseKeyPath = self.outer.sseKeyPath
1986
- if sseKeyPath:
1987
- with open(sseKeyPath, "rb") as f:
1988
- sseKey = f.read()
1989
- return sseKey
1990
-
1991
- def _s3EncryptionArgs(self):
1992
- # the keys of the returned dictionary are unpacked to the corresponding boto3 optional
1993
- # parameters and will be used to set the http headers
1994
- if self.encrypted:
1995
- sseKey = self._getSSEKey()
1996
- assert (
1997
- sseKey is not None
1998
- ), "Content is encrypted but no key was provided."
1999
- assert len(sseKey) == 32
2000
- # boto3 encodes the key and calculates the MD5 for us
2001
- return {"SSECustomerAlgorithm": "AES256", "SSECustomerKey": sseKey}
2002
- else:
2003
- return {}
2004
-
2005
- def __repr__(self):
2006
- r = custom_repr
2007
- d = (
2008
- ("fileID", r(self.fileID)),
2009
- ("ownerID", r(self.ownerID)),
2010
- ("encrypted", r(self.encrypted)),
2011
- ("version", r(self.version)),
2012
- ("previousVersion", r(self.previousVersion)),
2013
- ("content", r(self.content)),
2014
- ("checksum", r(self.checksum)),
2015
- ("_numContentChunks", r(self._numContentChunks)),
2016
- )
2017
- return "{}({})".format(
2018
- type(self).__name__, ", ".join(f"{k}={v}" for k, v in d)
2019
- )
995
+ regions = EC2Regions.keys()
996
+ if region not in regions:
997
+ raise ValueError(f'AWS Region "{region}" is not one of: {regions}')
2020
998
 
2021
- versionings = dict(Enabled=True, Disabled=False, Suspended=None)
999
+ if not 3 <= len(jobstore_name) <= 56:
1000
+ raise ValueError(f'AWS jobstore name must be between 3 and 56 chars: '
1001
+ f'{jobstore_name} (len: {len(jobstore_name)})')
2022
1002
 
2023
- def _getBucketVersioning(self, bucket_name):
2024
- """
2025
- The status attribute of BucketVersioning can be 'Enabled', 'Suspended' or None (Disabled)
2026
- which we map to True, None and False respectively. Note that we've never seen a versioning
2027
- status of 'Disabled', only the None return value. Calling BucketVersioning.suspend() will
2028
- cause BucketVersioning.status to then return 'Suspended' even on a new bucket that never
2029
- had versioning enabled.
1003
+ if not re.compile(r'^[a-z0-9][a-z0-9-]+[a-z0-9]$').match(jobstore_name):
1004
+ raise ValueError(f"Invalid AWS jobstore name: '{jobstore_name}'. Must contain only digits, "
1005
+ f"lower-case letters, and hyphens. Must also not start or end in a hyphen.")
2030
1006
 
2031
- :param bucket_name: str
2032
- """
2033
- for attempt in retry_s3():
2034
- with attempt:
2035
- status = self.s3_resource.BucketVersioning(bucket_name).status
2036
- return self.versionings.get(status) if status else False
2037
-
2038
- # TODO: Make this retry more specific?
2039
- # example: https://github.com/DataBiosphere/toil/issues/3378
2040
- @retry()
2041
- def destroy(self):
2042
- # FIXME: Destruction of encrypted stores only works after initialize() or .resume()
2043
- # See https://github.com/BD2KGenomics/toil/issues/1041
2044
- try:
2045
- self._bind(create=False, block=False, check_versioning_consistency=False)
2046
- except BucketLocationConflictException:
2047
- # If the unique jobstore bucket name existed, _bind would have raised a
2048
- # BucketLocationConflictException before calling destroy. Calling _bind here again
2049
- # would reraise the same exception so we need to catch and ignore that exception.
2050
- pass
2051
- # TODO: Add other failure cases to be ignored here.
2052
- self._registered = None
2053
- if self.files_bucket is not None:
2054
- self._delete_bucket(self.files_bucket)
2055
- self.files_bucket = None
2056
- for name in "files_domain_name", "jobs_domain_name":
2057
- domainName = getattr(self, name)
2058
- if domainName is not None:
2059
- self._delete_domain(domainName)
2060
- setattr(self, name, None)
2061
- self._registered = False
2062
-
2063
- def _delete_domain(self, domainName):
2064
- for attempt in retry_sdb():
2065
- with attempt:
2066
- try:
2067
- self.db.delete_domain(DomainName=domainName)
2068
- except ClientError as e:
2069
- if not no_such_sdb_domain(e):
2070
- raise
2071
-
2072
- @staticmethod
2073
- def _delete_bucket(bucket):
2074
- """
2075
- :param bucket: S3.Bucket
2076
- """
2077
- for attempt in retry_s3():
2078
- with attempt:
2079
- try:
2080
- uploads = s3_boto3_client.list_multipart_uploads(
2081
- Bucket=bucket.name
2082
- ).get("Uploads")
2083
- if uploads:
2084
- for u in uploads:
2085
- s3_boto3_client.abort_multipart_upload(
2086
- Bucket=bucket.name, Key=u["Key"], UploadId=u["UploadId"]
2087
- )
2088
-
2089
- bucket.objects.all().delete()
2090
- bucket.object_versions.delete()
2091
- bucket.delete()
2092
- except s3_boto3_client.exceptions.NoSuchBucket:
2093
- pass
2094
- except ClientError as e:
2095
- if get_error_status(e) != 404:
2096
- raise
2097
-
2098
-
2099
- aRepr = reprlib.Repr()
2100
- aRepr.maxstring = 38 # so UUIDs don't get truncated (36 for UUID plus 2 for quotes)
2101
- custom_repr = aRepr.repr
2102
-
2103
-
2104
- class BucketLocationConflictException(LocatorException):
2105
- def __init__(self, bucketRegion):
2106
- super().__init__(
2107
- "A bucket with the same name as the jobstore was found in another region (%s). "
2108
- "Cannot proceed as the unique bucket name is already in use.",
2109
- locator=bucketRegion,
2110
- )
1007
+ if '--' in jobstore_name:
1008
+ raise ValueError(f"AWS jobstore names may not contain '--': {jobstore_name}")
1009
+ return region, bucket_name
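To close, a hedged usage sketch of the locator parsing above, calling it as a free function for illustration (in the class it may be bound as a static or class method): accepted locators look like region:name, and the returned bucket name carries a --toil suffix.

region, bucket = parse_jobstore_identifier("us-west-2:my-toil-run")
assert (region, bucket) == ("us-west-2", "my-toil-run--toil")

# Each of these trips one of the checks above and raises ValueError:
# an unknown region, a too-short name, an uppercase letter, and a double hyphen.
for bad in ("mars-1:my-run", "us-west-2:ab", "us-west-2:My-Run", "us-west-2:a--b"):
    try:
        parse_jobstore_identifier(bad)
    except ValueError as exc:
        print(exc)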