toil-9.0.0-py3-none-any.whl → toil-9.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/batchSystems/abstractBatchSystem.py +13 -5
- toil/batchSystems/abstractGridEngineBatchSystem.py +17 -5
- toil/batchSystems/kubernetes.py +13 -2
- toil/batchSystems/mesos/batchSystem.py +33 -2
- toil/batchSystems/slurm.py +191 -16
- toil/cwl/cwltoil.py +17 -82
- toil/fileStores/__init__.py +1 -1
- toil/fileStores/abstractFileStore.py +5 -2
- toil/fileStores/cachingFileStore.py +1 -1
- toil/job.py +30 -14
- toil/jobStores/abstractJobStore.py +24 -19
- toil/jobStores/aws/jobStore.py +862 -1963
- toil/jobStores/aws/utils.py +24 -270
- toil/jobStores/googleJobStore.py +25 -9
- toil/jobStores/utils.py +0 -327
- toil/leader.py +27 -22
- toil/lib/aws/config.py +22 -0
- toil/lib/aws/s3.py +477 -9
- toil/lib/aws/utils.py +22 -33
- toil/lib/checksum.py +88 -0
- toil/lib/conversions.py +33 -31
- toil/lib/directory.py +217 -0
- toil/lib/ec2.py +97 -29
- toil/lib/exceptions.py +2 -1
- toil/lib/expando.py +2 -2
- toil/lib/generatedEC2Lists.py +73 -16
- toil/lib/io.py +33 -2
- toil/lib/memoize.py +21 -7
- toil/lib/pipes.py +385 -0
- toil/lib/retry.py +1 -1
- toil/lib/threading.py +1 -1
- toil/lib/web.py +4 -5
- toil/provisioners/__init__.py +5 -2
- toil/provisioners/aws/__init__.py +43 -36
- toil/provisioners/aws/awsProvisioner.py +22 -13
- toil/provisioners/node.py +60 -12
- toil/resource.py +3 -13
- toil/test/__init__.py +14 -16
- toil/test/batchSystems/test_slurm.py +103 -14
- toil/test/cwl/staging_cat.cwl +27 -0
- toil/test/cwl/staging_make_file.cwl +25 -0
- toil/test/cwl/staging_workflow.cwl +43 -0
- toil/test/cwl/zero_default.cwl +61 -0
- toil/test/docs/scripts/tutorial_staging.py +17 -8
- toil/test/jobStores/jobStoreTest.py +23 -133
- toil/test/lib/aws/test_iam.py +7 -7
- toil/test/lib/aws/test_s3.py +30 -33
- toil/test/lib/aws/test_utils.py +9 -9
- toil/test/provisioners/aws/awsProvisionerTest.py +59 -6
- toil/test/src/autoDeploymentTest.py +2 -3
- toil/test/src/fileStoreTest.py +89 -87
- toil/test/utils/ABCWorkflowDebug/ABC.txt +1 -0
- toil/test/utils/ABCWorkflowDebug/debugWorkflow.py +4 -4
- toil/test/utils/toilKillTest.py +35 -28
- toil/test/wdl/md5sum/md5sum.json +1 -1
- toil/test/wdl/testfiles/gather.wdl +52 -0
- toil/test/wdl/wdltoil_test.py +120 -38
- toil/test/wdl/wdltoil_test_kubernetes.py +9 -0
- toil/utils/toilDebugFile.py +6 -3
- toil/utils/toilStats.py +17 -2
- toil/version.py +6 -6
- toil/wdl/wdltoil.py +1038 -549
- toil/worker.py +5 -2
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/METADATA +12 -12
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/RECORD +69 -61
- toil/lib/iterables.py +0 -112
- toil/test/docs/scripts/stagingExampleFiles/in.txt +0 -1
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/WHEEL +0 -0
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/entry_points.txt +0 -0
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/licenses/LICENSE +0 -0
- {toil-9.0.0.dist-info → toil-9.1.1.dist-info}/top_level.txt +0 -0
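
The most far-reaching change in this release is the rewrite of toil/jobStores/aws/jobStore.py from the SimpleDB-backed design to a pure-S3 key layout (diffed below). As an illustration only, the sketch below mimics the key-naming scheme visible in the new code's __init__ and _key_in_bucket; the prefix values and the prefix-plus-identifier rule are taken from the diff, while the snippet itself (names such as key_in_bucket) is not part of the package.

# Illustration (not from the package): how AWSJobStore maps identifiers to S3 keys,
# using the prefixes and the prefix+identifier rule shown in the jobStore.py diff below.
import uuid

JOB_PREFIX = "jobs/"                           # pickled JobDescription objects
JOB_ASSOCIATIONS_PREFIX = "job-associations/"  # empty marker objects named job_uuid.file_uuid
CONTENT_PREFIX = "files/"                      # job input/output files
SHARED_PREFIX = ""                             # shared files (config.pickle, environment.pickle) at bucket root
LOGS_PREFIX = "logs/"                          # job and leader logs

def key_in_bucket(identifier: str, prefix: str) -> str:
    # Mirrors AWSJobStore._key_in_bucket: a key is simply the prefix followed by the identifier.
    return f"{prefix}{identifier}"

job_id, file_id = str(uuid.uuid4()), str(uuid.uuid4())
print(key_in_bucket(job_id, JOB_PREFIX))                              # jobs/<job-uuid>
print(key_in_bucket(f"{job_id}.{file_id}", JOB_ASSOCIATIONS_PREFIX))  # job-associations/<job-uuid>.<file-uuid>
print(key_in_bucket(file_id, CONTENT_PREFIX) + "/input.txt")          # files/<file-uuid>/<original filename>
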
toil/jobStores/aws/jobStore.py
CHANGED
|
@@ -11,676 +11,804 @@
|
|
|
11
11
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
14
|
+
"""
|
|
15
|
+
This file contains the AWS jobstore, which has its own docstring defining its use.
|
|
16
|
+
|
|
17
|
+
This docstring is about the organization of the file.
|
|
18
|
+
|
|
19
|
+
All direct AWS boto calls should live in toil.lib.aws, except for creating the
|
|
20
|
+
session instance and the resource/client (which should only be made ONCE in the jobstore).
|
|
21
|
+
|
|
22
|
+
Reasons for this
|
|
23
|
+
- DRY.
|
|
24
|
+
- All retries are on their individual boto functions, instead of here.
|
|
25
|
+
- Simple clear functions => simple clear unit tests (ideally).
|
|
26
|
+
|
|
27
|
+
Variables defining part size, parallelization, and other constants should live in toil.lib.aws.config.
|
|
28
|
+
"""
|
|
17
29
|
import os
|
|
30
|
+
import json
|
|
31
|
+
import logging
|
|
18
32
|
import pickle
|
|
19
33
|
import re
|
|
20
|
-
import reprlib
|
|
21
34
|
import stat
|
|
22
|
-
import time
|
|
23
35
|
import uuid
|
|
24
|
-
|
|
25
|
-
|
|
36
|
+
import datetime
|
|
37
|
+
|
|
26
38
|
from io import BytesIO
|
|
27
|
-
from
|
|
28
|
-
from urllib.parse import ParseResult,
|
|
39
|
+
from contextlib import contextmanager
|
|
40
|
+
from urllib.parse import ParseResult, urlparse
|
|
41
|
+
from typing import (
|
|
42
|
+
ContextManager,
|
|
43
|
+
IO,
|
|
44
|
+
TYPE_CHECKING,
|
|
45
|
+
Optional,
|
|
46
|
+
Union,
|
|
47
|
+
cast,
|
|
48
|
+
Tuple,
|
|
49
|
+
Callable,
|
|
50
|
+
Dict,
|
|
51
|
+
Any,
|
|
52
|
+
Iterator,
|
|
53
|
+
Literal,
|
|
54
|
+
overload
|
|
55
|
+
)
|
|
29
56
|
|
|
57
|
+
# This file can't be imported if the AWS modules are not available.
|
|
30
58
|
from botocore.exceptions import ClientError
|
|
31
59
|
|
|
32
|
-
import toil.lib.encryption as encryption
|
|
33
60
|
from toil.fileStores import FileID
|
|
34
|
-
from toil.
|
|
35
|
-
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
LocatorException,
|
|
40
|
-
NoSuchFileException,
|
|
41
|
-
NoSuchJobException,
|
|
42
|
-
NoSuchJobStoreException,
|
|
43
|
-
)
|
|
44
|
-
from toil.jobStores.aws.utils import (
|
|
45
|
-
SDBHelper,
|
|
46
|
-
ServerSideCopyProhibitedError,
|
|
47
|
-
copyKeyMultipart,
|
|
48
|
-
fileSizeAndTime,
|
|
49
|
-
no_such_sdb_domain,
|
|
50
|
-
retry_sdb,
|
|
51
|
-
sdb_unavailable,
|
|
52
|
-
uploadFile,
|
|
53
|
-
uploadFromPath,
|
|
54
|
-
)
|
|
55
|
-
from toil.jobStores.utils import ReadablePipe, ReadableTransformingPipe, WritablePipe
|
|
56
|
-
from toil.lib.aws import build_tag_dict_from_env
|
|
57
|
-
from toil.lib.aws.session import establish_boto3_session
|
|
58
|
-
from toil.lib.aws.utils import (
|
|
59
|
-
NoBucketLocationError,
|
|
60
|
-
boto3_pager,
|
|
61
|
+
from toil.jobStores.abstractJobStore import (AbstractJobStore,
|
|
62
|
+
JobStoreExistsException,
|
|
63
|
+
NoSuchJobException,
|
|
64
|
+
NoSuchJobStoreException)
|
|
65
|
+
from toil.lib.aws.s3 import (
|
|
61
66
|
create_s3_bucket,
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
67
|
+
delete_s3_bucket,
|
|
68
|
+
bucket_exists,
|
|
69
|
+
copy_s3_to_s3,
|
|
70
|
+
copy_local_to_s3,
|
|
71
|
+
copy_s3_to_local,
|
|
72
|
+
parse_s3_uri,
|
|
73
|
+
MultiPartPipe,
|
|
74
|
+
list_s3_items,
|
|
75
|
+
upload_to_s3,
|
|
76
|
+
download_stream,
|
|
77
|
+
s3_key_exists,
|
|
78
|
+
head_s3_object,
|
|
79
|
+
get_s3_object,
|
|
80
|
+
put_s3_object,
|
|
81
|
+
create_public_url,
|
|
82
|
+
AWSKeyNotFoundError,
|
|
70
83
|
)
|
|
71
|
-
from toil.lib.
|
|
84
|
+
from toil.lib.aws.utils import get_object_for_url, list_objects_for_url
|
|
85
|
+
from toil.common import Config
|
|
86
|
+
from toil.jobStores.abstractJobStore import NoSuchFileException
|
|
72
87
|
from toil.lib.ec2nodes import EC2Regions
|
|
73
|
-
from toil.lib.
|
|
74
|
-
from toil.
|
|
75
|
-
from toil.lib.
|
|
76
|
-
from toil.
|
|
77
|
-
from toil.lib.retry import get_error_code, get_error_status, retry
|
|
88
|
+
from toil.lib.retry import get_error_status
|
|
89
|
+
from toil.version import version
|
|
90
|
+
from toil.lib.aws.session import establish_boto3_session
|
|
91
|
+
from toil.job import JobDescription, Job
|
|
78
92
|
from toil.lib.url import URLAccess
|
|
79
93
|
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
AttributeTypeDef,
|
|
83
|
-
DeletableItemTypeDef,
|
|
84
|
-
ItemTypeDef,
|
|
85
|
-
ReplaceableAttributeTypeDef,
|
|
86
|
-
ReplaceableItemTypeDef,
|
|
87
|
-
UpdateConditionTypeDef,
|
|
88
|
-
)
|
|
89
|
-
|
|
90
|
-
from toil import Config
|
|
91
|
-
|
|
92
|
-
boto3_session = establish_boto3_session()
|
|
93
|
-
s3_boto3_resource = boto3_session.resource("s3")
|
|
94
|
-
s3_boto3_client = boto3_session.client("s3")
|
|
94
|
+
|
|
95
|
+
DEFAULT_AWS_PART_SIZE = 52428800
|
|
95
96
|
logger = logging.getLogger(__name__)
|
|
96
97
|
|
|
97
|
-
# Sometimes we have to wait for multipart uploads to become real. How long
|
|
98
|
-
# should we wait?
|
|
99
|
-
CONSISTENCY_TICKS = 5
|
|
100
|
-
CONSISTENCY_TIME = 1
|
|
101
98
|
|
|
99
|
+
class AWSJobStore(AbstractJobStore, URLAccess):
|
|
100
|
+
"""
|
|
101
|
+
The AWS jobstore can be thought of as an AWS s3 bucket, with functions to
|
|
102
|
+
centralize, store, and track files for the workflow.
|
|
103
|
+
|
|
104
|
+
The AWS jobstore stores 4 things:
|
|
102
105
|
|
|
103
|
-
|
|
104
|
-
|
|
106
|
+
1. Jobs: These are pickled as files, and contain the information necessary to run a job when unpickled.
|
|
107
|
+
A job's file is deleted when finished, and its absence means it completed.
|
|
105
108
|
|
|
109
|
+
2. Files: The inputs and outputs of jobs. Each file is written in s3 with the file pattern:
|
|
110
|
+
"files/{uuid4}/{original_filename}", where the file prefix
|
|
111
|
+
"files/{uuid4}" should only point to one file.
|
|
112
|
+
3. Logs: The written log files of jobs that have run, plus the log file for the main Toil process.
|
|
106
113
|
|
|
107
|
-
|
|
108
|
-
|
|
114
|
+
4. Shared Files: Files with himan=-readable names, used by Toil itself or Python workflows.
|
|
115
|
+
These include:
|
|
109
116
|
|
|
110
|
-
|
|
111
|
-
super().__init__(f"Expected domain {domain_name} to exist!")
|
|
117
|
+
* environment.pickle (environment variables)
|
|
112
118
|
|
|
119
|
+
* config.pickle (user options)
|
|
113
120
|
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
+
* pid.log (process ID of the workflow; when it finishes, the workflow either succeeded/failed)
|
|
122
|
+
* userScript (hot deployment; this is the job module)
|
|
123
|
+
|
|
124
|
+
* rootJobReturnValue (workflow succeeded or not)
|
|
125
|
+
|
|
126
|
+
NOTES
|
|
127
|
+
- The AWS jobstore does not use a database (directly, at least) currently. We can get away with this because:
|
|
128
|
+
|
|
129
|
+
1. AWS s3 has strong consistency.
|
|
130
|
+
|
|
131
|
+
2. s3's filter/query speed is pretty good.
|
|
132
|
+
|
|
133
|
+
However, there may be reasons in the future to provide users with a database:
|
|
134
|
+
|
|
135
|
+
* s3 throttling has limits (3,500/5,000 requests (TODO: per
|
|
136
|
+
second?); something like dynamodb supports 100,000+ requests).
|
|
137
|
+
|
|
138
|
+
* Access and filtering would be sped up, though how much faster this would be needs testing.
|
|
139
|
+
|
|
140
|
+
ALSO NOTE: The caching filestore uses a local (per node) database with a very similar structure that maybe
|
|
141
|
+
could be synced up with this.
|
|
142
|
+
|
|
143
|
+
- TODO: Etags are s3's native checksum, so use that for file integrity checking since it's free when fetching
|
|
144
|
+
object headers from s3. Using an md5sum in addition to this would work well with the current filestore.
|
|
145
|
+
WARNING: Etag values differ for the same file when the part size changes, so part size should always
|
|
146
|
+
be Set In Stone, unless we hit s3's 10,000 part limit, and we need to account for that.
|
|
147
|
+
|
|
148
|
+
- This class fills in self.config only when initialized/restarted; it is None upon class instantiation. These
|
|
149
|
+
are the options/config set by the user. When jobs are loaded/unpickled, they must re-incorporate this.
|
|
150
|
+
|
|
151
|
+
- The config.sseKey field is the single source of truth for bucket encryption
|
|
152
|
+
status. The key is never stored inside this class; it is always read
|
|
153
|
+
from the file referenced by the config when needed. Modifying the config
|
|
154
|
+
at runtime will modify whether encryption is used. Note that files
|
|
155
|
+
written *without* encryption (i.e. config.pickle) can't be read when
|
|
156
|
+
encryption is enabled!
|
|
157
|
+
|
|
158
|
+
- TODO: In general, job stores should log the version of Toil they were
|
|
159
|
+
initialized with and warn the user if restarting with a different
|
|
160
|
+
version.
|
|
121
161
|
"""
|
|
162
|
+
def __init__(self, locator: str, partSize: int = DEFAULT_AWS_PART_SIZE) -> None:
|
|
163
|
+
super(AWSJobStore, self).__init__(locator)
|
|
164
|
+
# TODO: parsing of user options seems like it should be done outside of this class;
|
|
165
|
+
# pass in only the bucket name and region?
|
|
166
|
+
self.region, self.bucket_name = parse_jobstore_identifier(locator)
|
|
167
|
+
boto3_session = establish_boto3_session(region_name=self.region)
|
|
168
|
+
self.s3_resource = boto3_session.resource("s3")
|
|
169
|
+
self.s3_client = boto3_session.client("s3")
|
|
170
|
+
logger.info(f"Instantiating {self.__class__} with region: {self.region}")
|
|
171
|
+
self.part_size = DEFAULT_AWS_PART_SIZE # don't let users set the part size; it will throw off etag values
|
|
172
|
+
|
|
173
|
+
# created anew during self.initialize() or loaded using self.resume()
|
|
174
|
+
self.bucket = None
|
|
175
|
+
|
|
176
|
+
# pickled job files named with uuid4
|
|
177
|
+
self.job_key_prefix = 'jobs/'
|
|
178
|
+
# job-file associations; these are empty files mimicking a db w/naming convention: job_uuid4.file_uuid4
|
|
179
|
+
#
|
|
180
|
+
# TODO: a many-to-many system is implemented, but a simpler one-to-many
|
|
181
|
+
# system could be used, because each file should belong to at most one
|
|
182
|
+
# job. This should be changed to a hierarchical layout.
|
|
183
|
+
self.job_associations_key_prefix = 'job-associations/'
|
|
184
|
+
# input/output files named with uuid4
|
|
185
|
+
self.content_key_prefix = 'files/'
|
|
186
|
+
# these are special files, like 'environment.pickle'; place them in root
|
|
187
|
+
self.shared_key_prefix = ''
|
|
188
|
+
# read and unread; named with uuid4
|
|
189
|
+
self.logs_key_prefix = 'logs/'
|
|
190
|
+
|
|
191
|
+
###################################### CREATE/DESTROY JOBSTORE ######################################
|
|
192
|
+
|
|
193
|
+
def initialize(self, config: Config) -> None:
|
|
194
|
+
"""
|
|
195
|
+
Called when starting a new jobstore with a non-existent bucket.
|
|
122
196
|
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
197
|
+
Create bucket, raise if it already exists.
|
|
198
|
+
Set options from config.
|
|
199
|
+
"""
|
|
200
|
+
logger.debug(f"Instantiating {self.__class__} for region {self.region} with bucket: '{self.bucket_name}'")
|
|
201
|
+
if bucket_exists(self.s3_resource, self.bucket_name):
|
|
202
|
+
raise JobStoreExistsException(self.locator, 'aws')
|
|
203
|
+
self.bucket = create_s3_bucket(self.s3_resource, self.bucket_name, region=self.region) # type: ignore
|
|
204
|
+
super(AWSJobStore, self).initialize(config)
|
|
128
205
|
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
maxBucketNameLen = 63
|
|
133
|
-
maxNameLen = 10
|
|
134
|
-
nameSeparator = "--"
|
|
206
|
+
def resume(self) -> None:
|
|
207
|
+
"""
|
|
208
|
+
Called when reusing an old jobstore with an existing bucket.
|
|
135
209
|
|
|
136
|
-
|
|
210
|
+
:raise NoSuchJobStoreException: if the bucket doesn't exist.
|
|
137
211
|
"""
|
|
138
|
-
|
|
212
|
+
if not bucket_exists(self.s3_resource, self.bucket_name):
|
|
213
|
+
raise NoSuchJobStoreException(self.locator, 'aws')
|
|
214
|
+
# This sets self.config to not be None and loads the encryption key
|
|
215
|
+
# path from the unencrypted config. So it needs the bucket to exist to
|
|
216
|
+
# read from.
|
|
217
|
+
super(AWSJobStore, self).resume()
|
|
218
|
+
|
|
219
|
+
def destroy(self) -> None:
|
|
220
|
+
delete_s3_bucket(self.s3_resource, self.bucket_name)
|
|
139
221
|
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
222
|
+
###################################### BUCKET UTIL API ######################################
|
|
223
|
+
|
|
224
|
+
def _key_in_bucket(
|
|
225
|
+
self,
|
|
226
|
+
identifier: str,
|
|
227
|
+
prefix: str,
|
|
228
|
+
) -> str:
|
|
143
229
|
"""
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
)
|
|
168
|
-
logger.debug(
|
|
169
|
-
"Instantiating %s for region %s and name prefix '%s'",
|
|
170
|
-
self.__class__,
|
|
171
|
-
region,
|
|
172
|
-
namePrefix,
|
|
230
|
+
Get the key in the bucket for the given identifier and prefix.
|
|
231
|
+
|
|
232
|
+
We have this so higher-level code doesn't need to worry about the
|
|
233
|
+
pasting together of prefixes and identifiers, so it never ahs to be
|
|
234
|
+
mixed with the identifier=/prefix= calling convention.
|
|
235
|
+
"""
|
|
236
|
+
return f'{prefix}{identifier}'
|
|
237
|
+
|
|
238
|
+
def is_in_bucket(
|
|
239
|
+
self,
|
|
240
|
+
identifier: str,
|
|
241
|
+
prefix: str,
|
|
242
|
+
bucket: Optional[str] = None,
|
|
243
|
+
) -> bool:
|
|
244
|
+
"""
|
|
245
|
+
Check if the key for the given identifier and prefix is in the bucket.
|
|
246
|
+
"""
|
|
247
|
+
bucket = bucket or self.bucket_name
|
|
248
|
+
|
|
249
|
+
return s3_key_exists(
|
|
250
|
+
s3_resource=self.s3_resource,
|
|
251
|
+
bucket=bucket,
|
|
252
|
+
key=self._key_in_bucket(identifier=identifier, prefix=prefix),
|
|
253
|
+
extra_args=self._get_encryption_args()
|
|
173
254
|
)
|
|
174
|
-
self.region = region
|
|
175
|
-
self.name_prefix = namePrefix
|
|
176
|
-
self.part_size = partSize
|
|
177
|
-
self.jobs_domain_name: Optional[str] = None
|
|
178
|
-
self.files_domain_name: Optional[str] = None
|
|
179
|
-
self.files_bucket = None
|
|
180
|
-
self.db = boto3_session.client(service_name="sdb", region_name=region)
|
|
181
|
-
|
|
182
|
-
self.s3_resource = boto3_session.resource("s3", region_name=self.region)
|
|
183
|
-
self.s3_client = self.s3_resource.meta.client
|
|
184
|
-
|
|
185
|
-
def initialize(self, config: "Config") -> None:
|
|
186
|
-
if self._registered:
|
|
187
|
-
raise JobStoreExistsException(self.locator, "aws")
|
|
188
|
-
self._registered = None
|
|
189
|
-
try:
|
|
190
|
-
self._bind(create=True)
|
|
191
|
-
except:
|
|
192
|
-
with panic(logger):
|
|
193
|
-
self.destroy()
|
|
194
|
-
else:
|
|
195
|
-
super().initialize(config)
|
|
196
|
-
# Only register after job store has been full initialized
|
|
197
|
-
self._registered = True
|
|
198
255
|
|
|
199
|
-
@property
|
|
200
|
-
def sseKeyPath(self) -> Optional[str]:
|
|
201
|
-
return self.config.sseKey
|
|
202
256
|
|
|
203
|
-
def
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
257
|
+
def write_to_bucket(
|
|
258
|
+
self,
|
|
259
|
+
identifier: str,
|
|
260
|
+
prefix: str,
|
|
261
|
+
data: Optional[Union[bytes, str, Dict[str, Any]]],
|
|
262
|
+
bucket: Optional[str] = None,
|
|
263
|
+
encrypted: Optional[bool] = None,
|
|
264
|
+
) -> None:
|
|
265
|
+
"""
|
|
266
|
+
Write something directly to a bucket.
|
|
267
|
+
|
|
268
|
+
Use for small files. Does not parallelize or use multipart.
|
|
269
|
+
|
|
270
|
+
:param encrypted: Can be set to False to disable encryption.
|
|
271
|
+
"""
|
|
272
|
+
# only used if exporting to a URL
|
|
273
|
+
encryption_args = {} if encrypted is False else self._get_encryption_args()
|
|
274
|
+
bucket = bucket or self.bucket_name
|
|
275
|
+
|
|
276
|
+
if isinstance(data, dict):
|
|
277
|
+
data = json.dumps(data).encode('utf-8')
|
|
278
|
+
elif isinstance(data, str):
|
|
279
|
+
data = data.encode('utf-8')
|
|
280
|
+
elif data is None:
|
|
281
|
+
data = b''
|
|
282
|
+
|
|
283
|
+
assert isinstance(data, bytes)
|
|
284
|
+
put_s3_object(
|
|
285
|
+
s3_resource=self.s3_resource,
|
|
286
|
+
bucket=bucket,
|
|
287
|
+
key=self._key_in_bucket(identifier=identifier, prefix=prefix),
|
|
288
|
+
body=data,
|
|
289
|
+
extra_args=encryption_args,
|
|
290
|
+
)
|
|
208
291
|
|
|
209
|
-
def
|
|
292
|
+
def read_from_bucket(
|
|
210
293
|
self,
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
) ->
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
return self.name_prefix + self.nameSeparator + name
|
|
218
|
-
|
|
219
|
-
# The order in which this sequence of events happens is important. We can easily handle the
|
|
220
|
-
# inability to bind a domain, but it is a little harder to handle some cases of binding the
|
|
221
|
-
# jobstore bucket. Maintaining this order allows for an easier `destroy` method.
|
|
222
|
-
if self.jobs_domain_name is None:
|
|
223
|
-
self.jobs_domain_name = qualify("jobs")
|
|
224
|
-
self._bindDomain(self.jobs_domain_name, create=create, block=block)
|
|
225
|
-
if self.files_domain_name is None:
|
|
226
|
-
self.files_domain_name = qualify("files")
|
|
227
|
-
self._bindDomain(self.files_domain_name, create=create, block=block)
|
|
228
|
-
if self.files_bucket is None:
|
|
229
|
-
self.files_bucket = self._bindBucket(
|
|
230
|
-
qualify("files"),
|
|
231
|
-
create=create,
|
|
232
|
-
block=block,
|
|
233
|
-
versioning=True,
|
|
234
|
-
check_versioning_consistency=check_versioning_consistency,
|
|
235
|
-
)
|
|
294
|
+
identifier: str,
|
|
295
|
+
prefix: str,
|
|
296
|
+
bucket: Optional[str] = None,
|
|
297
|
+
) -> bytes:
|
|
298
|
+
"""
|
|
299
|
+
Read something directly from a bucket.
|
|
236
300
|
|
|
237
|
-
|
|
238
|
-
|
|
301
|
+
Use for small files. Does not parallelize or use multipart.
|
|
302
|
+
|
|
303
|
+
:raises NoSuchJobException: if the prefix is the job prefix and the
|
|
304
|
+
identifier is not found.
|
|
305
|
+
:raises NoSuchFileException: if the prefix is the content prefix and
|
|
306
|
+
the identifier is not found.
|
|
307
|
+
:raises self.s3_client.exceptions.NoSuchKey: in other cases where the
|
|
308
|
+
identifier is not found.
|
|
309
|
+
"""
|
|
310
|
+
bucket = bucket or self.bucket_name
|
|
311
|
+
|
|
312
|
+
try:
|
|
313
|
+
return get_s3_object(
|
|
314
|
+
s3_resource=self.s3_resource,
|
|
315
|
+
bucket=bucket,
|
|
316
|
+
key=self._key_in_bucket(identifier=identifier, prefix=prefix),
|
|
317
|
+
extra_args=self._get_encryption_args(),
|
|
318
|
+
)['Body'].read()
|
|
319
|
+
except self.s3_client.exceptions.NoSuchKey:
|
|
320
|
+
if prefix == self.job_key_prefix:
|
|
321
|
+
raise NoSuchJobException(identifier)
|
|
322
|
+
elif prefix == self.content_key_prefix:
|
|
323
|
+
raise NoSuchFileException(identifier)
|
|
324
|
+
else:
|
|
325
|
+
raise
|
|
326
|
+
except ClientError as e:
|
|
327
|
+
if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
|
|
328
|
+
if prefix == self.job_key_prefix:
|
|
329
|
+
raise NoSuchJobException(identifier)
|
|
330
|
+
elif prefix == self.content_key_prefix:
|
|
331
|
+
raise NoSuchFileException(identifier)
|
|
332
|
+
else:
|
|
333
|
+
raise
|
|
334
|
+
else:
|
|
335
|
+
raise
|
|
336
|
+
|
|
337
|
+
###################################### JOBS API ######################################
|
|
338
|
+
|
|
339
|
+
def assign_job_id(self, jobDescription: JobDescription) -> None:
|
|
340
|
+
jobDescription.jobStoreID = str(uuid.uuid4())
|
|
341
|
+
logger.debug("Assigning Job ID %s", jobDescription.jobStoreID)
|
|
342
|
+
|
|
343
|
+
def create_job(self, jobDescription: JobDescription) -> JobDescription:
|
|
344
|
+
"""
|
|
345
|
+
Pickle a jobDescription object and write it to the jobstore as a file.
|
|
346
|
+
|
|
347
|
+
Responsible for calling :meth:`toil.job.JobDescription.pre_update_hook`
|
|
348
|
+
on the job description.
|
|
239
349
|
"""
|
|
240
|
-
A optional boolean property indicating whether this job store is registered. The
|
|
241
|
-
registry is the authority on deciding if a job store exists or not. If True, this job
|
|
242
|
-
store exists, if None the job store is transitioning from True to False or vice versa,
|
|
243
|
-
if False the job store doesn't exist.
|
|
244
350
|
|
|
245
|
-
|
|
351
|
+
jobDescription.pre_update_hook()
|
|
352
|
+
|
|
353
|
+
self.write_to_bucket(identifier=str(jobDescription.jobStoreID),
|
|
354
|
+
prefix=self.job_key_prefix,
|
|
355
|
+
data=pickle.dumps(jobDescription, protocol=pickle.HIGHEST_PROTOCOL))
|
|
356
|
+
return jobDescription
|
|
357
|
+
|
|
358
|
+
def job_exists(self, job_id: str, check: bool = False) -> bool:
|
|
359
|
+
"""
|
|
360
|
+
Checks if the job_id is found in s3.
|
|
361
|
+
|
|
362
|
+
:param check: If True, raise an exception instead of returning false
|
|
363
|
+
when a job does not exist.
|
|
246
364
|
"""
|
|
247
|
-
# The weird mapping of the SDB item attribute value to the property value is due to
|
|
248
|
-
# backwards compatibility. 'True' becomes True, that's easy. Toil < 3.3.0 writes this at
|
|
249
|
-
# the end of job store creation. Absence of either the registry, the item or the
|
|
250
|
-
# attribute becomes False, representing a truly absent, non-existing job store. An
|
|
251
|
-
# attribute value of 'False', which is what Toil < 3.3.0 writes at the *beginning* of job
|
|
252
|
-
# store destruction, indicates a job store in transition, reflecting the fact that 3.3.0
|
|
253
|
-
# may leak buckets or domains even though the registry reports 'False' for them. We
|
|
254
|
-
# can't handle job stores that were partially created by 3.3.0, though.
|
|
255
|
-
registry_domain_name = "toil-registry"
|
|
256
365
|
try:
|
|
257
|
-
self.
|
|
258
|
-
|
|
366
|
+
self.s3_client.head_object(
|
|
367
|
+
Bucket=self.bucket_name,
|
|
368
|
+
Key=self._key_in_bucket(
|
|
369
|
+
identifier=job_id,
|
|
370
|
+
prefix=self.job_key_prefix,
|
|
371
|
+
),
|
|
372
|
+
**self._get_encryption_args()
|
|
259
373
|
)
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
) # the documentation says 'Attributes' should always exist, but this is not true
|
|
274
|
-
exists: Optional[str] = get_item_from_attributes(
|
|
275
|
-
attributes=attributes, name="exists"
|
|
276
|
-
)
|
|
277
|
-
if exists is None:
|
|
278
|
-
return False
|
|
279
|
-
elif exists == "True":
|
|
280
|
-
return True
|
|
281
|
-
elif exists == "False":
|
|
282
|
-
return None
|
|
283
|
-
else:
|
|
284
|
-
assert False
|
|
374
|
+
return True
|
|
375
|
+
except ClientError as e:
|
|
376
|
+
if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
|
|
377
|
+
if check:
|
|
378
|
+
raise NoSuchJobException(job_id)
|
|
379
|
+
else:
|
|
380
|
+
raise
|
|
381
|
+
except self.s3_client.exceptions.NoSuchKey:
|
|
382
|
+
if check:
|
|
383
|
+
raise NoSuchJobException(job_id)
|
|
384
|
+
else:
|
|
385
|
+
raise
|
|
386
|
+
return False
|
|
285
387
|
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
388
|
+
def jobs(self) -> Iterator[JobDescription]:
|
|
389
|
+
for result in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=self.job_key_prefix):
|
|
390
|
+
try:
|
|
391
|
+
job_id = result['Key'][len(self.job_key_prefix):] # strip self.job_key_prefix
|
|
392
|
+
yield self.load_job(job_id)
|
|
393
|
+
except NoSuchJobException:
|
|
394
|
+
# job may have been deleted between showing up in the list and getting loaded
|
|
395
|
+
pass
|
|
396
|
+
|
|
397
|
+
def load_job(self, job_id: str) -> JobDescription:
|
|
398
|
+
"""Use a job_id to get a job from the jobstore's s3 bucket, unpickle, and return it."""
|
|
289
399
|
try:
|
|
290
|
-
self.
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
400
|
+
job = pickle.loads(self.read_from_bucket(identifier=job_id, prefix=self.job_key_prefix))
|
|
401
|
+
except NoSuchJobException:
|
|
402
|
+
raise
|
|
403
|
+
|
|
404
|
+
if not isinstance(job, JobDescription):
|
|
405
|
+
raise RuntimeError(
|
|
406
|
+
f"While trying to load a JobDescription for {job_id}, got a {type(job)} instead!",
|
|
296
407
|
)
|
|
297
|
-
except DomainDoesNotExist:
|
|
298
|
-
pass
|
|
299
|
-
else:
|
|
300
|
-
for attempt in retry_sdb():
|
|
301
|
-
with attempt:
|
|
302
|
-
if value is False:
|
|
303
|
-
self.db.delete_attributes(
|
|
304
|
-
DomainName=registry_domain_name, ItemName=self.name_prefix
|
|
305
|
-
)
|
|
306
|
-
else:
|
|
307
|
-
if value is True:
|
|
308
|
-
attributes: list["ReplaceableAttributeTypeDef"] = [
|
|
309
|
-
{"Name": "exists", "Value": "True", "Replace": True}
|
|
310
|
-
]
|
|
311
|
-
elif value is None:
|
|
312
|
-
attributes = [
|
|
313
|
-
{"Name": "exists", "Value": "False", "Replace": True}
|
|
314
|
-
]
|
|
315
|
-
else:
|
|
316
|
-
assert False
|
|
317
|
-
self.db.put_attributes(
|
|
318
|
-
DomainName=registry_domain_name,
|
|
319
|
-
ItemName=self.name_prefix,
|
|
320
|
-
Attributes=attributes,
|
|
321
|
-
)
|
|
322
|
-
|
|
323
|
-
def _checkItem(self, item: "ItemTypeDef", enforce: bool = True) -> None:
|
|
324
|
-
"""
|
|
325
|
-
Make sure that the given SimpleDB item actually has the attributes we think it should.
|
|
326
408
|
|
|
327
|
-
|
|
409
|
+
# Now we know it's the right type
|
|
410
|
+
job.assignConfig(self.config)
|
|
411
|
+
return job
|
|
328
412
|
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
self._checkAttributes(item["Attributes"], enforce)
|
|
413
|
+
def update_job(self, jobDescription: JobDescription) -> None:
|
|
414
|
+
self.create_job(jobDescription)
|
|
332
415
|
|
|
333
|
-
def
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
416
|
+
def delete_job(self, job_id: str) -> None:
|
|
417
|
+
logger.debug("Deleting job %s", job_id)
|
|
418
|
+
|
|
419
|
+
# delete the actual job file
|
|
420
|
+
self.s3_client.delete_object(
|
|
421
|
+
Bucket=self.bucket_name,
|
|
422
|
+
Key=self._key_in_bucket(
|
|
423
|
+
identifier=job_id,
|
|
424
|
+
prefix=self.job_key_prefix,
|
|
341
425
|
)
|
|
342
|
-
|
|
343
|
-
raise RuntimeError(
|
|
344
|
-
"encountered SimpleDB entry missing required attribute "
|
|
345
|
-
"'overlargeID'; is your job store ancient?"
|
|
346
|
-
)
|
|
426
|
+
)
|
|
347
427
|
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
"""
|
|
354
|
-
self._checkAttributes(attributes)
|
|
355
|
-
overlarge_id_value = get_item_from_attributes(
|
|
356
|
-
attributes=attributes, name="overlargeID"
|
|
428
|
+
# delete any files marked as associated with the job
|
|
429
|
+
job_file_associations_to_delete = []
|
|
430
|
+
root_key = self._key_in_bucket(
|
|
431
|
+
identifier=job_id,
|
|
432
|
+
prefix=self.job_associations_key_prefix,
|
|
357
433
|
)
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
434
|
+
for associated_job_file in list_s3_items(self.s3_resource,
|
|
435
|
+
bucket=self.bucket_name,
|
|
436
|
+
prefix=root_key):
|
|
437
|
+
job_file_associations_to_delete.append(associated_job_file['Key'])
|
|
438
|
+
file_id = associated_job_file['Key'].split('.')[-1]
|
|
439
|
+
self.delete_file(file_id)
|
|
440
|
+
|
|
441
|
+
# delete the job-file association references (these are empty files the simply connect jobs to files)
|
|
442
|
+
for job_file_association in job_file_associations_to_delete:
|
|
443
|
+
self.s3_client.delete_object(Bucket=self.bucket_name, Key=f'{job_file_association}')
|
|
444
|
+
|
|
445
|
+
def associate_job_with_file(self, job_id: str, file_id: str) -> None:
|
|
446
|
+
# associate this job with this file; the file will be deleted when the job is
|
|
447
|
+
self.write_to_bucket(identifier=f'{job_id}.{file_id}', prefix=self.job_associations_key_prefix, data=None)
|
|
372
448
|
|
|
373
|
-
|
|
449
|
+
###################################### FILES API ######################################
|
|
450
|
+
|
|
451
|
+
def write_file(self, local_path: str, job_id: Optional[str] = None, cleanup: bool = False) -> FileID:
|
|
374
452
|
"""
|
|
375
|
-
|
|
376
|
-
|
|
453
|
+
Write a local file into the jobstore and return a file_id referencing it.
|
|
454
|
+
|
|
455
|
+
:param job_id:
|
|
456
|
+
If job_id AND cleanup are supplied, associate this file with that job. When the job is deleted, the
|
|
457
|
+
file will be deleted as well.
|
|
458
|
+
|
|
459
|
+
:param cleanup:
|
|
460
|
+
If job_id AND cleanup are supplied, associate this file with that job. When the job is deleted, the
|
|
461
|
+
file will be deleted as well.
|
|
462
|
+
TODO: we don't need cleanup; remove it and only use job_id
|
|
377
463
|
"""
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
384
|
-
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
|
|
388
|
-
|
|
389
|
-
|
|
390
|
-
|
|
391
|
-
|
|
464
|
+
# TODO: etag = compute_checksum_for_file(local_path, algorithm='etag')[len('etag$'):]
|
|
465
|
+
file_id = str(uuid.uuid4()) # mint a new file_id
|
|
466
|
+
file_attributes = os.stat(local_path)
|
|
467
|
+
size = file_attributes.st_size
|
|
468
|
+
executable = file_attributes.st_mode & stat.S_IXUSR != 0
|
|
469
|
+
|
|
470
|
+
if job_id and cleanup:
|
|
471
|
+
# associate this job with this file; then the file reference will be deleted when the job is
|
|
472
|
+
self.associate_job_with_file(job_id, file_id)
|
|
473
|
+
|
|
474
|
+
# Each file gets a prefix under which we put exactly one key, to hide
|
|
475
|
+
# metadata in the key.
|
|
476
|
+
prefix = self._key_in_bucket(
|
|
477
|
+
identifier=file_id,
|
|
478
|
+
prefix=self.content_key_prefix
|
|
479
|
+
)
|
|
392
480
|
|
|
393
|
-
|
|
394
|
-
|
|
481
|
+
copy_local_to_s3(
|
|
482
|
+
s3_resource=self.s3_resource,
|
|
483
|
+
local_file_path=local_path,
|
|
484
|
+
dst_bucket=self.bucket_name,
|
|
485
|
+
dst_key=f'{prefix}/{os.path.basename(local_path)}',
|
|
486
|
+
extra_args=self._get_encryption_args()
|
|
487
|
+
)
|
|
488
|
+
return FileID(file_id, size, executable)
|
|
395
489
|
|
|
396
|
-
|
|
490
|
+
def find_s3_key_from_file_id(self, file_id: str) -> str:
|
|
491
|
+
"""This finds an s3 key for which file_id is the prefix, and which already exists."""
|
|
492
|
+
prefix = self._key_in_bucket(
|
|
493
|
+
identifier=file_id,
|
|
494
|
+
prefix=self.content_key_prefix
|
|
495
|
+
)
|
|
496
|
+
s3_keys = [s3_item for s3_item in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=prefix)]
|
|
497
|
+
if len(s3_keys) == 0:
|
|
498
|
+
raise NoSuchFileException(file_id)
|
|
499
|
+
if len(s3_keys) > 1:
|
|
500
|
+
# There can be only one.
|
|
501
|
+
raise RuntimeError(f'File ID: {file_id} should be unique, but includes: {s3_keys}')
|
|
502
|
+
return s3_keys[0]['Key']
|
|
397
503
|
|
|
398
504
|
@contextmanager
|
|
399
|
-
def
|
|
400
|
-
self
|
|
401
|
-
|
|
402
|
-
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
]
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
jobDescription
|
|
415
|
-
)
|
|
416
|
-
for each_attribute in got_job_attributes:
|
|
417
|
-
new_attribute: "ReplaceableAttributeTypeDef" = {
|
|
418
|
-
"Name": each_attribute["Name"],
|
|
419
|
-
"Value": each_attribute["Value"],
|
|
420
|
-
"Replace": True,
|
|
421
|
-
}
|
|
422
|
-
item_attributes.append(new_attribute)
|
|
423
|
-
items.append({"Name": item_name, "Attributes": item_attributes})
|
|
424
|
-
|
|
425
|
-
for attempt in retry_sdb():
|
|
426
|
-
with attempt:
|
|
427
|
-
self.db.batch_put_attributes(
|
|
428
|
-
DomainName=self.jobs_domain_name, Items=items
|
|
429
|
-
)
|
|
430
|
-
self._batchedUpdates = None
|
|
431
|
-
|
|
432
|
-
def assign_job_id(self, job_description: JobDescription) -> None:
|
|
433
|
-
jobStoreID = self._new_job_id()
|
|
434
|
-
logger.debug("Assigning ID to job %s", jobStoreID)
|
|
435
|
-
job_description.jobStoreID = jobStoreID
|
|
436
|
-
|
|
437
|
-
def create_job(self, job_description: JobDescription) -> JobDescription:
|
|
438
|
-
if hasattr(self, "_batchedUpdates") and self._batchedUpdates is not None:
|
|
439
|
-
self._batchedUpdates.append(job_description)
|
|
440
|
-
else:
|
|
441
|
-
self.update_job(job_description)
|
|
442
|
-
return job_description
|
|
443
|
-
|
|
444
|
-
def job_exists(self, job_id: Union[bytes, str]) -> bool:
|
|
445
|
-
for attempt in retry_sdb():
|
|
446
|
-
with attempt:
|
|
447
|
-
return (
|
|
448
|
-
len(
|
|
449
|
-
self.db.get_attributes(
|
|
450
|
-
DomainName=self.jobs_domain_name,
|
|
451
|
-
ItemName=compat_bytes(job_id),
|
|
452
|
-
AttributeNames=[SDBHelper.presenceIndicator()],
|
|
453
|
-
ConsistentRead=True,
|
|
454
|
-
).get("Attributes", [])
|
|
455
|
-
)
|
|
456
|
-
> 0
|
|
457
|
-
)
|
|
458
|
-
|
|
459
|
-
def jobs(self) -> Generator[Job, None, None]:
|
|
460
|
-
job_items: Optional[list["ItemTypeDef"]] = None
|
|
461
|
-
for attempt in retry_sdb():
|
|
462
|
-
with attempt:
|
|
463
|
-
job_items = boto3_pager(
|
|
464
|
-
self.db.select,
|
|
465
|
-
"Items",
|
|
466
|
-
ConsistentRead=True,
|
|
467
|
-
SelectExpression="select * from `%s`" % self.jobs_domain_name,
|
|
468
|
-
)
|
|
469
|
-
assert job_items is not None
|
|
470
|
-
for jobItem in job_items:
|
|
471
|
-
yield self._awsJobFromItem(jobItem)
|
|
472
|
-
|
|
473
|
-
def load_job(self, job_id: FileID) -> Job:
|
|
474
|
-
item_attributes = None
|
|
475
|
-
for attempt in retry_sdb():
|
|
476
|
-
with attempt:
|
|
477
|
-
item_attributes = self.db.get_attributes(
|
|
478
|
-
DomainName=self.jobs_domain_name,
|
|
479
|
-
ItemName=compat_bytes(job_id),
|
|
480
|
-
ConsistentRead=True,
|
|
481
|
-
).get("Attributes", [])
|
|
482
|
-
if not item_attributes:
|
|
483
|
-
raise NoSuchJobException(job_id)
|
|
484
|
-
job = self._awsJobFromAttributes(item_attributes)
|
|
485
|
-
if job is None:
|
|
486
|
-
raise NoSuchJobException(job_id)
|
|
487
|
-
logger.debug("Loaded job %s", job_id)
|
|
488
|
-
return job
|
|
505
|
+
def write_file_stream(
|
|
506
|
+
self,
|
|
507
|
+
job_id: Optional[str] = None,
|
|
508
|
+
cleanup: bool = False,
|
|
509
|
+
basename: Optional[str] = None,
|
|
510
|
+
encoding: Optional[str] = None,
|
|
511
|
+
errors: Optional[str] = None,
|
|
512
|
+
) -> Iterator[tuple[IO[bytes], str]]:
|
|
513
|
+
file_id = str(uuid.uuid4())
|
|
514
|
+
if job_id and cleanup:
|
|
515
|
+
self.associate_job_with_file(job_id, file_id)
|
|
516
|
+
prefix = self._key_in_bucket(
|
|
517
|
+
identifier=file_id,
|
|
518
|
+
prefix=self.content_key_prefix
|
|
519
|
+
)
|
|
489
520
|
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
with attempt:
|
|
500
|
-
self.db.put_attributes(
|
|
501
|
-
DomainName=self.jobs_domain_name,
|
|
502
|
-
ItemName=compat_bytes(job_description.jobStoreID),
|
|
503
|
-
Attributes=update_attributes,
|
|
504
|
-
)
|
|
521
|
+
pipe = MultiPartPipe(part_size=self.part_size,
|
|
522
|
+
s3_resource=self.s3_resource,
|
|
523
|
+
bucket_name=self.bucket_name,
|
|
524
|
+
file_id=f'{prefix}/{str(basename)}',
|
|
525
|
+
encryption_args=self._get_encryption_args(),
|
|
526
|
+
encoding=encoding,
|
|
527
|
+
errors=errors)
|
|
528
|
+
with pipe as writable:
|
|
529
|
+
yield writable, file_id
|
|
505
530
|
|
|
506
|
-
|
|
531
|
+
@contextmanager
|
|
532
|
+
def update_file_stream(
|
|
533
|
+
self,
|
|
534
|
+
file_id: str,
|
|
535
|
+
encoding: Optional[str] = None,
|
|
536
|
+
errors: Optional[str] = None
|
|
537
|
+
) -> Iterator[IO[Any]]:
|
|
538
|
+
logger.debug("Replacing file %s via multipart upload", file_id)
|
|
539
|
+
pipe = MultiPartPipe(
|
|
540
|
+
part_size=self.part_size,
|
|
541
|
+
s3_resource=self.s3_resource,
|
|
542
|
+
bucket_name=self.bucket_name,
|
|
543
|
+
file_id=self.find_s3_key_from_file_id(file_id),
|
|
544
|
+
encryption_args=self._get_encryption_args(),
|
|
545
|
+
encoding=encoding,
|
|
546
|
+
errors=errors,
|
|
547
|
+
)
|
|
548
|
+
with pipe as writable:
|
|
549
|
+
yield writable
|
|
507
550
|
|
|
508
|
-
|
|
509
|
-
|
|
510
|
-
|
|
551
|
+
@contextmanager
|
|
552
|
+
def write_shared_file_stream(
|
|
553
|
+
self,
|
|
554
|
+
shared_file_name: str,
|
|
555
|
+
encrypted: Optional[bool] = None,
|
|
556
|
+
encoding: Optional[str] = None,
|
|
557
|
+
errors: Optional[str] = None,
|
|
558
|
+
) -> Iterator[IO[bytes]]:
|
|
559
|
+
encryption_args = {} if encrypted is False else self._get_encryption_args()
|
|
560
|
+
pipe = MultiPartPipe(
|
|
561
|
+
part_size=self.part_size,
|
|
562
|
+
s3_resource=self.s3_resource,
|
|
563
|
+
bucket_name=self.bucket_name,
|
|
564
|
+
file_id=self._key_in_bucket(
|
|
565
|
+
identifier=shared_file_name,
|
|
566
|
+
prefix=self.shared_key_prefix,
|
|
567
|
+
),
|
|
568
|
+
encryption_args=encryption_args,
|
|
569
|
+
encoding=encoding,
|
|
570
|
+
errors=errors,
|
|
571
|
+
)
|
|
572
|
+
with pipe as writable:
|
|
573
|
+
yield writable
|
|
511
574
|
|
|
512
|
-
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
|
|
518
|
-
|
|
519
|
-
).get("Attributes", [])
|
|
520
|
-
# If the overlargeID has fallen off, maybe we partially deleted the
|
|
521
|
-
# attributes of the item? Or raced on it? Or hit SimpleDB being merely
|
|
522
|
-
# eventually consistent? We should still be able to get rid of it.
|
|
523
|
-
self._checkAttributes(attributes, enforce=False)
|
|
524
|
-
overlarge_id_value = get_item_from_attributes(
|
|
525
|
-
attributes=attributes, name="overlargeID"
|
|
575
|
+
def update_file(self, file_id: str, local_path: str) -> None:
|
|
576
|
+
copy_local_to_s3(
|
|
577
|
+
s3_resource=self.s3_resource,
|
|
578
|
+
local_file_path=local_path,
|
|
579
|
+
dst_bucket=self.bucket_name,
|
|
580
|
+
dst_key=self.find_s3_key_from_file_id(file_id),
|
|
581
|
+
extra_args=self._get_encryption_args()
|
|
526
582
|
)
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
583
|
+
|
|
584
|
+
def file_exists(self, file_id: str) -> bool:
|
|
585
|
+
try:
|
|
586
|
+
# This throws if the file doesn't exist.
|
|
587
|
+
self.find_s3_key_from_file_id(file_id)
|
|
588
|
+
except NoSuchFileException:
|
|
589
|
+
# It didn't exist
|
|
590
|
+
return False
|
|
591
|
+
return True
|
|
592
|
+
|
|
593
|
+
def get_file_size(self, file_id: str) -> int:
|
|
594
|
+
"""Do we need both get_file_size and _get_size???"""
|
|
595
|
+
full_s3_key = self.find_s3_key_from_file_id(file_id)
|
|
596
|
+
return self._get_size(url=urlparse(f's3://{self.bucket_name}/{full_s3_key}')) or 0
|
|
597
|
+
|
|
598
|
+
@classmethod
|
|
599
|
+
def _get_size(cls, url: ParseResult) -> Optional[int]:
|
|
600
|
+
"""Do we need both get_file_size and _get_size???"""
|
|
601
|
+
try:
|
|
602
|
+
return get_object_for_url(url, existing=True).content_length
|
|
603
|
+
except (AWSKeyNotFoundError, NoSuchFileException):
|
|
604
|
+
return 0
|
|
605
|
+
|
|
606
|
+
def read_file(self, file_id: str, local_path: str, symlink: bool = False) -> None:
|
|
607
|
+
full_s3_key = self.find_s3_key_from_file_id(file_id)
|
|
608
|
+
executable = getattr(file_id, "executable", False)
|
|
609
|
+
try:
|
|
610
|
+
copy_s3_to_local(
|
|
611
|
+
s3_resource=self.s3_resource,
|
|
612
|
+
local_file_path=local_path,
|
|
613
|
+
src_bucket=self.bucket_name,
|
|
614
|
+
src_key=full_s3_key,
|
|
615
|
+
extra_args=self._get_encryption_args()
|
|
550
616
|
)
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
DomainName=self.files_domain_name, Items=delete_items
|
|
561
|
-
)
|
|
562
|
-
for item in items:
|
|
563
|
-
item: "ItemTypeDef"
|
|
564
|
-
version = get_item_from_attributes(
|
|
565
|
-
attributes=item["Attributes"], name="version"
|
|
566
|
-
)
|
|
567
|
-
for attempt in retry_s3():
|
|
568
|
-
with attempt:
|
|
569
|
-
if version:
|
|
570
|
-
self.s3_client.delete_object(
|
|
571
|
-
Bucket=self.files_bucket.name,
|
|
572
|
-
Key=compat_bytes(item["Name"]),
|
|
573
|
-
VersionId=version,
|
|
574
|
-
)
|
|
575
|
-
else:
|
|
576
|
-
self.s3_client.delete_object(
|
|
577
|
-
Bucket=self.files_bucket.name,
|
|
578
|
-
Key=compat_bytes(item["Name"]),
|
|
579
|
-
)
|
|
617
|
+
if executable:
|
|
618
|
+
os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR)
|
|
619
|
+
except self.s3_client.exceptions.NoSuchKey:
|
|
620
|
+
raise NoSuchFileException(file_id)
|
|
621
|
+
except ClientError as e:
|
|
622
|
+
if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
|
|
623
|
+
raise NoSuchFileException(file_id)
|
|
624
|
+
else:
|
|
625
|
+
raise
|
|
580
626
|
|
|
581
|
-
|
|
582
|
-
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
|
|
587
|
-
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
|
|
627
|
+
@contextmanager # type: ignore
|
|
628
|
+
def read_file_stream( # type: ignore
|
|
629
|
+
self,
|
|
630
|
+
file_id: Union[FileID, str],
|
|
631
|
+
encoding: Optional[str] = None,
|
|
632
|
+
errors: Optional[str] = None,
|
|
633
|
+
) -> Union[ContextManager[IO[bytes]], ContextManager[IO[str]]]:
|
|
634
|
+
full_s3_key = self.find_s3_key_from_file_id(file_id)
|
|
635
|
+
try:
|
|
636
|
+
with download_stream(self.s3_resource,
|
|
637
|
+
bucket=self.bucket_name,
|
|
638
|
+
key=full_s3_key,
|
|
639
|
+
extra_args=self._get_encryption_args(),
|
|
640
|
+
encoding=encoding,
|
|
641
|
+
errors=errors) as readable:
|
|
642
|
+
yield readable
|
|
643
|
+
except self.s3_client.exceptions.NoSuchKey:
|
|
644
|
+
raise NoSuchFileException(file_id)
|
|
645
|
+
except ClientError as e:
|
|
646
|
+
if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
|
|
647
|
+
raise NoSuchFileException(file_id)
|
|
648
|
+
else:
|
|
649
|
+
raise
|
|
650
|
+
|
|
651
|
+
@overload
|
|
652
|
+
@contextmanager
|
|
653
|
+
def read_shared_file_stream(
|
|
654
|
+
self,
|
|
655
|
+
shared_file_name: str,
|
|
656
|
+
encoding: str,
|
|
657
|
+
errors: Optional[str] = None,
|
|
658
|
+
) -> Iterator[IO[str]]: ...
|
|
659
|
+
|
|
660
|
+
@overload
|
|
661
|
+
@contextmanager
|
|
662
|
+
def read_shared_file_stream(
|
|
663
|
+
self,
|
|
664
|
+
shared_file_name: str,
|
|
665
|
+
encoding: Literal[None] = None,
|
|
666
|
+
errors: Optional[str] = None,
|
|
667
|
+
) -> Iterator[IO[bytes]]: ...
|
|
668
|
+
|
|
669
|
+
@contextmanager
|
|
670
|
+
def read_shared_file_stream(
|
|
671
|
+
self,
|
|
672
|
+
shared_file_name: str,
|
|
673
|
+
encoding: Optional[str] = None,
|
|
674
|
+
errors: Optional[str] = None,
|
|
675
|
+
) -> Iterator[Union[IO[bytes], IO[str]]]:
|
|
676
|
+
self._requireValidSharedFileName(shared_file_name)
|
|
677
|
+
key = self._key_in_bucket(identifier=shared_file_name, prefix=self.shared_key_prefix)
|
|
678
|
+
if not self.is_in_bucket(
|
|
679
|
+
identifier=shared_file_name,
|
|
680
|
+
prefix=self.shared_key_prefix,
|
|
681
|
+
):
|
|
682
|
+
# TRAVIS=true TOIL_OWNER_TAG="shared" /home/quokka/git/toil/v3nv/bin/python -m pytest --durations=0 --log-level DEBUG --log-cli-level INFO -r s /home/quokka/git/toil/src/toil/test/jobStores/jobStoreTest.py::EncryptedAWSJobStoreTest::testJobDeletions
|
|
683
|
+
# throw NoSuchFileException in download_stream
|
|
684
|
+
raise NoSuchFileException(f's3://{self.bucket_name}/{key}')
|
|
685
|
+
|
|
686
|
+
try:
|
|
687
|
+
with download_stream(self.s3_resource,
|
|
688
|
+
bucket=self.bucket_name,
|
|
689
|
+
key=key,
|
|
690
|
+
encoding=encoding,
|
|
691
|
+
errors=errors,
|
|
692
|
+
extra_args=self._get_encryption_args()) as readable:
|
|
693
|
+
yield readable
|
|
694
|
+
except self.s3_client.exceptions.NoSuchKey:
|
|
695
|
+
raise NoSuchFileException(shared_file_name)
|
|
696
|
+
except ClientError as e:
|
|
697
|
+
if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
|
|
698
|
+
raise NoSuchFileException(shared_file_name)
|
|
699
|
+
else:
|
|
700
|
+
raise
|
|
701
|
+
|
|
702
|
+
def delete_file(self, file_id: str) -> None:
|
|
703
|
+
try:
|
|
704
|
+
full_s3_key = self.find_s3_key_from_file_id(file_id)
|
|
705
|
+
except NoSuchFileException:
|
|
706
|
+
# The file is gone. That's great, we're idempotent.
|
|
707
|
+
return
|
|
708
|
+
self.s3_client.delete_object(Bucket=self.bucket_name, Key=full_s3_key)
|
|
709
|
+
|
|
710
|
+
###################################### URI API ######################################
|
|
591
711
|
|
|
592
712
|
def _import_file(
|
|
593
713
|
self,
|
|
594
|
-
otherCls,
|
|
714
|
+
otherCls: type[URLAccess],
|
|
595
715
|
uri: ParseResult,
|
|
596
716
|
shared_file_name: Optional[str] = None,
|
|
597
717
|
hardlink: bool = False,
|
|
598
718
|
symlink: bool = True,
|
|
599
719
|
) -> Optional[FileID]:
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
|
|
604
|
-
|
|
605
|
-
|
|
606
|
-
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
|
|
610
|
-
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
720
|
+
"""
|
|
721
|
+
Upload a file into the s3 bucket jobstore from the source uri.
|
|
722
|
+
|
|
723
|
+
This db entry's existence should always be in sync with the file's existence (when one exists,
|
|
724
|
+
so must the other).
|
|
725
|
+
"""
|
|
726
|
+
# we are copying from s3 to s3
|
|
727
|
+
if isinstance(otherCls, AWSJobStore):
|
|
728
|
+
src_bucket_name, src_key_name = parse_s3_uri(uri.geturl())
|
|
729
|
+
response = head_s3_object(self.s3_resource, bucket=src_bucket_name, key=src_key_name, check=True)
|
|
730
|
+
content_length = response['ContentLength'] # e.g. 65536
|
|
731
|
+
|
|
732
|
+
file_id = str(uuid.uuid4())
|
|
733
|
+
if shared_file_name:
|
|
734
|
+
dst_key = self._key_in_bucket(identifier=shared_file_name, prefix=self.shared_key_prefix)
|
|
735
|
+
else:
|
|
736
|
+
# cannot determine exec bit from foreign s3 so default to False
|
|
737
|
+
dst_key = "/".join([
|
|
738
|
+
self._key_in_bucket(identifier=file_id, prefix=self.content_key_prefix),
|
|
739
|
+
src_key_name.split("/")[-1],
|
|
740
|
+
])
|
|
741
|
+
|
|
742
|
+
copy_s3_to_s3(
|
|
743
|
+
s3_resource=self.s3_resource,
|
|
744
|
+
src_bucket=src_bucket_name,
|
|
745
|
+
src_key=src_key_name,
|
|
746
|
+
dst_bucket=self.bucket_name,
|
|
747
|
+
dst_key=dst_key,
|
|
748
|
+
extra_args=self._get_encryption_args()
|
|
621
749
|
)
|
|
750
|
+
# TODO: verify etag after copying here?
|
|
622
751
|
|
|
623
|
-
|
|
624
|
-
|
|
752
|
+
return FileID(file_id, content_length) if not shared_file_name else None
|
|
753
|
+
else:
|
|
754
|
+
return super(AWSJobStore, self)._import_file(
|
|
755
|
+
otherCls=otherCls,
|
|
756
|
+
uri=uri,
|
|
757
|
+
shared_file_name=shared_file_name,
|
|
758
|
+
hardlink=hardlink,
|
|
759
|
+
symlink=symlink
|
|
760
|
+
)
|
|
625
761
|
|
|
626
|
-
def _export_file(
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
762
|
+
def _export_file(
|
|
763
|
+
self, otherCls: type[URLAccess], jobStoreFileID: FileID, url: ParseResult
|
|
764
|
+
) -> None:
|
|
765
|
+
"""Export a file_id in the jobstore to the url."""
|
|
766
|
+
if isinstance(otherCls, AWSJobStore):
|
|
767
|
+
src_full_s3_key = self.find_s3_key_from_file_id(jobStoreFileID)
|
|
768
|
+
dst_bucket_name, dst_key_name = parse_s3_uri(url.geturl())
|
|
769
|
+
copy_s3_to_s3(
|
|
770
|
+
s3_resource=self.s3_resource,
|
|
771
|
+
src_bucket=self.bucket_name,
|
|
772
|
+
src_key=src_full_s3_key,
|
|
773
|
+
dst_bucket=dst_bucket_name,
|
|
774
|
+
dst_key=dst_key_name,
|
|
775
|
+
extra_args=self._get_encryption_args()
|
|
637
776
|
)
|
|
638
777
|
else:
|
|
639
|
-
super()._default_export_file(otherCls,
|
|
778
|
+
super(AWSJobStore, self)._default_export_file(otherCls, jobStoreFileID, url)
|
|
640
779
|
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
780
|
+
@classmethod
|
|
781
|
+
def _read_from_url(
|
|
782
|
+
cls, url: ParseResult, writable: Union[IO[bytes], IO[str]]
|
|
783
|
+
) -> tuple[int, bool]:
|
|
784
|
+
src_obj = get_object_for_url(url, existing=True)
|
|
785
|
+
src_obj.download_fileobj(writable)
|
|
786
|
+
executable = False
|
|
787
|
+
return src_obj.content_length, executable
|
|
644
788
|
|
|
645
|
-
|
|
789
|
+
@classmethod
|
|
790
|
+
def _write_to_url(
|
|
791
|
+
cls,
|
|
792
|
+
readable: Union[IO[bytes], IO[str]],
|
|
793
|
+
url: ParseResult,
|
|
794
|
+
executable: bool = False,
|
|
795
|
+
) -> None:
|
|
796
|
+
dst_obj = get_object_for_url(url)
|
|
797
|
+
upload_to_s3(readable=readable,
|
|
798
|
+
s3_resource=establish_boto3_session().resource("s3"),
|
|
799
|
+
bucket=dst_obj.bucket_name,
|
|
800
|
+
key=dst_obj.key)
|
|
646
801
|
|
|
647
802
|
@classmethod
|
|
648
803
|
def _url_exists(cls, url: ParseResult) -> bool:
|
|
649
804
|
try:
|
|
650
|
-
|
|
651
|
-
get_object_for_url(url, existing=True, anonymous=True)
|
|
652
|
-
except PermissionError:
|
|
653
|
-
# If we can't look anonymously, log in
|
|
654
|
-
get_object_for_url(url, existing=True)
|
|
805
|
+
get_object_for_url(url, existing=True)
|
|
655
806
|
return True
|
|
656
807
|
except FileNotFoundError:
|
|
657
808
|
# Not a file
|
|
658
|
-
# Might be a directory.
|
|
659
|
-
# See if it's a directory.
|
|
809
|
+
# Might be a directory.
|
|
660
810
|
return cls._get_is_directory(url)
|
|
661
811
|
|
|
662
|
-
@classmethod
|
|
663
|
-
def _get_size(cls, url: ParseResult) -> int:
|
|
664
|
-
try:
|
|
665
|
-
src_obj = get_object_for_url(url, existing=True, anonymous=True)
|
|
666
|
-
except PermissionError:
|
|
667
|
-
src_obj = get_object_for_url(url, existing=True)
|
|
668
|
-
return src_obj.content_length
|
|
669
|
-
|
|
670
|
-
@classmethod
|
|
671
|
-
def _read_from_url(cls, url: ParseResult, writable):
|
|
672
|
-
try:
|
|
673
|
-
src_obj = get_object_for_url(url, existing=True, anonymous=True)
|
|
674
|
-
src_obj.download_fileobj(writable)
|
|
675
|
-
except Exception as e:
|
|
676
|
-
if isinstance(e, PermissionError) or (isinstance(e, ClientError) and get_error_status(e) == 403):
|
|
677
|
-
# The object setup or the download does not have permission. Try again with a login.
|
|
678
|
-
src_obj = get_object_for_url(url, existing=True)
|
|
679
|
-
src_obj.download_fileobj(writable)
|
|
680
|
-
else:
|
|
681
|
-
raise
|
|
682
|
-
return (src_obj.content_length, False) # executable bit is always False
|
|
683
|
-
|
|
684
812
|
@classmethod
|
|
685
813
|
def _open_url(cls, url: ParseResult) -> IO[bytes]:
|
|
686
814
|
try:
|
|
@@ -696,1415 +824,186 @@ class AWSJobStore(AbstractJobStore, URLAccess):
|
|
|
696
824
|
# We should get back a response with a stream in 'Body'
|
|
697
825
|
if "Body" not in response:
|
|
698
826
|
raise RuntimeError(f"Could not fetch body stream for {url}")
|
|
699
|
-
return response["Body"]
|
|
827
|
+
return response["Body"] # type: ignore
|
|
700
828
|
|
|
701
829
|
@classmethod
|
|
702
|
-
def
|
|
703
|
-
|
|
704
|
-
) -> None:
|
|
705
|
-
# Don't try to do anonympus writes.
|
|
706
|
-
dstObj = get_object_for_url(url)
|
|
707
|
-
|
|
708
|
-
logger.debug("Uploading %s", dstObj.key)
|
|
709
|
-
# uploadFile takes care of using multipart upload if the file is larger than partSize (default to 5MB)
|
|
710
|
-
uploadFile(
|
|
711
|
-
readable=readable,
|
|
712
|
-
resource=s3_boto3_resource,
|
|
713
|
-
bucketName=dstObj.bucket_name,
|
|
714
|
-
fileID=dstObj.key,
|
|
715
|
-
partSize=5 * 1000 * 1000,
|
|
716
|
-
)
|
|
830
|
+
def _list_url(cls, url: ParseResult) -> list[str]:
|
|
831
|
+
return list_objects_for_url(url)
|
|
717
832
|
|
|
718
833
|
@classmethod
|
|
719
|
-
def
|
|
834
|
+
def _supports_url(cls, url: ParseResult, export: bool = False) -> bool:
|
|
835
|
+
# TODO: export seems unused
|
|
836
|
+
return url.scheme.lower() == 's3'
|
|
837
|
+
|
|
838
|
+
def get_public_url(self, file_id: str) -> str:
|
|
839
|
+
"""Turn s3:// into http:// and put a public-read ACL on it."""
|
|
720
840
|
try:
|
|
721
|
-
return
|
|
722
|
-
|
|
723
|
-
|
|
724
|
-
|
|
841
|
+
return create_public_url(
|
|
842
|
+
self.s3_resource,
|
|
843
|
+
bucket=self.bucket_name,
|
|
844
|
+
key=self._key_in_bucket(
|
|
845
|
+
identifier=file_id,
|
|
846
|
+
prefix=self.content_key_prefix,
|
|
847
|
+
),
|
|
848
|
+
)
|
|
849
|
+
except self.s3_client.exceptions.NoSuchKey:
|
|
850
|
+
raise NoSuchFileException(file_id)
|
|
851
|
+
except ClientError as e:
|
|
852
|
+
if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
|
|
853
|
+
raise NoSuchFileException(file_id)
|
|
854
|
+
else:
|
|
855
|
+
raise
|
|
856
|
+
|
|
857
|
+
def get_shared_public_url(self, file_id: str) -> str:
|
|
858
|
+
"""Turn s3:// into http:// and put a public-read ACL on it."""
|
|
859
|
+
# since this is only for a few files like "config.pickle"... why and what is this used for?
|
|
860
|
+
self._requireValidSharedFileName(file_id)
|
|
861
|
+
try:
|
|
862
|
+
return create_public_url(
|
|
863
|
+
self.s3_resource,
|
|
864
|
+
bucket=self.bucket_name,
|
|
865
|
+
key=self._key_in_bucket(
|
|
866
|
+
identifier=file_id,
|
|
867
|
+
prefix=self.shared_key_prefix,
|
|
868
|
+
),
|
|
869
|
+
)
|
|
870
|
+
except self.s3_client.exceptions.NoSuchKey:
|
|
871
|
+
raise NoSuchFileException(file_id)
|
|
872
|
+
except ClientError as e:
|
|
873
|
+
if e.response.get('ResponseMetadata', {}).get('HTTPStatusCode') == 404:
|
|
874
|
+
raise NoSuchFileException(file_id)
|
|
875
|
+
else:
|
|
876
|
+
raise
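A rough sketch of what a helper like create_public_url plausibly does (an assumption, not the actual toil.lib.aws.s3 implementation): put a public-read ACL on the object and return a plain HTTPS URL. This only succeeds on buckets whose Block Public Access settings allow public ACLs.

import boto3

def make_object_public_url(bucket: str, key: str, region: str) -> str:
    # Assumes the bucket permits public ACLs; otherwise the Acl().put() call fails.
    s3 = boto3.resource("s3", region_name=region)
    s3.Object(bucket, key).Acl().put(ACL="public-read")
    return f"https://{bucket}.s3.{region}.amazonaws.com/{key}"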
|
|
725
877
|
|
|
726
878
|
@classmethod
|
|
727
879
|
def _get_is_directory(cls, url: ParseResult) -> bool:
|
|
728
880
|
# We consider it a directory if anything is in it.
|
|
729
881
|
# TODO: Can we just get the first item and not the whole list?
|
|
730
|
-
return len(
|
|
731
|
-
|
|
732
|
-
@classmethod
|
|
733
|
-
def _supports_url(cls, url: ParseResult, export: bool = False) -> bool:
|
|
734
|
-
return url.scheme.lower() == "s3"
|
|
882
|
+
return len(list_objects_for_url(url)) > 0
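One way to resolve the TODO above without materializing the whole listing (a sketch only; list_objects_for_url may already paginate lazily) is to ask S3 for at most one key under the prefix:

import boto3

def prefix_has_any_object(bucket: str, prefix: str) -> bool:
    # MaxKeys=1 lets S3 answer as soon as a single matching key is found.
    response = boto3.client("s3").list_objects_v2(Bucket=bucket, Prefix=prefix, MaxKeys=1)
    return response.get("KeyCount", 0) > 0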
|
|
735
883
|
|
|
736
|
-
def
|
|
737
|
-
self, local_path: FileID, job_id: Optional[FileID] = None, cleanup: bool = False
|
|
738
|
-
) -> FileID:
|
|
739
|
-
info = self.FileInfo.create(job_id if cleanup else None)
|
|
740
|
-
info.upload(local_path, not self.config.disableJobStoreChecksumVerification)
|
|
741
|
-
info.save()
|
|
742
|
-
logger.debug("Wrote %r of from %r", info, local_path)
|
|
743
|
-
return info.fileID
|
|
744
|
-
|
|
745
|
-
@contextmanager
|
|
746
|
-
def write_file_stream(
|
|
884
|
+
def get_empty_file_store_id(
|
|
747
885
|
self,
|
|
748
|
-
job_id: Optional[
|
|
886
|
+
job_id: Optional[str] = None,
|
|
749
887
|
cleanup: bool = False,
|
|
750
|
-
basename=None,
|
|
751
|
-
|
|
752
|
-
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
|
|
759
|
-
|
|
760
|
-
@contextmanager
|
|
761
|
-
def write_shared_file_stream(
|
|
762
|
-
self, shared_file_name, encrypted=None, encoding=None, errors=None
|
|
763
|
-
):
|
|
764
|
-
self._requireValidSharedFileName(shared_file_name)
|
|
765
|
-
info = self.FileInfo.loadOrCreate(
|
|
766
|
-
jobStoreFileID=self._shared_file_id(shared_file_name),
|
|
767
|
-
ownerID=str(self.sharedFileOwnerID),
|
|
768
|
-
encrypted=encrypted,
|
|
769
|
-
)
|
|
770
|
-
with info.uploadStream(encoding=encoding, errors=errors) as writable:
|
|
771
|
-
yield writable
|
|
772
|
-
info.save()
|
|
773
|
-
logger.debug("Wrote %r for shared file %r.", info, shared_file_name)
|
|
774
|
-
|
|
775
|
-
def update_file(self, file_id, local_path):
|
|
776
|
-
info = self.FileInfo.loadOrFail(file_id)
|
|
777
|
-
info.upload(local_path, not self.config.disableJobStoreChecksumVerification)
|
|
778
|
-
info.save()
|
|
779
|
-
logger.debug("Wrote %r from path %r.", info, local_path)
|
|
780
|
-
|
|
781
|
-
@contextmanager
|
|
782
|
-
def update_file_stream(self, file_id, encoding=None, errors=None):
|
|
783
|
-
info = self.FileInfo.loadOrFail(file_id)
|
|
784
|
-
with info.uploadStream(encoding=encoding, errors=errors) as writable:
|
|
785
|
-
yield writable
|
|
786
|
-
info.save()
|
|
787
|
-
logger.debug("Wrote %r from stream.", info)
|
|
788
|
-
|
|
789
|
-
def file_exists(self, file_id):
|
|
790
|
-
return self.FileInfo.exists(file_id)
|
|
791
|
-
|
|
792
|
-
def get_file_size(self, file_id):
|
|
793
|
-
if not self.file_exists(file_id):
|
|
794
|
-
return 0
|
|
795
|
-
info = self.FileInfo.loadOrFail(file_id)
|
|
796
|
-
return info.getSize()
|
|
797
|
-
|
|
798
|
-
def read_file(self, file_id, local_path, symlink=False):
|
|
799
|
-
info = self.FileInfo.loadOrFail(file_id)
|
|
800
|
-
logger.debug("Reading %r into %r.", info, local_path)
|
|
801
|
-
info.download(local_path, not self.config.disableJobStoreChecksumVerification)
|
|
802
|
-
if getattr(file_id, "executable", False):
|
|
803
|
-
os.chmod(local_path, os.stat(local_path).st_mode | stat.S_IXUSR)
|
|
804
|
-
|
|
805
|
-
@contextmanager
|
|
806
|
-
def read_file_stream(self, file_id, encoding=None, errors=None):
|
|
807
|
-
info = self.FileInfo.loadOrFail(file_id)
|
|
808
|
-
logger.debug("Reading %r into stream.", info)
|
|
809
|
-
with info.downloadStream(encoding=encoding, errors=errors) as readable:
|
|
810
|
-
yield readable
|
|
811
|
-
|
|
812
|
-
@contextmanager
|
|
813
|
-
def read_shared_file_stream(self, shared_file_name, encoding=None, errors=None):
|
|
814
|
-
self._requireValidSharedFileName(shared_file_name)
|
|
815
|
-
jobStoreFileID = self._shared_file_id(shared_file_name)
|
|
816
|
-
info = self.FileInfo.loadOrFail(jobStoreFileID, customName=shared_file_name)
|
|
817
|
-
logger.debug(
|
|
818
|
-
"Reading %r for shared file %r into stream.", info, shared_file_name
|
|
819
|
-
)
|
|
820
|
-
with info.downloadStream(encoding=encoding, errors=errors) as readable:
|
|
821
|
-
yield readable
|
|
822
|
-
|
|
823
|
-
def delete_file(self, file_id):
|
|
824
|
-
info = self.FileInfo.load(file_id)
|
|
825
|
-
if info is None:
|
|
826
|
-
logger.debug("File %s does not exist, skipping deletion.", file_id)
|
|
827
|
-
else:
|
|
828
|
-
info.delete()
|
|
829
|
-
|
|
830
|
-
def write_logs(self, msg):
|
|
831
|
-
info = self.FileInfo.create(str(self.statsFileOwnerID))
|
|
832
|
-
with info.uploadStream(multipart=False) as writeable:
|
|
833
|
-
if isinstance(msg, str):
|
|
834
|
-
# This stream is for binary data, so encode any non-encoded things
|
|
835
|
-
msg = msg.encode("utf-8", errors="ignore")
|
|
836
|
-
writeable.write(msg)
|
|
837
|
-
info.save()
|
|
838
|
-
|
|
839
|
-
def read_logs(self, callback, read_all=False):
|
|
840
|
-
itemsProcessed = 0
|
|
841
|
-
|
|
842
|
-
for info in self._read_logs(callback, self.statsFileOwnerID):
|
|
843
|
-
info._ownerID = str(self.readStatsFileOwnerID) # boto3 requires strings
|
|
844
|
-
info.save()
|
|
845
|
-
itemsProcessed += 1
|
|
846
|
-
|
|
847
|
-
if read_all:
|
|
848
|
-
for _ in self._read_logs(callback, self.readStatsFileOwnerID):
|
|
849
|
-
itemsProcessed += 1
|
|
850
|
-
|
|
851
|
-
return itemsProcessed
|
|
852
|
-
|
|
853
|
-
def _read_logs(self, callback, ownerId):
|
|
854
|
-
items = None
|
|
855
|
-
for attempt in retry_sdb():
|
|
856
|
-
with attempt:
|
|
857
|
-
items = boto3_pager(
|
|
858
|
-
self.db.select,
|
|
859
|
-
"Items",
|
|
860
|
-
ConsistentRead=True,
|
|
861
|
-
SelectExpression=f"select * from `{self.files_domain_name}` where ownerID='{str(ownerId)}'",
|
|
862
|
-
)
|
|
863
|
-
assert items is not None
|
|
864
|
-
for item in items:
|
|
865
|
-
info = self.FileInfo.fromItem(item)
|
|
866
|
-
with info.downloadStream() as readable:
|
|
867
|
-
callback(readable)
|
|
868
|
-
yield info
|
|
869
|
-
|
|
870
|
-
# TODO: Make this retry more specific?
|
|
871
|
-
# example: https://github.com/DataBiosphere/toil/issues/3378
|
|
872
|
-
@retry()
|
|
873
|
-
def get_public_url(self, jobStoreFileID):
|
|
874
|
-
info = self.FileInfo.loadOrFail(jobStoreFileID)
|
|
875
|
-
if info.content is not None:
|
|
876
|
-
with info.uploadStream(allowInlining=False) as f:
|
|
877
|
-
f.write(info.content)
|
|
878
|
-
|
|
879
|
-
self.files_bucket.Object(compat_bytes(jobStoreFileID)).Acl().put(
|
|
880
|
-
ACL="public-read"
|
|
888
|
+
basename: Optional[str] = None,
|
|
889
|
+
) -> str:
|
|
890
|
+
"""Create an empty file in s3 and return a bare string file ID."""
|
|
891
|
+
file_id = str(uuid.uuid4())
|
|
892
|
+
self.write_to_bucket(
|
|
893
|
+
identifier=f'{file_id}/0/{basename}',
|
|
894
|
+
prefix=self.content_key_prefix,
|
|
895
|
+
data=None,
|
|
896
|
+
bucket=self.bucket_name
|
|
881
897
|
)
|
|
882
|
-
|
|
883
|
-
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
|
|
889
|
-
|
|
890
|
-
|
|
898
|
+
if job_id and cleanup:
|
|
899
|
+
self.associate_job_with_file(job_id, file_id)
|
|
900
|
+
return file_id
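The empty-file layout used above amounts to writing a zero-byte object under <content prefix>/<uuid>/0/<basename>. A self-contained sketch (the prefix joining is an assumption; Toil's _key_in_bucket and write_to_bucket helpers may compose keys differently):

import uuid
import boto3

def put_empty_file(bucket: str, content_prefix: str, basename: str = "") -> str:
    # Create a zero-byte object and return the bare UUID used as the file ID.
    file_id = str(uuid.uuid4())
    key = f"{content_prefix}/{file_id}/0/{basename}"
    boto3.client("s3").put_object(Bucket=bucket, Key=key, Body=b"")
    return file_id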
|
|
901
|
+
|
|
902
|
+
###################################### LOGGING API ######################################
|
|
903
|
+
|
|
904
|
+
def write_logs(self, log_msg: Union[bytes, str]) -> None:
|
|
905
|
+
if isinstance(log_msg, str):
|
|
906
|
+
log_msg = log_msg.encode('utf-8', errors='ignore')
|
|
907
|
+
file_obj = BytesIO(log_msg)
|
|
908
|
+
|
|
909
|
+
key_name = self._key_in_bucket(
|
|
910
|
+
identifier=f'{datetime.datetime.now()}{str(uuid.uuid4())}'.replace(
|
|
911
|
+
' ', '_'
|
|
912
|
+
),
|
|
913
|
+
prefix=self.logs_key_prefix,
|
|
891
914
|
)
|
|
915
|
+
self.s3_client.upload_fileobj(Bucket=self.bucket_name,
|
|
916
|
+
Key=key_name,
|
|
917
|
+
ExtraArgs=self._get_encryption_args(),
|
|
918
|
+
Fileobj=file_obj)
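Boiled down, the log write above uploads the message bytes under a timestamped, collision-free key in the logs prefix. A hedged sketch without Toil's key and encryption helpers:

import datetime
import uuid
from io import BytesIO
import boto3

def upload_log(bucket: str, logs_prefix: str, message: bytes) -> str:
    # Timestamp plus UUID keeps log keys roughly time-ordered and unique.
    stamp = f"{datetime.datetime.now()}{uuid.uuid4()}".replace(" ", "_")
    key = f"{logs_prefix}/{stamp}"
    boto3.client("s3").upload_fileobj(Fileobj=BytesIO(message), Bucket=bucket, Key=key)
    return key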
|
|
892
919
|
|
|
893
|
-
|
|
894
|
-
# query_auth is False when using an IAM role (see issue #2043). Including the
|
|
895
|
-
# x-amz-security-token parameter without the access key results in a 403,
|
|
896
|
-
# even if the resource is public, so we need to remove it.
|
|
897
|
-
scheme, netloc, path, query, fragment = urlsplit(url)
|
|
898
|
-
params = parse_qs(query)
|
|
899
|
-
if "x-amz-security-token" in params:
|
|
900
|
-
del params["x-amz-security-token"]
|
|
901
|
-
if "AWSAccessKeyId" in params:
|
|
902
|
-
del params["AWSAccessKeyId"]
|
|
903
|
-
if "Signature" in params:
|
|
904
|
-
del params["Signature"]
|
|
905
|
-
query = urlencode(params, doseq=True)
|
|
906
|
-
url = urlunsplit((scheme, netloc, path, query, fragment))
|
|
907
|
-
return url
|
|
908
|
-
|
|
909
|
-
def get_shared_public_url(self, shared_file_name):
|
|
910
|
-
self._requireValidSharedFileName(shared_file_name)
|
|
911
|
-
return self.get_public_url(self._shared_file_id(shared_file_name))
|
|
912
|
-
|
|
913
|
-
def _bindBucket(
|
|
914
|
-
self,
|
|
915
|
-
bucket_name: str,
|
|
916
|
-
create: bool = False,
|
|
917
|
-
block: bool = True,
|
|
918
|
-
versioning: bool = False,
|
|
919
|
-
check_versioning_consistency: bool = True,
|
|
920
|
-
):
|
|
920
|
+
def read_logs(self, callback: Callable[..., Any], read_all: bool = False) -> int:
|
|
921
921
|
"""
|
|
922
|
-
|
|
923
|
-
|
|
924
|
-
|
|
925
|
-
:param str bucket_name: the name of the bucket to bind to
|
|
926
|
-
|
|
927
|
-
:param bool create: Whether to create the bucket if it doesn't exist
|
|
928
|
-
|
|
929
|
-
:param bool block: If False, return None if the bucket doesn't exist. If True, wait until
|
|
930
|
-
bucket appears. Ignored if `create` is True.
|
|
931
|
-
|
|
932
|
-
:rtype: Bucket|None
|
|
933
|
-
:raises botocore.exceptions.ClientError: If `block` is True and the bucket still doesn't exist after the
|
|
934
|
-
retry timeout expires.
|
|
922
|
+
This fetches all referenced logs in the database from s3 as readable objects
|
|
923
|
+
and runs "callback()" on them.
|
|
935
924
|
"""
|
|
936
|
-
|
|
937
|
-
|
|
938
|
-
|
|
939
|
-
|
|
940
|
-
|
|
941
|
-
|
|
942
|
-
|
|
943
|
-
|
|
944
|
-
|
|
945
|
-
|
|
946
|
-
|
|
947
|
-
#
|
|
948
|
-
#
|
|
949
|
-
|
|
950
|
-
|
|
951
|
-
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
|
|
955
|
-
|
|
956
|
-
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
except ClientError as e:
|
|
971
|
-
error_http_status = get_error_status(e)
|
|
972
|
-
if error_http_status == 404:
|
|
973
|
-
bucketExisted = False
|
|
974
|
-
logger.debug("Bucket '%s' does not exist.", bucket_name)
|
|
975
|
-
if create:
|
|
976
|
-
bucket = create_s3_bucket(
|
|
977
|
-
self.s3_resource, bucket_name, self.region
|
|
978
|
-
)
|
|
979
|
-
# Wait until the bucket exists before checking the region and adding tags
|
|
980
|
-
bucket.wait_until_exists()
|
|
981
|
-
|
|
982
|
-
# It is possible for create_bucket to return but
|
|
983
|
-
# for an immediate request for the bucket region to
|
|
984
|
-
# produce an S3ResponseError with code
|
|
985
|
-
# NoSuchBucket. We let that kick us back up to the
|
|
986
|
-
# main retry loop.
|
|
987
|
-
assert (
|
|
988
|
-
get_bucket_region(bucket_name) == self.region
|
|
989
|
-
), f"bucket_name: {bucket_name}, {get_bucket_region(bucket_name)} != {self.region}"
|
|
990
|
-
|
|
991
|
-
tags = build_tag_dict_from_env()
|
|
992
|
-
|
|
993
|
-
if tags:
|
|
994
|
-
flat_tags = flatten_tags(tags)
|
|
995
|
-
bucket_tagging = self.s3_resource.BucketTagging(
|
|
996
|
-
bucket_name
|
|
997
|
-
)
|
|
998
|
-
bucket_tagging.put(Tagging={"TagSet": flat_tags})
|
|
999
|
-
|
|
1000
|
-
# Configure bucket so that we can make objects in
|
|
1001
|
-
# it public, which was the historical default.
|
|
1002
|
-
enable_public_objects(bucket_name)
|
|
1003
|
-
elif block:
|
|
1004
|
-
raise
|
|
1005
|
-
else:
|
|
1006
|
-
return None
|
|
1007
|
-
elif error_http_status == 301:
|
|
1008
|
-
# This is raised if the user attempts to get a bucket in a region outside
|
|
1009
|
-
# the specified one, if the specified one is not `us-east-1`. The us-east-1
|
|
1010
|
-
# server allows a user to use buckets from any region.
|
|
1011
|
-
raise BucketLocationConflictException(
|
|
1012
|
-
get_bucket_region(bucket_name)
|
|
1013
|
-
)
|
|
1014
|
-
else:
|
|
1015
|
-
raise
|
|
1016
|
-
else:
|
|
1017
|
-
bucketRegion = get_bucket_region(bucket_name)
|
|
1018
|
-
if bucketRegion != self.region:
|
|
1019
|
-
raise BucketLocationConflictException(bucketRegion)
|
|
1020
|
-
|
|
1021
|
-
if versioning and not bucketExisted:
|
|
1022
|
-
# only call this method on bucket creation
|
|
1023
|
-
bucket.Versioning().enable()
|
|
1024
|
-
# Now wait until versioning is actually on. Some uploads
|
|
1025
|
-
# would come back with no versions; maybe they were
|
|
1026
|
-
# happening too fast and this setting isn't sufficiently
|
|
1027
|
-
# consistent?
|
|
1028
|
-
time.sleep(1)
|
|
1029
|
-
while not self._getBucketVersioning(bucket_name):
|
|
1030
|
-
logger.warning(
|
|
1031
|
-
f"Waiting for versioning activation on bucket '{bucket_name}'..."
|
|
1032
|
-
)
|
|
1033
|
-
time.sleep(1)
|
|
1034
|
-
elif check_versioning_consistency:
|
|
1035
|
-
# now test for versioning consistency
|
|
1036
|
-
# we should never see any of these errors since 'versioning' should always be true
|
|
1037
|
-
bucket_versioning = self._getBucketVersioning(bucket_name)
|
|
1038
|
-
if bucket_versioning != versioning:
|
|
1039
|
-
assert False, "Cannot modify versioning on existing bucket"
|
|
1040
|
-
elif bucket_versioning is None:
|
|
1041
|
-
assert False, "Cannot use a bucket with versioning suspended"
|
|
1042
|
-
if bucketExisted:
|
|
1043
|
-
logger.debug(
|
|
1044
|
-
f"Using pre-existing job store bucket '{bucket_name}'."
|
|
1045
|
-
)
|
|
1046
|
-
else:
|
|
1047
|
-
logger.debug(
|
|
1048
|
-
f"Created new job store bucket '{bucket_name}' with versioning state {versioning}."
|
|
1049
|
-
)
|
|
1050
|
-
|
|
1051
|
-
return bucket
|
|
1052
|
-
|
|
1053
|
-
def _bindDomain(
|
|
1054
|
-
self, domain_name: str, create: bool = False, block: bool = True
|
|
1055
|
-
) -> None:
|
|
925
|
+
items_processed = 0
|
|
926
|
+
LOG_MARKER = "most_recently_read_log.marker"
|
|
927
|
+
read_log_marker = "0"
|
|
928
|
+
if not read_all:
|
|
929
|
+
# We want to pick up reading where we left off
|
|
930
|
+
try:
|
|
931
|
+
read_log_marker = self.read_from_bucket(
|
|
932
|
+
identifier=LOG_MARKER,
|
|
933
|
+
prefix=self.shared_key_prefix
|
|
934
|
+
).decode('utf-8')
|
|
935
|
+
except self.s3_client.exceptions.NoSuchKey:
|
|
936
|
+
# We haven't recorded that we've read anything yet.
|
|
937
|
+
# Leave read_log_marker at "0"
|
|
938
|
+
pass
|
|
939
|
+
|
|
940
|
+
startafter = None if read_log_marker == "0" else read_log_marker
|
|
941
|
+
for result in list_s3_items(self.s3_resource, bucket=self.bucket_name, prefix=self.logs_key_prefix, startafter=startafter):
|
|
942
|
+
if result['Key'] > read_log_marker or read_all:
|
|
943
|
+
read_log_marker = result['Key']
|
|
944
|
+
with download_stream(self.s3_resource,
|
|
945
|
+
bucket=self.bucket_name,
|
|
946
|
+
key=result['Key'],
|
|
947
|
+
extra_args=self._get_encryption_args()) as readable:
|
|
948
|
+
callback(readable)
|
|
949
|
+
items_processed += 1
|
|
950
|
+
|
|
951
|
+
if items_processed > 0:
|
|
952
|
+
# We processed something, so we need to update the marker.
|
|
953
|
+
self.write_to_bucket(identifier=LOG_MARKER,
|
|
954
|
+
prefix=self.shared_key_prefix,
|
|
955
|
+
data=read_log_marker)
|
|
956
|
+
return items_processed
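The marker scheme above (persist the key of the last log read, then list only keys that sort after it) maps directly onto S3's StartAfter parameter. A minimal sketch of that listing pattern, independent of Toil's list_s3_items helper:

import boto3

def iter_keys_after(bucket: str, prefix: str, marker: str = ""):
    # Yield keys under prefix that sort after marker (all keys if marker is empty).
    kwargs = {"Bucket": bucket, "Prefix": prefix}
    if marker:
        kwargs["StartAfter"] = marker
    paginator = boto3.client("s3").get_paginator("list_objects_v2")
    for page in paginator.paginate(**kwargs):
        for obj in page.get("Contents", []):
            yield obj["Key"]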
|
|
957
|
+
|
|
958
|
+
def _get_encryption_args(self) -> dict[str, Any]:
|
|
1056
959
|
"""
|
|
1057
|
-
|
|
1058
|
-
create the domain if it does not exist.
|
|
1059
|
-
Return the Boto Domain object representing the SDB domain of the given name. If the
|
|
1060
|
-
domain does not exist and `create` is True, it will be created.
|
|
1061
|
-
|
|
1062
|
-
:param str domain_name: the name of the domain to bind to
|
|
960
|
+
Get the encryption arguments to pass to an AWS function.
|
|
1063
961
|
|
|
1064
|
-
|
|
962
|
+
Reads live from the SSE key file referenced by the config.
|
|
1065
963
|
|
|
1066
|
-
|
|
1067
|
-
domain appears. This parameter is ignored if create is True.
|
|
964
|
+
If the config is not available, returns an empty dict.
|
|
1068
965
|
|
|
1069
|
-
:
|
|
1070
|
-
:raises ClientError: If `block` is True and the domain still doesn't exist after the
|
|
1071
|
-
retry timeout expires.
|
|
1072
|
-
"""
|
|
1073
|
-
logger.debug("Binding to job store domain '%s'.", domain_name)
|
|
1074
|
-
retryargs = dict(
|
|
1075
|
-
predicate=lambda e: no_such_sdb_domain(e) or sdb_unavailable(e)
|
|
1076
|
-
)
|
|
1077
|
-
if not block:
|
|
1078
|
-
retryargs["timeout"] = 15
|
|
1079
|
-
for attempt in retry_sdb(**retryargs):
|
|
1080
|
-
with attempt:
|
|
1081
|
-
try:
|
|
1082
|
-
self.db.domain_metadata(DomainName=domain_name)
|
|
1083
|
-
return
|
|
1084
|
-
except ClientError as e:
|
|
1085
|
-
if no_such_sdb_domain(e):
|
|
1086
|
-
if create:
|
|
1087
|
-
self.db.create_domain(DomainName=domain_name)
|
|
1088
|
-
return
|
|
1089
|
-
elif block:
|
|
1090
|
-
raise
|
|
1091
|
-
else:
|
|
1092
|
-
raise DomainDoesNotExist(domain_name)
|
|
1093
|
-
else:
|
|
1094
|
-
raise
|
|
1095
|
-
|
|
1096
|
-
def _new_job_id(self):
|
|
1097
|
-
return str(uuid.uuid4())
|
|
1098
|
-
|
|
1099
|
-
# A dummy job ID under which all shared files are stored
|
|
1100
|
-
sharedFileOwnerID = uuid.UUID("891f7db6-e4d9-4221-a58e-ab6cc4395f94")
|
|
1101
|
-
|
|
1102
|
-
# A dummy job ID under which all unread stats files are stored
|
|
1103
|
-
statsFileOwnerID = uuid.UUID("bfcf5286-4bc7-41ef-a85d-9ab415b69d53")
|
|
1104
|
-
|
|
1105
|
-
# A dummy job ID under which all read stats files are stored
|
|
1106
|
-
readStatsFileOwnerID = uuid.UUID("e77fc3aa-d232-4255-ae04-f64ee8eb0bfa")
|
|
1107
|
-
|
|
1108
|
-
def _shared_file_id(self, shared_file_name):
|
|
1109
|
-
return str(uuid.uuid5(self.sharedFileOwnerID, shared_file_name))
|
|
1110
|
-
|
|
1111
|
-
@InnerClass
|
|
1112
|
-
class FileInfo(SDBHelper):
|
|
966
|
+
:raises ValueError: If the key data is not formatted correctly.
|
|
1113
967
|
"""
|
|
1114
|
-
|
|
1115
|
-
"""
|
|
1116
|
-
|
|
1117
|
-
outer = None
|
|
1118
|
-
"""
|
|
1119
|
-
:type: AWSJobStore
|
|
1120
|
-
"""
|
|
1121
|
-
|
|
1122
|
-
def __init__(
|
|
1123
|
-
self,
|
|
1124
|
-
fileID,
|
|
1125
|
-
ownerID,
|
|
1126
|
-
encrypted,
|
|
1127
|
-
version=None,
|
|
1128
|
-
content=None,
|
|
1129
|
-
numContentChunks=0,
|
|
1130
|
-
checksum=None,
|
|
1131
|
-
):
|
|
1132
|
-
"""
|
|
1133
|
-
:type fileID: str
|
|
1134
|
-
:param fileID: the file's ID
|
|
1135
|
-
|
|
1136
|
-
:type ownerID: str
|
|
1137
|
-
:param ownerID: ID of the entity owning this file, typically a job ID aka jobStoreID
|
|
1138
|
-
|
|
1139
|
-
:type encrypted: bool
|
|
1140
|
-
:param encrypted: whether the file is stored in encrypted form
|
|
1141
|
-
|
|
1142
|
-
:type version: str|None
|
|
1143
|
-
:param version: a non-empty string containing the most recent version of the S3
|
|
1144
|
-
object storing this file's content, None if the file is new, or empty string if the
|
|
1145
|
-
file is inlined.
|
|
1146
|
-
|
|
1147
|
-
:type content: str|None
|
|
1148
|
-
:param content: this file's inlined content
|
|
1149
|
-
|
|
1150
|
-
:type numContentChunks: int
|
|
1151
|
-
:param numContentChunks: the number of SDB domain attributes occupied by this files
|
|
1152
|
-
|
|
1153
|
-
:type checksum: str|None
|
|
1154
|
-
:param checksum: the checksum of the file, if available. Formatted
|
|
1155
|
-
as <algorithm>$<lowercase hex hash>.
|
|
1156
|
-
|
|
1157
|
-
inlined content. Note that an inlined empty string still occupies one chunk.
|
|
1158
|
-
"""
|
|
1159
|
-
super().__init__()
|
|
1160
|
-
self._fileID = fileID
|
|
1161
|
-
self._ownerID = ownerID
|
|
1162
|
-
self.encrypted = encrypted
|
|
1163
|
-
self._version = version
|
|
1164
|
-
self._previousVersion = version
|
|
1165
|
-
self._content = content
|
|
1166
|
-
self._checksum = checksum
|
|
1167
|
-
self._numContentChunks = numContentChunks
|
|
1168
|
-
|
|
1169
|
-
@property
|
|
1170
|
-
def fileID(self):
|
|
1171
|
-
return self._fileID
|
|
1172
|
-
|
|
1173
|
-
@property
|
|
1174
|
-
def ownerID(self):
|
|
1175
|
-
return self._ownerID
|
|
1176
|
-
|
|
1177
|
-
@property
|
|
1178
|
-
def version(self):
|
|
1179
|
-
return self._version
|
|
1180
|
-
|
|
1181
|
-
@version.setter
|
|
1182
|
-
def version(self, version):
|
|
1183
|
-
# Version should only change once
|
|
1184
|
-
assert self._previousVersion == self._version
|
|
1185
|
-
self._version = version
|
|
1186
|
-
if version:
|
|
1187
|
-
self.content = None
|
|
1188
|
-
|
|
1189
|
-
@property
|
|
1190
|
-
def previousVersion(self):
|
|
1191
|
-
return self._previousVersion
|
|
1192
|
-
|
|
1193
|
-
@property
|
|
1194
|
-
def content(self):
|
|
1195
|
-
return self._content
|
|
1196
|
-
|
|
1197
|
-
@property
|
|
1198
|
-
def checksum(self):
|
|
1199
|
-
return self._checksum
|
|
1200
|
-
|
|
1201
|
-
@checksum.setter
|
|
1202
|
-
def checksum(self, checksum):
|
|
1203
|
-
self._checksum = checksum
|
|
1204
|
-
|
|
1205
|
-
@content.setter
|
|
1206
|
-
def content(self, content):
|
|
1207
|
-
assert content is None or isinstance(content, bytes)
|
|
1208
|
-
self._content = content
|
|
1209
|
-
if content is not None:
|
|
1210
|
-
self.version = ""
|
|
1211
|
-
|
|
1212
|
-
@classmethod
|
|
1213
|
-
def create(cls, ownerID: str):
|
|
1214
|
-
return cls(
|
|
1215
|
-
str(uuid.uuid4()), ownerID, encrypted=cls.outer.sseKeyPath is not None
|
|
1216
|
-
)
|
|
1217
|
-
|
|
1218
|
-
@classmethod
|
|
1219
|
-
def presenceIndicator(cls):
|
|
1220
|
-
return "encrypted"
|
|
1221
|
-
|
|
1222
|
-
@classmethod
|
|
1223
|
-
def exists(cls, jobStoreFileID):
|
|
1224
|
-
for attempt in retry_sdb():
|
|
1225
|
-
with attempt:
|
|
1226
|
-
return bool(
|
|
1227
|
-
cls.outer.db.get_attributes(
|
|
1228
|
-
DomainName=cls.outer.files_domain_name,
|
|
1229
|
-
ItemName=compat_bytes(jobStoreFileID),
|
|
1230
|
-
AttributeNames=[cls.presenceIndicator()],
|
|
1231
|
-
ConsistentRead=True,
|
|
1232
|
-
).get("Attributes", [])
|
|
1233
|
-
)
|
|
1234
|
-
|
|
1235
|
-
@classmethod
|
|
1236
|
-
def load(cls, jobStoreFileID):
|
|
1237
|
-
for attempt in retry_sdb():
|
|
1238
|
-
with attempt:
|
|
1239
|
-
self = cls.fromItem(
|
|
1240
|
-
{
|
|
1241
|
-
"Name": compat_bytes(jobStoreFileID),
|
|
1242
|
-
"Attributes": cls.outer.db.get_attributes(
|
|
1243
|
-
DomainName=cls.outer.files_domain_name,
|
|
1244
|
-
ItemName=compat_bytes(jobStoreFileID),
|
|
1245
|
-
ConsistentRead=True,
|
|
1246
|
-
).get("Attributes", []),
|
|
1247
|
-
}
|
|
1248
|
-
)
|
|
1249
|
-
return self
|
|
1250
|
-
|
|
1251
|
-
@classmethod
|
|
1252
|
-
def loadOrCreate(cls, jobStoreFileID, ownerID, encrypted):
|
|
1253
|
-
self = cls.load(jobStoreFileID)
|
|
1254
|
-
if encrypted is None:
|
|
1255
|
-
encrypted = cls.outer.sseKeyPath is not None
|
|
1256
|
-
if self is None:
|
|
1257
|
-
self = cls(jobStoreFileID, ownerID, encrypted=encrypted)
|
|
1258
|
-
else:
|
|
1259
|
-
assert self.fileID == jobStoreFileID
|
|
1260
|
-
assert self.ownerID == ownerID
|
|
1261
|
-
self.encrypted = encrypted
|
|
1262
|
-
return self
|
|
1263
|
-
|
|
1264
|
-
@classmethod
|
|
1265
|
-
def loadOrFail(cls, jobStoreFileID, customName=None):
|
|
1266
|
-
"""
|
|
1267
|
-
:rtype: AWSJobStore.FileInfo
|
|
1268
|
-
:return: an instance of this class representing the file with the given ID
|
|
1269
|
-
:raises NoSuchFileException: if given file does not exist
|
|
1270
|
-
"""
|
|
1271
|
-
self = cls.load(jobStoreFileID)
|
|
1272
|
-
if self is None:
|
|
1273
|
-
raise NoSuchFileException(jobStoreFileID, customName=customName)
|
|
1274
|
-
else:
|
|
1275
|
-
return self
|
|
1276
|
-
|
|
1277
|
-
@classmethod
|
|
1278
|
-
def fromItem(cls, item: "ItemTypeDef"):
|
|
1279
|
-
"""
|
|
1280
|
-
Convert an SDB item to an instance of this class.
|
|
1281
|
-
|
|
1282
|
-
:type item: Item
|
|
1283
|
-
"""
|
|
1284
|
-
assert item is not None
|
|
1285
|
-
|
|
1286
|
-
# Strings come back from SDB as unicode
|
|
1287
|
-
def strOrNone(s):
|
|
1288
|
-
return s if s is None else str(s)
|
|
1289
|
-
|
|
1290
|
-
# ownerID and encrypted are the only mandatory attributes
|
|
1291
|
-
ownerID, encrypted, version, checksum = SDBHelper.get_attributes_from_item(
|
|
1292
|
-
item, ["ownerID", "encrypted", "version", "checksum"]
|
|
1293
|
-
)
|
|
1294
|
-
if ownerID is None:
|
|
1295
|
-
assert encrypted is None
|
|
1296
|
-
return None
|
|
1297
|
-
else:
|
|
1298
|
-
encrypted = strict_bool(encrypted)
|
|
1299
|
-
content, numContentChunks = cls.attributesToBinary(item["Attributes"])
|
|
1300
|
-
if encrypted:
|
|
1301
|
-
sseKeyPath = cls.outer.sseKeyPath
|
|
1302
|
-
if sseKeyPath is None:
|
|
1303
|
-
raise AssertionError(
|
|
1304
|
-
"Content is encrypted but no key was provided."
|
|
1305
|
-
)
|
|
1306
|
-
if content is not None:
|
|
1307
|
-
content = encryption.decrypt(content, sseKeyPath)
|
|
1308
|
-
self = cls(
|
|
1309
|
-
fileID=item["Name"],
|
|
1310
|
-
ownerID=ownerID,
|
|
1311
|
-
encrypted=encrypted,
|
|
1312
|
-
version=version,
|
|
1313
|
-
content=content,
|
|
1314
|
-
numContentChunks=numContentChunks,
|
|
1315
|
-
checksum=checksum,
|
|
1316
|
-
)
|
|
1317
|
-
return self
|
|
1318
|
-
|
|
1319
|
-
def toItem(self) -> tuple[dict[str, str], int]:
|
|
1320
|
-
"""
|
|
1321
|
-
Convert this instance to a dictionary of attribute names to values
|
|
1322
|
-
|
|
1323
|
-
:return: the attributes dict and an integer specifying the number of chunk
|
|
1324
|
-
attributes in the dictionary that are used for storing inlined content.
|
|
1325
|
-
"""
|
|
1326
|
-
content = self.content
|
|
1327
|
-
assert content is None or isinstance(content, bytes)
|
|
1328
|
-
if self.encrypted and content is not None:
|
|
1329
|
-
sseKeyPath = self.outer.sseKeyPath
|
|
1330
|
-
if sseKeyPath is None:
|
|
1331
|
-
raise AssertionError(
|
|
1332
|
-
"Encryption requested but no key was provided."
|
|
1333
|
-
)
|
|
1334
|
-
content = encryption.encrypt(content, sseKeyPath)
|
|
1335
|
-
assert content is None or isinstance(content, bytes)
|
|
1336
|
-
attributes = self.binaryToAttributes(content)
|
|
1337
|
-
numChunks = int(attributes["numChunks"])
|
|
1338
|
-
attributes.update(
|
|
1339
|
-
dict(
|
|
1340
|
-
ownerID=self.ownerID or "",
|
|
1341
|
-
encrypted=str(self.encrypted),
|
|
1342
|
-
version=self.version or "",
|
|
1343
|
-
checksum=self.checksum or "",
|
|
1344
|
-
)
|
|
1345
|
-
)
|
|
1346
|
-
return attributes, numChunks
|
|
1347
|
-
|
|
1348
|
-
@classmethod
|
|
1349
|
-
def _reservedAttributes(cls):
|
|
1350
|
-
return 3 + super()._reservedAttributes()
|
|
1351
|
-
|
|
1352
|
-
@staticmethod
|
|
1353
|
-
def maxInlinedSize():
|
|
1354
|
-
return 256
|
|
1355
|
-
|
|
1356
|
-
def save(self):
|
|
1357
|
-
attributes, numNewContentChunks = self.toItem()
|
|
1358
|
-
attributes_boto3 = SDBHelper.attributeDictToList(attributes)
|
|
1359
|
-
# False stands for absence
|
|
1360
|
-
if self.previousVersion is None:
|
|
1361
|
-
expected: "UpdateConditionTypeDef" = {
|
|
1362
|
-
"Name": "version",
|
|
1363
|
-
"Exists": False,
|
|
1364
|
-
}
|
|
1365
|
-
else:
|
|
1366
|
-
expected = {"Name": "version", "Value": cast(str, self.previousVersion)}
|
|
1367
|
-
try:
|
|
1368
|
-
for attempt in retry_sdb():
|
|
1369
|
-
with attempt:
|
|
1370
|
-
self.outer.db.put_attributes(
|
|
1371
|
-
DomainName=self.outer.files_domain_name,
|
|
1372
|
-
ItemName=compat_bytes(self.fileID),
|
|
1373
|
-
Attributes=[
|
|
1374
|
-
{
|
|
1375
|
-
"Name": attribute["Name"],
|
|
1376
|
-
"Value": attribute["Value"],
|
|
1377
|
-
"Replace": True,
|
|
1378
|
-
}
|
|
1379
|
-
for attribute in attributes_boto3
|
|
1380
|
-
],
|
|
1381
|
-
Expected=expected,
|
|
1382
|
-
)
|
|
1383
|
-
# clean up the old version of the file if necessary and safe
|
|
1384
|
-
if self.previousVersion and (self.previousVersion != self.version):
|
|
1385
|
-
for attempt in retry_s3():
|
|
1386
|
-
with attempt:
|
|
1387
|
-
self.outer.s3_client.delete_object(
|
|
1388
|
-
Bucket=self.outer.files_bucket.name,
|
|
1389
|
-
Key=compat_bytes(self.fileID),
|
|
1390
|
-
VersionId=self.previousVersion,
|
|
1391
|
-
)
|
|
1392
|
-
self._previousVersion = self._version
|
|
1393
|
-
if numNewContentChunks < self._numContentChunks:
|
|
1394
|
-
residualChunks = range(numNewContentChunks, self._numContentChunks)
|
|
1395
|
-
residual_chunk_names = [self._chunkName(i) for i in residualChunks]
|
|
1396
|
-
# boto3 requires providing the value as well as the name in the attribute, and we don't store it locally
|
|
1397
|
-
# the php sdk resolves this issue by not requiring the Value key https://github.com/aws/aws-sdk-php/issues/185
|
|
1398
|
-
# but this doesn't extend to boto3
|
|
1399
|
-
delete_attributes = self.outer.db.get_attributes(
|
|
1400
|
-
DomainName=self.outer.files_domain_name,
|
|
1401
|
-
ItemName=compat_bytes(self.fileID),
|
|
1402
|
-
AttributeNames=[chunk for chunk in residual_chunk_names],
|
|
1403
|
-
).get("Attributes")
|
|
1404
|
-
for attempt in retry_sdb():
|
|
1405
|
-
with attempt:
|
|
1406
|
-
self.outer.db.delete_attributes(
|
|
1407
|
-
DomainName=self.outer.files_domain_name,
|
|
1408
|
-
ItemName=compat_bytes(self.fileID),
|
|
1409
|
-
Attributes=delete_attributes,
|
|
1410
|
-
)
|
|
1411
|
-
self.outer.db.get_attributes(
|
|
1412
|
-
DomainName=self.outer.files_domain_name,
|
|
1413
|
-
ItemName=compat_bytes(self.fileID),
|
|
1414
|
-
)
|
|
1415
|
-
|
|
1416
|
-
self._numContentChunks = numNewContentChunks
|
|
1417
|
-
except ClientError as e:
|
|
1418
|
-
if get_error_code(e) == "ConditionalCheckFailed":
|
|
1419
|
-
raise ConcurrentFileModificationException(self.fileID)
|
|
1420
|
-
else:
|
|
1421
|
-
raise
|
|
1422
|
-
|
|
1423
|
-
def upload(self, localFilePath, calculateChecksum=True):
|
|
1424
|
-
file_size, file_time = fileSizeAndTime(localFilePath)
|
|
1425
|
-
if file_size <= self.maxInlinedSize():
|
|
1426
|
-
with open(localFilePath, "rb") as f:
|
|
1427
|
-
self.content = f.read()
|
|
1428
|
-
# Clear out any old checksum in case of overwrite
|
|
1429
|
-
self.checksum = ""
|
|
1430
|
-
else:
|
|
1431
|
-
headerArgs = self._s3EncryptionArgs()
|
|
1432
|
-
# Create a new Resource in case it needs to be on its own thread
|
|
1433
|
-
resource = boto3_session.resource("s3", region_name=self.outer.region)
|
|
1434
|
-
|
|
1435
|
-
self.checksum = (
|
|
1436
|
-
self._get_file_checksum(localFilePath)
|
|
1437
|
-
if calculateChecksum
|
|
1438
|
-
else None
|
|
1439
|
-
)
|
|
1440
|
-
self.version = uploadFromPath(
|
|
1441
|
-
localFilePath,
|
|
1442
|
-
resource=resource,
|
|
1443
|
-
bucketName=self.outer.files_bucket.name,
|
|
1444
|
-
fileID=compat_bytes(self.fileID),
|
|
1445
|
-
headerArgs=headerArgs,
|
|
1446
|
-
partSize=self.outer.part_size,
|
|
1447
|
-
)
|
|
1448
|
-
|
|
1449
|
-
def _start_checksum(self, to_match=None, algorithm="sha1"):
|
|
1450
|
-
"""
|
|
1451
|
-
Get a hasher that can be used with _update_checksum and
|
|
1452
|
-
_finish_checksum.
|
|
1453
|
-
|
|
1454
|
-
If to_match is set, it is a precomputed checksum which we expect
|
|
1455
|
-
the result to match.
|
|
1456
|
-
|
|
1457
|
-
The right way to compare checksums is to feed in the checksum to be
|
|
1458
|
-
matched, so we can see its algorithm, instead of getting a new one
|
|
1459
|
-
and comparing. If a checksum to match is fed in, _finish_checksum()
|
|
1460
|
-
will raise a ChecksumError if it isn't matched.
|
|
1461
|
-
"""
|
|
1462
|
-
|
|
1463
|
-
# If we have an expected result it will go here.
|
|
1464
|
-
expected = None
|
|
1465
|
-
|
|
1466
|
-
if to_match is not None:
|
|
1467
|
-
parts = to_match.split("$")
|
|
1468
|
-
algorithm = parts[0]
|
|
1469
|
-
expected = parts[1]
|
|
1470
|
-
|
|
1471
|
-
wrapped = getattr(hashlib, algorithm)()
|
|
1472
|
-
logger.debug(f"Starting {algorithm} checksum to match {expected}")
|
|
1473
|
-
return algorithm, wrapped, expected
|
|
1474
|
-
|
|
1475
|
-
def _update_checksum(self, checksum_in_progress, data):
|
|
1476
|
-
"""
|
|
1477
|
-
Update a checksum in progress from _start_checksum with new data.
|
|
1478
|
-
"""
|
|
1479
|
-
checksum_in_progress[1].update(data)
|
|
1480
|
-
|
|
1481
|
-
def _finish_checksum(self, checksum_in_progress):
|
|
1482
|
-
"""
|
|
1483
|
-
Complete a checksum in progress from _start_checksum and return the
|
|
1484
|
-
checksum result string.
|
|
1485
|
-
"""
|
|
1486
|
-
|
|
1487
|
-
result_hash = checksum_in_progress[1].hexdigest()
|
|
1488
|
-
|
|
1489
|
-
logger.debug(
|
|
1490
|
-
f"Completed checksum with hash {result_hash} vs. expected {checksum_in_progress[2]}"
|
|
1491
|
-
)
|
|
1492
|
-
if checksum_in_progress[2] is not None:
|
|
1493
|
-
# We expected a particular hash
|
|
1494
|
-
if result_hash != checksum_in_progress[2]:
|
|
1495
|
-
raise ChecksumError(
|
|
1496
|
-
"Checksum mismatch. Expected: %s Actual: %s"
|
|
1497
|
-
% (checksum_in_progress[2], result_hash)
|
|
1498
|
-
)
|
|
1499
|
-
|
|
1500
|
-
return "$".join([checksum_in_progress[0], result_hash])
|
|
1501
|
-
|
|
1502
|
-
def _get_file_checksum(self, localFilePath, to_match=None):
|
|
1503
|
-
with open(localFilePath, "rb") as f:
|
|
1504
|
-
hasher = self._start_checksum(to_match=to_match)
|
|
1505
|
-
contents = f.read(1024 * 1024)
|
|
1506
|
-
while contents != b"":
|
|
1507
|
-
self._update_checksum(hasher, contents)
|
|
1508
|
-
contents = f.read(1024 * 1024)
|
|
1509
|
-
return self._finish_checksum(hasher)
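The <algorithm>$<lowercase hex hash> format produced here can be reproduced with hashlib alone; a small standalone sketch:

import hashlib

def file_checksum(path: str, algorithm: str = "sha1") -> str:
    # Returns a string formatted as "<algorithm>$<lowercase hex hash>".
    hasher = hashlib.new(algorithm)
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(1024 * 1024), b""):
            hasher.update(block)
    return f"{algorithm}${hasher.hexdigest()}"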
|
|
1510
|
-
|
|
1511
|
-
@contextmanager
|
|
1512
|
-
def uploadStream(
|
|
1513
|
-
self, multipart=True, allowInlining=True, encoding=None, errors=None
|
|
1514
|
-
):
|
|
1515
|
-
"""
|
|
1516
|
-
Context manager that gives out a binary or text mode upload stream to upload data.
|
|
1517
|
-
"""
|
|
1518
|
-
|
|
1519
|
-
# Note that we have to handle already having a content or a version
|
|
1520
|
-
# if we are overwriting something.
|
|
1521
|
-
|
|
1522
|
-
# But make sure we don't have both.
|
|
1523
|
-
assert not (bool(self.version) and self.content is not None)
|
|
1524
|
-
|
|
1525
|
-
info = self
|
|
1526
|
-
store = self.outer
|
|
1527
|
-
|
|
1528
|
-
class MultiPartPipe(WritablePipe):
|
|
1529
|
-
def readFrom(self, readable):
|
|
1530
|
-
# Get the first block of data we want to put
|
|
1531
|
-
buf = readable.read(store.part_size)
|
|
1532
|
-
assert isinstance(buf, bytes)
|
|
1533
|
-
|
|
1534
|
-
if allowInlining and len(buf) <= info.maxInlinedSize():
|
|
1535
|
-
logger.debug("Inlining content of %d bytes", len(buf))
|
|
1536
|
-
info.content = buf
|
|
1537
|
-
# There will be no checksum
|
|
1538
|
-
info.checksum = ""
|
|
1539
|
-
else:
|
|
1540
|
-
# We will compute a checksum
|
|
1541
|
-
hasher = info._start_checksum()
|
|
1542
|
-
logger.debug("Updating checksum with %d bytes", len(buf))
|
|
1543
|
-
info._update_checksum(hasher, buf)
|
|
1544
|
-
|
|
1545
|
-
client = store.s3_client
|
|
1546
|
-
bucket_name = store.files_bucket.name
|
|
1547
|
-
headerArgs = info._s3EncryptionArgs()
|
|
1548
|
-
|
|
1549
|
-
for attempt in retry_s3():
|
|
1550
|
-
with attempt:
|
|
1551
|
-
logger.debug("Starting multipart upload")
|
|
1552
|
-
# low-level clients are thread safe
|
|
1553
|
-
upload = client.create_multipart_upload(
|
|
1554
|
-
Bucket=bucket_name,
|
|
1555
|
-
Key=compat_bytes(info.fileID),
|
|
1556
|
-
**headerArgs,
|
|
1557
|
-
)
|
|
1558
|
-
uploadId = upload["UploadId"]
|
|
1559
|
-
parts = []
|
|
1560
|
-
logger.debug("Multipart upload started as %s", uploadId)
|
|
1561
|
-
|
|
1562
|
-
for attempt in retry_s3():
|
|
1563
|
-
with attempt:
|
|
1564
|
-
for i in range(CONSISTENCY_TICKS):
|
|
1565
|
-
# Sometimes we can create a multipart upload and not see it. Wait around for it.
|
|
1566
|
-
response = client.list_multipart_uploads(
|
|
1567
|
-
Bucket=bucket_name,
|
|
1568
|
-
MaxUploads=1,
|
|
1569
|
-
Prefix=compat_bytes(info.fileID),
|
|
1570
|
-
)
|
|
1571
|
-
if (
|
|
1572
|
-
"Uploads" in response
|
|
1573
|
-
and len(response["Uploads"]) != 0
|
|
1574
|
-
and response["Uploads"][0]["UploadId"]
|
|
1575
|
-
== uploadId
|
|
1576
|
-
):
|
|
1577
|
-
|
|
1578
|
-
logger.debug(
|
|
1579
|
-
"Multipart upload visible as %s", uploadId
|
|
1580
|
-
)
|
|
1581
|
-
break
|
|
1582
|
-
else:
|
|
1583
|
-
logger.debug(
|
|
1584
|
-
"Multipart upload %s is not visible; we see %s",
|
|
1585
|
-
uploadId,
|
|
1586
|
-
response.get("Uploads"),
|
|
1587
|
-
)
|
|
1588
|
-
time.sleep(CONSISTENCY_TIME * 2**i)
|
|
1589
|
-
|
|
1590
|
-
try:
|
|
1591
|
-
for part_num in itertools.count():
|
|
1592
|
-
for attempt in retry_s3():
|
|
1593
|
-
with attempt:
|
|
1594
|
-
logger.debug(
|
|
1595
|
-
"Uploading part %d of %d bytes to %s",
|
|
1596
|
-
part_num + 1,
|
|
1597
|
-
len(buf),
|
|
1598
|
-
uploadId,
|
|
1599
|
-
)
|
|
1600
|
-
# TODO: include the Content-MD5 header:
|
|
1601
|
-
# https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/s3.html#S3.Client.complete_multipart_upload
|
|
1602
|
-
part = client.upload_part(
|
|
1603
|
-
Bucket=bucket_name,
|
|
1604
|
-
Key=compat_bytes(info.fileID),
|
|
1605
|
-
PartNumber=part_num + 1,
|
|
1606
|
-
UploadId=uploadId,
|
|
1607
|
-
Body=BytesIO(buf),
|
|
1608
|
-
**headerArgs,
|
|
1609
|
-
)
|
|
1610
|
-
|
|
1611
|
-
parts.append(
|
|
1612
|
-
{
|
|
1613
|
-
"PartNumber": part_num + 1,
|
|
1614
|
-
"ETag": part["ETag"],
|
|
1615
|
-
}
|
|
1616
|
-
)
|
|
1617
|
-
|
|
1618
|
-
# Get the next block of data we want to put
|
|
1619
|
-
buf = readable.read(info.outer.part_size)
|
|
1620
|
-
assert isinstance(buf, bytes)
|
|
1621
|
-
if len(buf) == 0:
|
|
1622
|
-
# Don't allow any part other than the very first to be empty.
|
|
1623
|
-
break
|
|
1624
|
-
info._update_checksum(hasher, buf)
|
|
1625
|
-
except:
|
|
1626
|
-
with panic(log=logger):
|
|
1627
|
-
for attempt in retry_s3():
|
|
1628
|
-
with attempt:
|
|
1629
|
-
client.abort_multipart_upload(
|
|
1630
|
-
Bucket=bucket_name,
|
|
1631
|
-
Key=compat_bytes(info.fileID),
|
|
1632
|
-
UploadId=uploadId,
|
|
1633
|
-
)
|
|
1634
|
-
|
|
1635
|
-
else:
|
|
1636
|
-
|
|
1637
|
-
while not store._getBucketVersioning(
|
|
1638
|
-
store.files_bucket.name
|
|
1639
|
-
):
|
|
1640
|
-
logger.warning(
|
|
1641
|
-
"Versioning does not appear to be enabled yet. Deferring multipart "
|
|
1642
|
-
"upload completion..."
|
|
1643
|
-
)
|
|
1644
|
-
time.sleep(1)
|
|
1645
|
-
|
|
1646
|
-
# Save the checksum
|
|
1647
|
-
info.checksum = info._finish_checksum(hasher)
|
|
1648
|
-
|
|
1649
|
-
for attempt in retry_s3(timeout=600):
|
|
1650
|
-
# Wait here for a bit longer if S3 breaks,
|
|
1651
|
-
# because we have been known to flake out here
|
|
1652
|
-
# in tests
|
|
1653
|
-
# (https://github.com/DataBiosphere/toil/issues/3894)
|
|
1654
|
-
with attempt:
|
|
1655
|
-
logger.debug("Attempting to complete upload...")
|
|
1656
|
-
completed = client.complete_multipart_upload(
|
|
1657
|
-
Bucket=bucket_name,
|
|
1658
|
-
Key=compat_bytes(info.fileID),
|
|
1659
|
-
UploadId=uploadId,
|
|
1660
|
-
MultipartUpload={"Parts": parts},
|
|
1661
|
-
)
|
|
1662
|
-
|
|
1663
|
-
logger.debug(
|
|
1664
|
-
"Completed upload object of type %s: %s",
|
|
1665
|
-
str(type(completed)),
|
|
1666
|
-
repr(completed),
|
|
1667
|
-
)
|
|
1668
|
-
info.version = completed.get("VersionId")
|
|
1669
|
-
logger.debug(
|
|
1670
|
-
"Completed upload with version %s",
|
|
1671
|
-
str(info.version),
|
|
1672
|
-
)
|
|
1673
|
-
|
|
1674
|
-
if info.version is None:
|
|
1675
|
-
# Somehow we don't know the version. Try and get it.
|
|
1676
|
-
for attempt in retry_s3(
|
|
1677
|
-
predicate=lambda e: retryable_s3_errors(e)
|
|
1678
|
-
or isinstance(e, AssertionError)
|
|
1679
|
-
):
|
|
1680
|
-
with attempt:
|
|
1681
|
-
version = client.head_object(
|
|
1682
|
-
Bucket=bucket_name,
|
|
1683
|
-
Key=compat_bytes(info.fileID),
|
|
1684
|
-
**headerArgs,
|
|
1685
|
-
).get("VersionId", None)
|
|
1686
|
-
logger.warning(
|
|
1687
|
-
"Loaded key for upload with no version and got version %s",
|
|
1688
|
-
str(version),
|
|
1689
|
-
)
|
|
1690
|
-
info.version = version
|
|
1691
|
-
assert info.version is not None
|
|
1692
|
-
|
|
1693
|
-
# Make sure we actually wrote something, even if an empty file
|
|
1694
|
-
assert bool(info.version) or info.content is not None
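Stripped of the retry, versioning, and pipe machinery, the multipart branch above is the standard boto3 create/upload_part/complete sequence. A simplified, hypothetical sketch (5 MB parts, abort on failure; not a drop-in replacement for the pipe classes):

import boto3

def multipart_upload(bucket: str, key: str, readable, part_size: int = 5 * 1024 * 1024) -> None:
    client = boto3.client("s3")
    upload_id = client.create_multipart_upload(Bucket=bucket, Key=key)["UploadId"]
    parts = []
    try:
        part_number = 1
        while True:
            buf = readable.read(part_size)
            if not buf and part_number > 1:
                break  # only the very first part may be empty
            response = client.upload_part(Bucket=bucket, Key=key, PartNumber=part_number,
                                          UploadId=upload_id, Body=buf)
            parts.append({"PartNumber": part_number, "ETag": response["ETag"]})
            part_number += 1
            if not buf:
                break
    except BaseException:
        client.abort_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id)
        raise
    client.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=upload_id,
                                     MultipartUpload={"Parts": parts})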
|
|
1695
|
-
|
|
1696
|
-
class SinglePartPipe(WritablePipe):
|
|
1697
|
-
def readFrom(self, readable):
|
|
1698
|
-
buf = readable.read()
|
|
1699
|
-
assert isinstance(buf, bytes)
|
|
1700
|
-
dataLength = len(buf)
|
|
1701
|
-
if allowInlining and dataLength <= info.maxInlinedSize():
|
|
1702
|
-
logger.debug("Inlining content of %d bytes", len(buf))
|
|
1703
|
-
info.content = buf
|
|
1704
|
-
# There will be no checksum
|
|
1705
|
-
info.checksum = ""
|
|
1706
|
-
else:
|
|
1707
|
-
# We will compute a checksum
|
|
1708
|
-
hasher = info._start_checksum()
|
|
1709
|
-
info._update_checksum(hasher, buf)
|
|
1710
|
-
info.checksum = info._finish_checksum(hasher)
|
|
1711
|
-
|
|
1712
|
-
bucket_name = store.files_bucket.name
|
|
1713
|
-
headerArgs = info._s3EncryptionArgs()
|
|
1714
|
-
client = store.s3_client
|
|
1715
|
-
|
|
1716
|
-
buf = BytesIO(buf)
|
|
1717
|
-
|
|
1718
|
-
while not store._getBucketVersioning(bucket_name):
|
|
1719
|
-
logger.warning(
|
|
1720
|
-
"Versioning does not appear to be enabled yet. Deferring single part "
|
|
1721
|
-
"upload..."
|
|
1722
|
-
)
|
|
1723
|
-
time.sleep(1)
|
|
1724
|
-
|
|
1725
|
-
for attempt in retry_s3():
|
|
1726
|
-
with attempt:
|
|
1727
|
-
logger.debug(
|
|
1728
|
-
"Uploading single part of %d bytes", dataLength
|
|
1729
|
-
)
|
|
1730
|
-
client.upload_fileobj(
|
|
1731
|
-
Bucket=bucket_name,
|
|
1732
|
-
Key=compat_bytes(info.fileID),
|
|
1733
|
-
Fileobj=buf,
|
|
1734
|
-
ExtraArgs=headerArgs,
|
|
1735
|
-
)
|
|
1736
|
-
|
|
1737
|
-
# use head_object with the SSE headers to access versionId and content_length attributes
|
|
1738
|
-
headObj = client.head_object(
|
|
1739
|
-
Bucket=bucket_name,
|
|
1740
|
-
Key=compat_bytes(info.fileID),
|
|
1741
|
-
**headerArgs,
|
|
1742
|
-
)
|
|
1743
|
-
assert dataLength == headObj.get("ContentLength", None)
|
|
1744
|
-
info.version = headObj.get("VersionId", None)
|
|
1745
|
-
logger.debug(
|
|
1746
|
-
"Upload received version %s", str(info.version)
|
|
1747
|
-
)
|
|
1748
|
-
|
|
1749
|
-
if info.version is None:
|
|
1750
|
-
# Somehow we don't know the version
|
|
1751
|
-
for attempt in retry_s3(
|
|
1752
|
-
predicate=lambda e: retryable_s3_errors(e)
|
|
1753
|
-
or isinstance(e, AssertionError)
|
|
1754
|
-
):
|
|
1755
|
-
with attempt:
|
|
1756
|
-
headObj = client.head_object(
|
|
1757
|
-
Bucket=bucket_name,
|
|
1758
|
-
Key=compat_bytes(info.fileID),
|
|
1759
|
-
**headerArgs,
|
|
1760
|
-
)
|
|
1761
|
-
info.version = headObj.get("VersionId", None)
|
|
1762
|
-
logger.warning(
|
|
1763
|
-
"Reloaded key with no version and got version %s",
|
|
1764
|
-
str(info.version),
|
|
1765
|
-
)
|
|
1766
|
-
assert info.version is not None
|
|
1767
|
-
|
|
1768
|
-
# Make sure we actually wrote something, even if an empty file
|
|
1769
|
-
assert bool(info.version) or info.content is not None
|
|
1770
|
-
|
|
1771
|
-
if multipart:
|
|
1772
|
-
pipe = MultiPartPipe(encoding=encoding, errors=errors)
|
|
1773
|
-
else:
|
|
1774
|
-
pipe = SinglePartPipe(encoding=encoding, errors=errors)
|
|
968
|
+
# TODO: Maybe memoize the file read, subject to config field changes?
|
|
1775
969
|
|
|
1776
|
-
|
|
1777
|
-
|
|
1778
|
-
|
|
1779
|
-
|
|
1780
|
-
|
|
1781
|
-
|
|
1782
|
-
|
|
970
|
+
try:
|
|
971
|
+
config = self.config
|
|
972
|
+
except AttributeError:
|
|
973
|
+
# The config isn't set yet. This happens during resume(), when we
|
|
974
|
+
# need to get the encryption args to talk to the job store to
|
|
975
|
+
# download the config, before we have it.
|
|
976
|
+
return {}
|
|
977
|
+
|
|
978
|
+
if config is not None and config.sseKey:
|
|
979
|
+
with open(config.sseKey, 'r') as f:
|
|
980
|
+
sse_key = f.read()
|
|
981
|
+
if not len(sse_key) == 32: # TODO: regex
|
|
982
|
+
raise ValueError(
|
|
983
|
+
f'Check that {self.config.sseKey} '
|
|
984
|
+
f'is the path to a real SSE key. '
|
|
985
|
+
f'(Key length {len(sse_key)} != 32)'
|
|
1783
986
|
)
|
|
987
|
+
return {'SSECustomerAlgorithm': 'AES256', 'SSECustomerKey': sse_key}
|
|
988
|
+
else:
|
|
989
|
+
return {}
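For reference, the SSE-C dictionary returned above is intended to be passed to boto3 calls (as ExtraArgs for managed transfers, or splatted into client methods). A hedged usage sketch with a hypothetical 32-character key already read into sse_key:

import boto3

def download_with_sse_c(bucket: str, key: str, sse_key: str, dest_path: str) -> None:
    # boto3 base64-encodes the customer key and adds the MD5 header for us.
    extra = {"SSECustomerAlgorithm": "AES256", "SSECustomerKey": sse_key}
    boto3.client("s3").download_file(Bucket=bucket, Key=key, Filename=dest_path, ExtraArgs=extra)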
|
|
1784
990
|
|
|
1785
|
-
|
|
1786
|
-
|
|
1787
|
-
|
|
1788
|
-
if self.content is None:
|
|
1789
|
-
if not bool(self.version):
|
|
1790
|
-
logger.debug(f"Version: {self.version} Content: {self.content}")
|
|
1791
|
-
raise RuntimeError("No content added and no version created")
|
|
1792
|
-
else:
|
|
1793
|
-
if bool(self.version):
|
|
1794
|
-
logger.debug(f"Version: {self.version} Content: {self.content}")
|
|
1795
|
-
raise RuntimeError("Content added and version created")
|
|
1796
|
-
|
|
1797
|
-
def copyFrom(self, srcObj):
|
|
1798
|
-
"""
|
|
1799
|
-
Copies contents of source key into this file.
|
|
1800
|
-
|
|
1801
|
-
:param S3.Object srcObj: The key (object) that will be copied from
|
|
1802
|
-
"""
|
|
1803
|
-
assert srcObj.content_length is not None
|
|
1804
|
-
if srcObj.content_length <= self.maxInlinedSize():
|
|
1805
|
-
self.content = srcObj.get().get("Body").read()
|
|
1806
|
-
else:
|
|
1807
|
-
# Create a new Resource in case it needs to be on its own thread
|
|
1808
|
-
resource = boto3_session.resource("s3", region_name=self.outer.region)
|
|
1809
|
-
self.version = copyKeyMultipart(
|
|
1810
|
-
resource,
|
|
1811
|
-
srcBucketName=compat_bytes(srcObj.bucket_name),
|
|
1812
|
-
srcKeyName=compat_bytes(srcObj.key),
|
|
1813
|
-
srcKeyVersion=compat_bytes(srcObj.version_id),
|
|
1814
|
-
dstBucketName=compat_bytes(self.outer.files_bucket.name),
|
|
1815
|
-
dstKeyName=compat_bytes(self._fileID),
|
|
1816
|
-
sseAlgorithm="AES256",
|
|
1817
|
-
sseKey=self._getSSEKey(),
|
|
1818
|
-
)
|
|
991
|
+
def parse_jobstore_identifier(jobstore_identifier: str) -> Tuple[str, str]:
|
|
992
|
+
region, jobstore_name = jobstore_identifier.split(':')
|
|
993
|
+
bucket_name = f'{jobstore_name}--toil'
|
|
1819
994
|
|
|
1820
|
-
|
|
1821
|
-
|
|
1822
|
-
|
|
1823
|
-
|
|
1824
|
-
:param S3.Object dstObj: The key (object) to copy this file's content to
|
|
1825
|
-
"""
|
|
1826
|
-
if self.content is not None:
|
|
1827
|
-
for attempt in retry_s3():
|
|
1828
|
-
with attempt:
|
|
1829
|
-
dstObj.put(Body=self.content)
|
|
1830
|
-
elif self.version:
|
|
1831
|
-
# Create a new Resource in case it needs to be on its own thread
|
|
1832
|
-
resource = boto3_session.resource("s3", region_name=self.outer.region)
|
|
1833
|
-
|
|
1834
|
-
for attempt in retry_s3():
|
|
1835
|
-
# encrypted = True if self.outer.sseKeyPath else False
|
|
1836
|
-
with attempt:
|
|
1837
|
-
copyKeyMultipart(
|
|
1838
|
-
resource,
|
|
1839
|
-
srcBucketName=compat_bytes(self.outer.files_bucket.name),
|
|
1840
|
-
srcKeyName=compat_bytes(self.fileID),
|
|
1841
|
-
srcKeyVersion=compat_bytes(self.version),
|
|
1842
|
-
dstBucketName=compat_bytes(dstObj.bucket_name),
|
|
1843
|
-
dstKeyName=compat_bytes(dstObj.key),
|
|
1844
|
-
copySourceSseAlgorithm="AES256",
|
|
1845
|
-
copySourceSseKey=self._getSSEKey(),
|
|
1846
|
-
)
|
|
1847
|
-
else:
|
|
1848
|
-
assert False
|
|
1849
|
-
|
|
1850
|
-
def download(self, localFilePath, verifyChecksum=True):
|
|
1851
|
-
if self.content is not None:
|
|
1852
|
-
with AtomicFileCreate(localFilePath) as tmpPath:
|
|
1853
|
-
with open(tmpPath, "wb") as f:
|
|
1854
|
-
f.write(self.content)
|
|
1855
|
-
elif self.version:
|
|
1856
|
-
headerArgs = self._s3EncryptionArgs()
|
|
1857
|
-
obj = self.outer.files_bucket.Object(compat_bytes(self.fileID))
|
|
1858
|
-
|
|
1859
|
-
for attempt in retry_s3(
|
|
1860
|
-
predicate=lambda e: retryable_s3_errors(e)
|
|
1861
|
-
or isinstance(e, ChecksumError)
|
|
1862
|
-
):
|
|
1863
|
-
with attempt:
|
|
1864
|
-
with AtomicFileCreate(localFilePath) as tmpPath:
|
|
1865
|
-
obj.download_file(
|
|
1866
|
-
Filename=tmpPath,
|
|
1867
|
-
ExtraArgs={"VersionId": self.version, **headerArgs},
|
|
1868
|
-
)
|
|
1869
|
-
|
|
1870
|
-
if verifyChecksum and self.checksum:
|
|
1871
|
-
try:
|
|
1872
|
-
# This automatically compares the result and matches the algorithm.
|
|
1873
|
-
self._get_file_checksum(localFilePath, self.checksum)
|
|
1874
|
-
except ChecksumError as e:
|
|
1875
|
-
# Annotate checksum mismatches with file name
|
|
1876
|
-
raise ChecksumError(
|
|
1877
|
-
"Checksums do not match for file %s."
|
|
1878
|
-
% localFilePath
|
|
1879
|
-
) from e
|
|
1880
|
-
# The error will get caught and result in a retry of the download until we run out of retries.
|
|
1881
|
-
# TODO: handle obviously truncated downloads by resuming instead.
|
|
1882
|
-
else:
|
|
1883
|
-
assert False
|
|
1884
|
-
|
|
1885
|
-
@contextmanager
|
|
1886
|
-
def downloadStream(self, verifyChecksum=True, encoding=None, errors=None):
|
|
1887
|
-
"""
|
|
1888
|
-
Context manager that gives out a download stream to download data.
|
|
1889
|
-
"""
|
|
1890
|
-
info = self
|
|
1891
|
-
|
|
1892
|
-
class DownloadPipe(ReadablePipe):
|
|
1893
|
-
def writeTo(self, writable):
|
|
1894
|
-
if info.content is not None:
|
|
1895
|
-
writable.write(info.content)
|
|
1896
|
-
elif info.version:
|
|
1897
|
-
headerArgs = info._s3EncryptionArgs()
|
|
1898
|
-
obj = info.outer.files_bucket.Object(compat_bytes(info.fileID))
|
|
1899
|
-
for attempt in retry_s3():
|
|
1900
|
-
with attempt:
|
|
1901
|
-
obj.download_fileobj(
|
|
1902
|
-
writable,
|
|
1903
|
-
ExtraArgs={"VersionId": info.version, **headerArgs},
|
|
1904
|
-
)
|
|
1905
|
-
else:
|
|
1906
|
-
assert False
|
|
1907
|
-
|
|
1908
|
-
class HashingPipe(ReadableTransformingPipe):
|
|
1909
|
-
"""
|
|
1910
|
-
Class which checksums all the data read through it. If it
|
|
1911
|
-
reaches EOF and the checksum isn't correct, raises
|
|
1912
|
-
ChecksumError.
|
|
1913
|
-
|
|
1914
|
-
Assumes info actually has a checksum.
|
|
1915
|
-
"""
|
|
1916
|
-
|
|
1917
|
-
def transform(self, readable, writable):
|
|
1918
|
-
hasher = info._start_checksum(to_match=info.checksum)
|
|
1919
|
-
contents = readable.read(1024 * 1024)
|
|
1920
|
-
while contents != b"":
|
|
1921
|
-
info._update_checksum(hasher, contents)
|
|
1922
|
-
try:
|
|
1923
|
-
writable.write(contents)
|
|
1924
|
-
except BrokenPipeError:
|
|
1925
|
-
# Read was stopped early by user code.
|
|
1926
|
-
# Can't check the checksum.
|
|
1927
|
-
return
|
|
1928
|
-
contents = readable.read(1024 * 1024)
|
|
1929
|
-
# We reached EOF in the input.
|
|
1930
|
-
# Finish checksumming and verify.
|
|
1931
|
-
info._finish_checksum(hasher)
|
|
1932
|
-
# Now stop so EOF happens in the output.
|
|
1933
|
-
|
|
1934
|
-
if verifyChecksum and self.checksum:
|
|
1935
|
-
with DownloadPipe() as readable:
|
|
1936
|
-
# Interpose a pipe to check the hash
|
|
1937
|
-
with HashingPipe(
|
|
1938
|
-
readable, encoding=encoding, errors=errors
|
|
1939
|
-
) as verified:
|
|
1940
|
-
yield verified
|
|
1941
|
-
else:
|
|
1942
|
-
# Readable end of pipe produces text mode output if encoding specified
|
|
1943
|
-
with DownloadPipe(encoding=encoding, errors=errors) as readable:
|
|
1944
|
-
# No true checksum available, so don't hash
|
|
1945
|
-
yield readable
|
|
1946
|
-
|
|
1947
|
-
def delete(self):
|
|
1948
|
-
store = self.outer
|
|
1949
|
-
if self.previousVersion is not None:
|
|
1950
|
-
expected: "UpdateConditionTypeDef" = {
|
|
1951
|
-
"Name": "version",
|
|
1952
|
-
"Value": cast(str, self.previousVersion),
|
|
1953
|
-
}
|
|
1954
|
-
for attempt in retry_sdb():
|
|
1955
|
-
with attempt:
|
|
1956
|
-
store.db.delete_attributes(
|
|
1957
|
-
DomainName=store.files_domain_name,
|
|
1958
|
-
ItemName=compat_bytes(self.fileID),
|
|
1959
|
-
Expected=expected,
|
|
1960
|
-
)
|
|
1961
|
-
if self.previousVersion:
|
|
1962
|
-
for attempt in retry_s3():
|
|
1963
|
-
with attempt:
|
|
1964
|
-
store.s3_client.delete_object(
|
|
1965
|
-
Bucket=store.files_bucket.name,
|
|
1966
|
-
Key=compat_bytes(self.fileID),
|
|
1967
|
-
VersionId=self.previousVersion,
|
|
1968
|
-
)
|
|
1969
|
-
|
|
1970
|
-
def getSize(self):
|
|
1971
|
-
"""
|
|
1972
|
-
Return the size of the referenced item in bytes.
|
|
1973
|
-
"""
|
|
1974
|
-
if self.content is not None:
|
|
1975
|
-
return len(self.content)
|
|
1976
|
-
elif self.version:
|
|
1977
|
-
for attempt in retry_s3():
|
|
1978
|
-
with attempt:
|
|
1979
|
-
obj = self.outer.files_bucket.Object(compat_bytes(self.fileID))
|
|
1980
|
-
return obj.content_length
|
|
1981
|
-
else:
|
|
1982
|
-
return 0
|
|
1983
|
-
|
|
1984
|
-
def _getSSEKey(self) -> Optional[bytes]:
|
|
1985
|
-
sseKeyPath = self.outer.sseKeyPath
|
|
1986
|
-
if sseKeyPath:
|
|
1987
|
-
with open(sseKeyPath, "rb") as f:
|
|
1988
|
-
sseKey = f.read()
|
|
1989
|
-
return sseKey
|
|
1990
|
-
|
|
1991
|
-
def _s3EncryptionArgs(self):
|
|
1992
|
-
# the keys of the returned dictionary are unpacked to the corresponding boto3 optional
|
|
1993
|
-
# parameters and will be used to set the http headers
|
|
1994
|
-
if self.encrypted:
|
|
1995
|
-
sseKey = self._getSSEKey()
|
|
1996
|
-
assert (
|
|
1997
|
-
sseKey is not None
|
|
1998
|
-
), "Content is encrypted but no key was provided."
|
|
1999
|
-
assert len(sseKey) == 32
|
|
2000
|
-
# boto3 encodes the key and calculates the MD5 for us
|
|
2001
|
-
return {"SSECustomerAlgorithm": "AES256", "SSECustomerKey": sseKey}
|
|
2002
|
-
else:
|
|
2003
|
-
return {}
|
|
2004
|
-
|
|
2005
|
-
def __repr__(self):
|
|
2006
|
-
r = custom_repr
|
|
2007
|
-
d = (
|
|
2008
|
-
("fileID", r(self.fileID)),
|
|
2009
|
-
("ownerID", r(self.ownerID)),
|
|
2010
|
-
("encrypted", r(self.encrypted)),
|
|
2011
|
-
("version", r(self.version)),
|
|
2012
|
-
("previousVersion", r(self.previousVersion)),
|
|
2013
|
-
("content", r(self.content)),
|
|
2014
|
-
("checksum", r(self.checksum)),
|
|
2015
|
-
("_numContentChunks", r(self._numContentChunks)),
|
|
2016
|
-
)
|
|
2017
|
-
return "{}({})".format(
|
|
2018
|
-
type(self).__name__, ", ".join(f"{k}={v}" for k, v in d)
|
|
2019
|
-
)
|
|
995
|
+
regions = EC2Regions.keys()
|
|
996
|
+
if region not in regions:
|
|
997
|
+
raise ValueError(f'AWS Region "{region}" is not one of: {regions}')
|
|
2020
998
|
|
|
2021
|
-
|
|
999
|
+
if not 3 <= len(jobstore_name) <= 56:
|
|
1000
|
+
raise ValueError(f'AWS jobstore name must be between 3 and 56 chars: '
|
|
1001
|
+
f'{jobstore_name} (len: {len(jobstore_name)})')
|
|
2022
1002
|
|
|
2023
|
-
|
|
2024
|
-
""
|
|
2025
|
-
|
|
2026
|
-
which we map to True, None and False respectively. Note that we've never seen a versioning
|
|
2027
|
-
status of 'Disabled', only the None return value. Calling BucketVersioning.suspend() will
|
|
2028
|
-
cause BucketVersioning.status to then return 'Suspended' even on a new bucket that never
|
|
2029
|
-
had versioning enabled.
|
|
1003
|
+
if not re.compile(r'^[a-z0-9][a-z0-9-]+[a-z0-9]$').match(jobstore_name):
|
|
1004
|
+
raise ValueError(f"Invalid AWS jobstore name: '{jobstore_name}'. Must contain only digits, "
|
|
1005
|
+
f"lower-case letters, and hyphens. Must also not start or end in a hyphen.")
|
|
2030
1006
|
|
|
2031
|
-
|
|
2032
|
-
""
|
|
2033
|
-
|
|
2034
|
-
with attempt:
|
|
2035
|
-
status = self.s3_resource.BucketVersioning(bucket_name).status
|
|
2036
|
-
return self.versionings.get(status) if status else False
|
|
2037
|
-
|
|
2038
|
-
# TODO: Make this retry more specific?
|
|
2039
|
-
# example: https://github.com/DataBiosphere/toil/issues/3378
|
|
2040
|
-
@retry()
|
|
2041
|
-
def destroy(self):
|
|
2042
|
-
# FIXME: Destruction of encrypted stores only works after initialize() or .resume()
|
|
2043
|
-
# See https://github.com/BD2KGenomics/toil/issues/1041
|
|
2044
|
-
try:
|
|
2045
|
-
self._bind(create=False, block=False, check_versioning_consistency=False)
|
|
2046
|
-
except BucketLocationConflictException:
|
|
2047
|
-
# If the unique jobstore bucket name existed, _bind would have raised a
|
|
2048
|
-
# BucketLocationConflictException before calling destroy. Calling _bind here again
|
|
2049
|
-
# would reraise the same exception so we need to catch and ignore that exception.
|
|
2050
|
-
pass
|
|
2051
|
-
# TODO: Add other failure cases to be ignored here.
|
|
2052
|
-
self._registered = None
|
|
2053
|
-
if self.files_bucket is not None:
|
|
2054
|
-
self._delete_bucket(self.files_bucket)
|
|
2055
|
-
self.files_bucket = None
|
|
2056
|
-
for name in "files_domain_name", "jobs_domain_name":
|
|
2057
|
-
domainName = getattr(self, name)
|
|
2058
|
-
if domainName is not None:
|
|
2059
|
-
self._delete_domain(domainName)
|
|
2060
|
-
setattr(self, name, None)
|
|
2061
|
-
self._registered = False
|
|
2062
|
-
|
|
2063
|
-
def _delete_domain(self, domainName):
|
|
2064
|
-
for attempt in retry_sdb():
|
|
2065
|
-
with attempt:
|
|
2066
|
-
try:
|
|
2067
|
-
self.db.delete_domain(DomainName=domainName)
|
|
2068
|
-
except ClientError as e:
|
|
2069
|
-
if not no_such_sdb_domain(e):
|
|
2070
|
-
raise
|
|
2071
|
-
|
|
2072
|
-
@staticmethod
|
|
2073
|
-
def _delete_bucket(bucket):
|
|
2074
|
-
"""
|
|
2075
|
-
:param bucket: S3.Bucket
|
|
2076
|
-
"""
|
|
2077
|
-
for attempt in retry_s3():
|
|
2078
|
-
with attempt:
|
|
2079
|
-
try:
|
|
2080
|
-
uploads = s3_boto3_client.list_multipart_uploads(
|
|
2081
|
-
Bucket=bucket.name
|
|
2082
|
-
).get("Uploads")
|
|
2083
|
-
if uploads:
|
|
2084
|
-
for u in uploads:
|
|
2085
|
-
s3_boto3_client.abort_multipart_upload(
|
|
2086
|
-
Bucket=bucket.name, Key=u["Key"], UploadId=u["UploadId"]
|
|
2087
|
-
)
|
|
2088
|
-
|
|
2089
|
-
bucket.objects.all().delete()
|
|
2090
|
-
bucket.object_versions.delete()
|
|
2091
|
-
bucket.delete()
|
|
2092
|
-
except s3_boto3_client.exceptions.NoSuchBucket:
|
|
2093
|
-
pass
|
|
2094
|
-
except ClientError as e:
|
|
2095
|
-
if get_error_status(e) != 404:
|
|
2096
|
-
raise
|
|
2097
|
-
|
|
2098
|
-
|
|
2099
|
-
aRepr = reprlib.Repr()
|
|
2100
|
-
aRepr.maxstring = 38 # so UUIDs don't get truncated (36 for UUID plus 2 for quotes)
|
|
2101
|
-
custom_repr = aRepr.repr
|
|
2102
|
-
|
|
2103
|
-
|
|
2104
|
-
class BucketLocationConflictException(LocatorException):
|
|
2105
|
-
def __init__(self, bucketRegion):
|
|
2106
|
-
super().__init__(
|
|
2107
|
-
"A bucket with the same name as the jobstore was found in another region (%s). "
|
|
2108
|
-
"Cannot proceed as the unique bucket name is already in use.",
|
|
2109
|
-
locator=bucketRegion,
|
|
2110
|
-
)
|
|
1007
|
+
if '--' in jobstore_name:
|
|
1008
|
+
raise ValueError(f"AWS jobstore names may not contain '--': {jobstore_name}")
|
|
1009
|
+
return region, bucket_name
|
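A usage note on the validation above (the locator value is made up, and this assumes the aws: scheme has already been stripped by the caller, and that the region appears in EC2Regions):

# Hypothetical example; requires parse_jobstore_identifier and EC2Regions from this module.
region, bucket_name = parse_jobstore_identifier("us-west-2:my-jobstore")
assert (region, bucket_name) == ("us-west-2", "my-jobstore--toil")

Names with uppercase letters, underscores, fewer than 3 characters, or a literal "--" raise ValueError.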