toil 8.0.0__py3-none-any.whl → 8.1.0b1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. toil/__init__.py +4 -4
  2. toil/batchSystems/options.py +1 -0
  3. toil/batchSystems/slurm.py +227 -83
  4. toil/common.py +161 -45
  5. toil/cwl/cwltoil.py +31 -10
  6. toil/job.py +47 -38
  7. toil/jobStores/aws/jobStore.py +46 -10
  8. toil/lib/aws/session.py +14 -3
  9. toil/lib/aws/utils.py +92 -35
  10. toil/lib/dockstore.py +379 -0
  11. toil/lib/ec2nodes.py +3 -2
  12. toil/lib/history.py +1271 -0
  13. toil/lib/history_submission.py +681 -0
  14. toil/lib/io.py +22 -1
  15. toil/lib/misc.py +18 -0
  16. toil/lib/retry.py +10 -10
  17. toil/lib/{integration.py → trs.py} +95 -46
  18. toil/lib/web.py +38 -0
  19. toil/options/common.py +17 -2
  20. toil/options/cwl.py +10 -0
  21. toil/provisioners/gceProvisioner.py +4 -4
  22. toil/server/cli/wes_cwl_runner.py +3 -3
  23. toil/server/utils.py +2 -3
  24. toil/statsAndLogging.py +35 -1
  25. toil/test/batchSystems/test_slurm.py +172 -2
  26. toil/test/cwl/conftest.py +39 -0
  27. toil/test/cwl/cwlTest.py +105 -2
  28. toil/test/cwl/optional-file.cwl +18 -0
  29. toil/test/lib/test_history.py +212 -0
  30. toil/test/lib/test_trs.py +161 -0
  31. toil/test/wdl/wdltoil_test.py +1 -1
  32. toil/version.py +10 -10
  33. toil/wdl/wdltoil.py +23 -9
  34. toil/worker.py +113 -33
  35. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/METADATA +9 -4
  36. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/RECORD +40 -34
  37. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/WHEEL +1 -1
  38. toil/test/lib/test_integration.py +0 -104
  39. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/LICENSE +0 -0
  40. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/entry_points.txt +0 -0
  41. {toil-8.0.0.dist-info → toil-8.1.0b1.dist-info}/top_level.txt +0 -0
toil/jobStores/aws/jobStore.py CHANGED
@@ -637,30 +637,61 @@ class AWSJobStore(AbstractJobStore):
         else:
             super()._default_export_file(otherCls, file_id, uri)
 
+    ###
+    # URL access implementation
+    ###
+
+    # URL access methods aren't used by the rest of the job store methods.
+
     @classmethod
     def _url_exists(cls, url: ParseResult) -> bool:
         try:
-            get_object_for_url(url, existing=True)
+            try:
+                get_object_for_url(url, existing=True, anonymous=True)
+            except PermissionError:
+                # If we can't look anonymously, log in
+                get_object_for_url(url, existing=True)
             return True
         except FileNotFoundError:
             # Not a file
-            # Might be a directory.
+            # Might be a directory. Or we might not have access to know.
+            # See if it's a directory.
             return cls._get_is_directory(url)
 
     @classmethod
     def _get_size(cls, url: ParseResult) -> int:
-        return get_object_for_url(url, existing=True).content_length
+        try:
+            src_obj = get_object_for_url(url, existing=True, anonymous=True)
+        except PermissionError:
+            src_obj = get_object_for_url(url, existing=True)
+        return src_obj.content_length
 
     @classmethod
     def _read_from_url(cls, url: ParseResult, writable):
-        srcObj = get_object_for_url(url, existing=True)
-        srcObj.download_fileobj(writable)
-        return (srcObj.content_length, False)  # executable bit is always False
+        try:
+            src_obj = get_object_for_url(url, existing=True, anonymous=True)
+            src_obj.download_fileobj(writable)
+        except Exception as e:
+            if isinstance(e, PermissionError) or (isinstance(e, ClientError) and get_error_status(e) == 403):
+                # The object setup or the download does not have permission. Try again with a login.
+                src_obj = get_object_for_url(url, existing=True)
+                src_obj.download_fileobj(writable)
+            else:
+                raise
+        return (src_obj.content_length, False)  # executable bit is always False
 
     @classmethod
     def _open_url(cls, url: ParseResult) -> IO[bytes]:
-        src_obj = get_object_for_url(url, existing=True)
-        response = src_obj.get()
+        try:
+            src_obj = get_object_for_url(url, existing=True, anonymous=True)
+            response = src_obj.get()
+        except Exception as e:
+            if isinstance(e, PermissionError) or (isinstance(e, ClientError) and get_error_status(e) == 403):
+                # The object setup or the download does not have permission. Try again with a login.
+                src_obj = get_object_for_url(url, existing=True)
+                response = src_obj.get()
+            else:
+                raise
         # We should get back a response with a stream in 'Body'
         if "Body" not in response:
             raise RuntimeError(f"Could not fetch body stream for {url}")
@@ -670,6 +701,7 @@ class AWSJobStore(AbstractJobStore):
     def _write_to_url(
         cls, readable, url: ParseResult, executable: bool = False
     ) -> None:
+        # Don't try to do anonymous writes.
        dstObj = get_object_for_url(url)
 
        logger.debug("Uploading %s", dstObj.key)
@@ -684,13 +716,17 @@ class AWSJobStore(AbstractJobStore):
 
     @classmethod
     def _list_url(cls, url: ParseResult) -> list[str]:
-        return list_objects_for_url(url)
+        try:
+            return list_objects_for_url(url, anonymous=True)
+        except PermissionError:
+            return list_objects_for_url(url)
+
 
     @classmethod
     def _get_is_directory(cls, url: ParseResult) -> bool:
         # We consider it a directory if anything is in it.
         # TODO: Can we just get the first item and not the whole list?
-        return len(list_objects_for_url(url)) > 0
+        return len(cls._list_url(url)) > 0
 
     @classmethod
     def _supports_url(cls, url: ParseResult, export: bool = False) -> bool:
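
The pattern above — try unsigned (anonymous) access first and fall back to signed access only on a permission failure — is easiest to see outside diff form. A minimal sketch, assuming toil 8.1.0b1 where get_object_for_url accepts an anonymous flag and get_error_status is importable from toil.lib.aws.utils; the helper name read_s3_url is hypothetical:

    from urllib.parse import urlparse

    from botocore.exceptions import ClientError

    from toil.lib.aws.utils import get_error_status, get_object_for_url


    def read_s3_url(url: str, writable) -> int:
        """Download an s3:// URL, preferring unsigned access, as _read_from_url does above."""
        parsed = urlparse(url)
        try:
            # First attempt: no credentials at all (works for public buckets).
            src_obj = get_object_for_url(parsed, existing=True, anonymous=True)
            src_obj.download_fileobj(writable)
        except Exception as e:
            if isinstance(e, PermissionError) or (
                isinstance(e, ClientError) and get_error_status(e) == 403
            ):
                # Either looking up the object or downloading it was refused;
                # retry with whatever credentials boto3 can find.
                src_obj = get_object_for_url(parsed, existing=True)
                src_obj.download_fileobj(writable)
            else:
                raise
        return src_obj.content_length
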
toil/lib/aws/session.py CHANGED
@@ -35,6 +35,9 @@ if TYPE_CHECKING:
 
 logger = logging.getLogger(__name__)
 
+# You can pass config=ANONYMOUS_CONFIG to make anonymous S3 accesses
+ANONYMOUS_CONFIG = Config(signature_version=botocore.UNSIGNED)
+
 # A note on thread safety:
 #
 # Boto3 Session: Not thread safe, 1 per thread is required.
@@ -148,6 +151,7 @@ class AWSConnectionManager:
         region: Optional[str],
         service_name: Literal["s3"],
         endpoint_url: Optional[str] = None,
+        config: Optional[Config] = None,
     ) -> "S3ServiceResource": ...
     @overload
     def resource(
@@ -155,6 +159,7 @@
         region: Optional[str],
         service_name: Literal["iam"],
         endpoint_url: Optional[str] = None,
+        config: Optional[Config] = None,
     ) -> "IAMServiceResource": ...
     @overload
     def resource(
@@ -162,6 +167,7 @@
         region: Optional[str],
         service_name: Literal["ec2"],
         endpoint_url: Optional[str] = None,
+        config: Optional[Config] = None,
     ) -> "EC2ServiceResource": ...
 
     def resource(
@@ -169,6 +175,7 @@
         region: Optional[str],
         service_name: str,
         endpoint_url: Optional[str] = None,
+        config: Optional[Config] = None,
    ) -> boto3.resources.base.ServiceResource:
        """
        Get the Boto3 Resource to use with the given service (like 'ec2') in the given region.
@@ -188,10 +195,10 @@ class AWSConnectionManager:
                # The Boto3 stubs are missing an overload for `resource` that takes
                # a non-literal string. See
                # <https://github.com/vemel/mypy_boto3_builder/issues/121#issuecomment-1011322636>
-                storage.item = self.session(region).resource(service_name, endpoint_url=endpoint_url)  # type: ignore
+                storage.item = self.session(region).resource(service_name, endpoint_url=endpoint_url, config=config)  # type: ignore
            else:
                # We might not be able to pass None to Boto3 and have it be the same as no argument.
-                storage.item = self.session(region).resource(service_name)  # type: ignore
+                storage.item = self.session(region).resource(service_name, config=config)  # type: ignore
 
        return cast(boto3.resources.base.ServiceResource, storage.item)
 
@@ -369,18 +376,21 @@ def resource(
     service_name: Literal["s3"],
     region_name: Optional[str] = None,
     endpoint_url: Optional[str] = None,
+    config: Optional[Config] = None,
 ) -> "S3ServiceResource": ...
 @overload
 def resource(
     service_name: Literal["iam"],
     region_name: Optional[str] = None,
     endpoint_url: Optional[str] = None,
+    config: Optional[Config] = None,
 ) -> "IAMServiceResource": ...
 @overload
 def resource(
     service_name: Literal["ec2"],
     region_name: Optional[str] = None,
     endpoint_url: Optional[str] = None,
+    config: Optional[Config] = None,
 ) -> "EC2ServiceResource": ...
 
 
@@ -388,6 +398,7 @@ def resource(
     service_name: Literal["s3", "iam", "ec2"],
     region_name: Optional[str] = None,
     endpoint_url: Optional[str] = None,
+    config: Optional[Config] = None,
 ) -> boto3.resources.base.ServiceResource:
     """
     Get a Boto 3 resource for a particular AWS service, usable by the current thread.
@@ -397,5 +408,5 @@
 
     # Just use a global version of the manager. Note that we change the argument order!
     return _global_manager.resource(
-        region_name, service_name, endpoint_url=endpoint_url
+        region_name, service_name, endpoint_url=endpoint_url, config=config
     )
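
ANONYMOUS_CONFIG and the new config= parameters amount to botocore's standard unsigned-request mechanism. A minimal sketch in plain boto3/botocore, independent of Toil (the bucket and key names are placeholders, not from the diff):

    import boto3
    import botocore
    from botocore.config import Config

    # Equivalent of session.ANONYMOUS_CONFIG: requests made with this config
    # are not signed, so no credentials are needed, but only publicly
    # readable objects will be accessible.
    unsigned = Config(signature_version=botocore.UNSIGNED)

    s3 = boto3.resource("s3", config=unsigned)
    obj = s3.Object("example-public-bucket", "example/key.txt")  # placeholder names
    print(obj.content_length)
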
toil/lib/aws/utils.py CHANGED
@@ -19,8 +19,10 @@ from collections.abc import Iterable, Iterator
 from typing import TYPE_CHECKING, Any, Callable, ContextManager, Optional, cast
 from urllib.parse import ParseResult
 
+# To import toil.lib.aws.session, the AWS libraries must be installed
 from toil.lib.aws import AWSRegionName, AWSServerErrors, session
 from toil.lib.conversions import strtobool
+from toil.lib.memoize import memoize
 from toil.lib.misc import printq
 from toil.lib.retry import (
     DEFAULT_DELAYS,
@@ -37,12 +39,7 @@ if TYPE_CHECKING:
     from mypy_boto3_s3.service_resource import Object as S3Object
     from mypy_boto3_sdb.type_defs import AttributeTypeDef
 
-try:
-    from botocore.exceptions import ClientError, EndpointConnectionError
-except ImportError:
-    ClientError = None  # type: ignore
-    EndpointConnectionError = None  # type: ignore
-    # AWS/boto extra is not installed
+from botocore.exceptions import ClientError, EndpointConnectionError
 
 logger = logging.getLogger(__name__)
 
@@ -232,6 +229,7 @@ def get_bucket_region(
     bucket_name: str,
     endpoint_url: Optional[str] = None,
     only_strategies: Optional[set[int]] = None,
+    anonymous: Optional[bool] = None
 ) -> str:
     """
     Get the AWS region name associated with the given S3 bucket, or raise NoBucketLocationError.
@@ -241,9 +239,13 @@ def get_bucket_region(
     Takes an optional S3 API URL override.
 
     :param only_strategies: For testing, use only strategies with 1-based numbers in this set.
+
+    :raises NoBucketLocationError: if the bucket's region cannot be determined
+        (possibly due to lack of permissions).
     """
 
-    s3_client = session.client("s3", endpoint_url=endpoint_url)
+    config = session.ANONYMOUS_CONFIG if anonymous else None
+    s3_client = session.client("s3", endpoint_url=endpoint_url, config=config)
 
     def attempt_get_bucket_location() -> Optional[str]:
         """
@@ -267,7 +269,7 @@ def get_bucket_region(
         # It could also be because AWS open data buckets (which we tend to
         # encounter this problem for) tend to actually themselves be in
         # us-east-1.
-        backup_s3_client = session.client("s3", region_name="us-east-1")
+        backup_s3_client = session.client("s3", region_name="us-east-1", config=config)
         return backup_s3_client.get_bucket_location(Bucket=bucket_name).get(
             "LocationConstraint", None
         )
@@ -337,6 +339,30 @@ def get_bucket_region(
        "Could not get bucket location: " + "\n".join(error_messages)
    ) from last_error
 
+@memoize
+def get_bucket_region_if_available(
+    bucket_name: str,
+    endpoint_url: Optional[str] = None,
+    only_strategies: Optional[set[int]] = None,
+    anonymous: Optional[bool] = None
+) -> Optional[str]:
+    """
+    Get the AWS region name associated with the given S3 bucket, or return None.
+
+    Caches results, so may not return the location for a bucket that has been
+    created but was previously observed to be nonexistent.
+
+    :param only_strategies: For testing, use only strategies with 1-based numbers in this set.
+    """
+
+    try:
+        return get_bucket_region(bucket_name, endpoint_url, only_strategies, anonymous)
+    except Exception as e:
+        if isinstance(e, NoBucketLocationError) or (isinstance(e, ClientError) and get_error_status(e) == 403):
+            # We can't know
+            return None
+        else:
+            raise
 
 def region_to_bucket_location(region: str) -> str:
     return "" if region == "us-east-1" else region
@@ -346,7 +372,7 @@ def bucket_location_to_region(location: Optional[str]) -> str:
     return "us-east-1" if location == "" or location is None else location
 
 
-def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "S3Object":
+def get_object_for_url(url: ParseResult, existing: Optional[bool] = None, anonymous: Optional[bool] = None) -> "S3Object":
     """
     Extracts a key (object) from a given parsed s3:// URL.
 
@@ -354,6 +380,10 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "S3
 
     :param bool existing: If True, key is expected to exist. If False, key is expected not to
         exists and it will be created. If None, the key will be created if it doesn't exist.
+
+    :raises FileNotFoundError: when existing is True and the object does not exist.
+    :raises RuntimeError: when existing is False but the object exists.
+    :raises PermissionError: when we are not authorized to look at the object.
     """
 
     key_name = url.path[1:]
@@ -372,17 +402,19 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "S3
     # TODO: OrdinaryCallingFormat equivalent in boto3?
     # if botoargs:
     #     botoargs['calling_format'] = boto.s3.connection.OrdinaryCallingFormat()
-
-    try:
-        # Get the bucket's region to avoid a redirect per request
-        region = get_bucket_region(bucket_name, endpoint_url=endpoint_url)
-        s3 = session.resource("s3", region_name=region, endpoint_url=endpoint_url)
-    except NoBucketLocationError as e:
-        # Probably don't have permission.
-        # TODO: check if it is that
-        logger.debug("Couldn't get bucket location: %s", e)
+
+    config = session.ANONYMOUS_CONFIG if anonymous else None
+    # Get the bucket's region to avoid a redirect per request.
+    # Cache the result
+    region = get_bucket_region_if_available(bucket_name, endpoint_url=endpoint_url, anonymous=anonymous)
+    if region is not None:
+        s3 = session.resource("s3", region_name=region, endpoint_url=endpoint_url, config=config)
+    else:
+        # We can't get the bucket location, perhaps because we don't have
+        # permission to do that.
+        logger.debug("Couldn't get bucket location")
         logger.debug("Fall back to not specifying location")
-        s3 = session.resource("s3", endpoint_url=endpoint_url)
+        s3 = session.resource("s3", endpoint_url=endpoint_url, config=config)
 
     obj = s3.Object(bucket_name, key_name)
     objExists = True
@@ -392,6 +424,10 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "S3
     except ClientError as e:
         if get_error_status(e) == 404:
             objExists = False
+        elif get_error_status(e) == 403:
+            raise PermissionError(
+                f"Key '{key_name}' is not accessible in bucket '{bucket_name}'."
+            ) from e
         else:
             raise
     if existing is True and not objExists:
@@ -402,16 +438,27 @@ def get_object_for_url(url: ParseResult, existing: Optional[bool] = None) -> "S3
         raise RuntimeError(f"Key '{key_name}' exists in bucket '{bucket_name}'.")
 
     if not objExists:
-        obj.put()  # write an empty file
+        try:
+            obj.put()  # write an empty file
+        except ClientError as e:
+            if get_error_status(e) == 403:
+                raise PermissionError(
+                    f"Key '{key_name}' is not writable in bucket '{bucket_name}'."
+                ) from e
+            else:
+                raise
     return obj
 
 
 @retry(errors=[AWSServerErrors])
-def list_objects_for_url(url: ParseResult) -> list[str]:
+def list_objects_for_url(url: ParseResult, anonymous: Optional[bool] = None) -> list[str]:
     """
     Extracts a key (object) from a given parsed s3:// URL. The URL will be
     supplemented with a trailing slash if it is missing.
+
+    :raises PermissionError: when we are not authorized to do the list operation.
     """
+
     key_name = url.path[1:]
     bucket_name = url.netloc
 
@@ -430,23 +477,33 @@ def list_objects_for_url(url: ParseResult) -> list[str]:
         protocol = "http"
     if host:
         endpoint_url = f"{protocol}://{host}" + f":{port}" if port else ""
-
-    client = session.client("s3", endpoint_url=endpoint_url)
+
+    config = session.ANONYMOUS_CONFIG if anonymous else None
+    client = session.client("s3", endpoint_url=endpoint_url, config=config)
 
     listing = []
+
+    try:
+        paginator = client.get_paginator("list_objects_v2")
+        result = paginator.paginate(Bucket=bucket_name, Prefix=key_name, Delimiter="/")
+        for page in result:
+            if "CommonPrefixes" in page:
+                for prefix_item in page["CommonPrefixes"]:
+                    listing.append(prefix_item["Prefix"][len(key_name) :])
+            if "Contents" in page:
+                for content_item in page["Contents"]:
+                    if content_item["Key"] == key_name:
+                        # Ignore folder name itself
+                        continue
+                    listing.append(content_item["Key"][len(key_name) :])
+    except ClientError as e:
+        if get_error_status(e) == 403:
+            raise PermissionError(
+                f"Prefix '{key_name}' is not authorized to be listed in bucket '{bucket_name}'."
+            ) from e
+        else:
+            raise
 
-    paginator = client.get_paginator("list_objects_v2")
-    result = paginator.paginate(Bucket=bucket_name, Prefix=key_name, Delimiter="/")
-    for page in result:
-        if "CommonPrefixes" in page:
-            for prefix_item in page["CommonPrefixes"]:
-                listing.append(prefix_item["Prefix"][len(key_name) :])
-        if "Contents" in page:
-            for content_item in page["Contents"]:
-                if content_item["Key"] == key_name:
-                    # Ignore folder name itself
-                    continue
-                listing.append(content_item["Key"][len(key_name) :])
 
     logger.debug("Found in %s items: %s", url, listing)
     return listing
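
Taken together, the new anonymous= flags let callers follow the same try-unsigned-then-signed pattern the job store's _list_url uses. A short usage sketch, assuming toil 8.1.0b1 (the URL is a placeholder):

    from urllib.parse import urlparse

    from toil.lib.aws.utils import list_objects_for_url

    url = urlparse("s3://example-public-bucket/some/prefix/")  # placeholder URL
    try:
        # Unsigned listing works for public buckets and needs no credentials.
        names = list_objects_for_url(url, anonymous=True)
    except PermissionError:
        # Bucket is not publicly listable; retry with credentials.
        names = list_objects_for_url(url)
    print(names)
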