toil 7.0.0__py3-none-any.whl → 8.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- toil/__init__.py +121 -83
- toil/batchSystems/__init__.py +1 -0
- toil/batchSystems/abstractBatchSystem.py +137 -77
- toil/batchSystems/abstractGridEngineBatchSystem.py +211 -101
- toil/batchSystems/awsBatch.py +237 -128
- toil/batchSystems/cleanup_support.py +22 -16
- toil/batchSystems/contained_executor.py +30 -26
- toil/batchSystems/gridengine.py +85 -49
- toil/batchSystems/htcondor.py +164 -87
- toil/batchSystems/kubernetes.py +622 -386
- toil/batchSystems/local_support.py +17 -12
- toil/batchSystems/lsf.py +132 -79
- toil/batchSystems/lsfHelper.py +13 -11
- toil/batchSystems/mesos/__init__.py +41 -29
- toil/batchSystems/mesos/batchSystem.py +288 -149
- toil/batchSystems/mesos/executor.py +77 -49
- toil/batchSystems/mesos/test/__init__.py +31 -23
- toil/batchSystems/options.py +38 -29
- toil/batchSystems/registry.py +53 -19
- toil/batchSystems/singleMachine.py +293 -123
- toil/batchSystems/slurm.py +489 -137
- toil/batchSystems/torque.py +46 -32
- toil/bus.py +141 -73
- toil/common.py +630 -359
- toil/cwl/__init__.py +1 -1
- toil/cwl/cwltoil.py +1114 -532
- toil/cwl/utils.py +17 -22
- toil/deferred.py +62 -41
- toil/exceptions.py +5 -3
- toil/fileStores/__init__.py +5 -5
- toil/fileStores/abstractFileStore.py +88 -57
- toil/fileStores/cachingFileStore.py +711 -247
- toil/fileStores/nonCachingFileStore.py +113 -75
- toil/job.py +988 -315
- toil/jobStores/abstractJobStore.py +387 -243
- toil/jobStores/aws/jobStore.py +727 -403
- toil/jobStores/aws/utils.py +161 -109
- toil/jobStores/conftest.py +1 -0
- toil/jobStores/fileJobStore.py +289 -151
- toil/jobStores/googleJobStore.py +137 -70
- toil/jobStores/utils.py +36 -15
- toil/leader.py +614 -269
- toil/lib/accelerators.py +115 -18
- toil/lib/aws/__init__.py +55 -28
- toil/lib/aws/ami.py +122 -87
- toil/lib/aws/iam.py +284 -108
- toil/lib/aws/s3.py +31 -0
- toil/lib/aws/session.py +193 -58
- toil/lib/aws/utils.py +238 -218
- toil/lib/bioio.py +13 -5
- toil/lib/compatibility.py +11 -6
- toil/lib/conversions.py +83 -49
- toil/lib/docker.py +131 -103
- toil/lib/ec2.py +322 -209
- toil/lib/ec2nodes.py +174 -106
- toil/lib/encryption/_dummy.py +5 -3
- toil/lib/encryption/_nacl.py +10 -6
- toil/lib/encryption/conftest.py +1 -0
- toil/lib/exceptions.py +26 -7
- toil/lib/expando.py +4 -2
- toil/lib/ftp_utils.py +217 -0
- toil/lib/generatedEC2Lists.py +127 -19
- toil/lib/humanize.py +6 -2
- toil/lib/integration.py +341 -0
- toil/lib/io.py +99 -11
- toil/lib/iterables.py +4 -2
- toil/lib/memoize.py +12 -8
- toil/lib/misc.py +65 -18
- toil/lib/objects.py +2 -2
- toil/lib/resources.py +19 -7
- toil/lib/retry.py +115 -77
- toil/lib/threading.py +282 -80
- toil/lib/throttle.py +15 -14
- toil/options/common.py +834 -401
- toil/options/cwl.py +175 -90
- toil/options/runner.py +50 -0
- toil/options/wdl.py +70 -19
- toil/provisioners/__init__.py +111 -46
- toil/provisioners/abstractProvisioner.py +322 -157
- toil/provisioners/aws/__init__.py +62 -30
- toil/provisioners/aws/awsProvisioner.py +980 -627
- toil/provisioners/clusterScaler.py +541 -279
- toil/provisioners/gceProvisioner.py +282 -179
- toil/provisioners/node.py +147 -79
- toil/realtimeLogger.py +34 -22
- toil/resource.py +137 -75
- toil/server/app.py +127 -61
- toil/server/celery_app.py +3 -1
- toil/server/cli/wes_cwl_runner.py +82 -53
- toil/server/utils.py +54 -28
- toil/server/wes/abstract_backend.py +64 -26
- toil/server/wes/amazon_wes_utils.py +21 -15
- toil/server/wes/tasks.py +121 -63
- toil/server/wes/toil_backend.py +142 -107
- toil/server/wsgi_app.py +4 -3
- toil/serviceManager.py +58 -22
- toil/statsAndLogging.py +148 -64
- toil/test/__init__.py +263 -179
- toil/test/batchSystems/batchSystemTest.py +438 -195
- toil/test/batchSystems/batch_system_plugin_test.py +18 -7
- toil/test/batchSystems/test_gridengine.py +173 -0
- toil/test/batchSystems/test_lsf_helper.py +67 -58
- toil/test/batchSystems/test_slurm.py +93 -47
- toil/test/cactus/test_cactus_integration.py +20 -22
- toil/test/cwl/cwlTest.py +271 -71
- toil/test/cwl/measure_default_memory.cwl +12 -0
- toil/test/cwl/not_run_required_input.cwl +29 -0
- toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
- toil/test/docs/scriptsTest.py +60 -34
- toil/test/jobStores/jobStoreTest.py +412 -235
- toil/test/lib/aws/test_iam.py +116 -48
- toil/test/lib/aws/test_s3.py +16 -9
- toil/test/lib/aws/test_utils.py +5 -6
- toil/test/lib/dockerTest.py +118 -141
- toil/test/lib/test_conversions.py +113 -115
- toil/test/lib/test_ec2.py +57 -49
- toil/test/lib/test_integration.py +104 -0
- toil/test/lib/test_misc.py +12 -5
- toil/test/mesos/MesosDataStructuresTest.py +23 -10
- toil/test/mesos/helloWorld.py +7 -6
- toil/test/mesos/stress.py +25 -20
- toil/test/options/options.py +7 -2
- toil/test/provisioners/aws/awsProvisionerTest.py +293 -140
- toil/test/provisioners/clusterScalerTest.py +440 -250
- toil/test/provisioners/clusterTest.py +81 -42
- toil/test/provisioners/gceProvisionerTest.py +174 -100
- toil/test/provisioners/provisionerTest.py +25 -13
- toil/test/provisioners/restartScript.py +5 -4
- toil/test/server/serverTest.py +188 -141
- toil/test/sort/restart_sort.py +137 -68
- toil/test/sort/sort.py +134 -66
- toil/test/sort/sortTest.py +91 -49
- toil/test/src/autoDeploymentTest.py +140 -100
- toil/test/src/busTest.py +20 -18
- toil/test/src/checkpointTest.py +8 -2
- toil/test/src/deferredFunctionTest.py +49 -35
- toil/test/src/dockerCheckTest.py +33 -26
- toil/test/src/environmentTest.py +20 -10
- toil/test/src/fileStoreTest.py +538 -271
- toil/test/src/helloWorldTest.py +7 -4
- toil/test/src/importExportFileTest.py +61 -31
- toil/test/src/jobDescriptionTest.py +32 -17
- toil/test/src/jobEncapsulationTest.py +2 -0
- toil/test/src/jobFileStoreTest.py +74 -50
- toil/test/src/jobServiceTest.py +187 -73
- toil/test/src/jobTest.py +120 -70
- toil/test/src/miscTests.py +19 -18
- toil/test/src/promisedRequirementTest.py +82 -36
- toil/test/src/promisesTest.py +7 -6
- toil/test/src/realtimeLoggerTest.py +6 -6
- toil/test/src/regularLogTest.py +71 -37
- toil/test/src/resourceTest.py +80 -49
- toil/test/src/restartDAGTest.py +36 -22
- toil/test/src/resumabilityTest.py +9 -2
- toil/test/src/retainTempDirTest.py +45 -14
- toil/test/src/systemTest.py +12 -8
- toil/test/src/threadingTest.py +44 -25
- toil/test/src/toilContextManagerTest.py +10 -7
- toil/test/src/userDefinedJobArgTypeTest.py +8 -5
- toil/test/src/workerTest.py +33 -16
- toil/test/utils/toilDebugTest.py +70 -58
- toil/test/utils/toilKillTest.py +4 -5
- toil/test/utils/utilsTest.py +239 -102
- toil/test/wdl/wdltoil_test.py +789 -148
- toil/test/wdl/wdltoil_test_kubernetes.py +37 -23
- toil/toilState.py +52 -26
- toil/utils/toilConfig.py +13 -4
- toil/utils/toilDebugFile.py +44 -27
- toil/utils/toilDebugJob.py +85 -25
- toil/utils/toilDestroyCluster.py +11 -6
- toil/utils/toilKill.py +8 -3
- toil/utils/toilLaunchCluster.py +251 -145
- toil/utils/toilMain.py +37 -16
- toil/utils/toilRsyncCluster.py +27 -14
- toil/utils/toilSshCluster.py +45 -22
- toil/utils/toilStats.py +75 -36
- toil/utils/toilStatus.py +226 -119
- toil/utils/toilUpdateEC2Instances.py +3 -1
- toil/version.py +11 -11
- toil/wdl/utils.py +5 -5
- toil/wdl/wdltoil.py +3513 -1052
- toil/worker.py +269 -128
- toil-8.0.0.dist-info/METADATA +173 -0
- toil-8.0.0.dist-info/RECORD +253 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
- toil-7.0.0.dist-info/METADATA +0 -158
- toil-7.0.0.dist-info/RECORD +0 -244
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/LICENSE +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
- {toil-7.0.0.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/lib/misc.py
CHANGED
|
@@ -7,9 +7,9 @@ import socket
|
|
|
7
7
|
import subprocess
|
|
8
8
|
import sys
|
|
9
9
|
import time
|
|
10
|
-
import
|
|
10
|
+
from collections.abc import Iterator
|
|
11
11
|
from contextlib import closing
|
|
12
|
-
from typing import
|
|
12
|
+
from typing import Optional
|
|
13
13
|
|
|
14
14
|
logger = logging.getLogger(__name__)
|
|
15
15
|
|
|
@@ -21,19 +21,20 @@ def get_public_ip() -> str:
|
|
|
21
21
|
try:
|
|
22
22
|
# Try to get the internet-facing IP by attempting a connection
|
|
23
23
|
# to a non-existent server and reading what IP was used.
|
|
24
|
-
ip =
|
|
24
|
+
ip = "127.0.0.1"
|
|
25
25
|
with closing(socket.socket(socket.AF_INET, socket.SOCK_DGRAM)) as sock:
|
|
26
26
|
# 203.0.113.0/24 is reserved as TEST-NET-3 by RFC 5737, so
|
|
27
27
|
# there is guaranteed to be no one listening on the other
|
|
28
28
|
# end (and we won't accidentally DOS anyone).
|
|
29
|
-
sock.connect((
|
|
29
|
+
sock.connect(("203.0.113.1", 1))
|
|
30
30
|
ip = sock.getsockname()[0]
|
|
31
31
|
return ip
|
|
32
32
|
except:
|
|
33
33
|
# Something went terribly wrong. Just give loopback rather
|
|
34
34
|
# than killing everything, because this is often called just
|
|
35
35
|
# to provide a default argument
|
|
36
|
-
return
|
|
36
|
+
return "127.0.0.1"
|
|
37
|
+
|
|
37
38
|
|
|
38
39
|
def get_user_name() -> str:
|
|
39
40
|
"""
|
|
@@ -46,20 +47,23 @@ def get_user_name() -> str:
|
|
|
46
47
|
except KeyError:
|
|
47
48
|
# This is expected if the user isn't in /etc/passwd, such as in a
|
|
48
49
|
# Docker container when running as a weird UID. Make something up.
|
|
49
|
-
return
|
|
50
|
+
return "UnknownUser" + str(os.getuid())
|
|
50
51
|
except Exception as e:
|
|
51
52
|
# We can't get the UID, or something weird has gone wrong.
|
|
52
|
-
logger.error(
|
|
53
|
-
return
|
|
53
|
+
logger.error("Unexpected error getting user name: %s", e)
|
|
54
|
+
return "UnknownUser"
|
|
55
|
+
|
|
54
56
|
|
|
55
57
|
def utc_now() -> datetime.datetime:
|
|
56
58
|
"""Return a datetime in the UTC timezone corresponding to right now."""
|
|
57
59
|
return datetime.datetime.utcnow().replace(tzinfo=datetime.timezone.utc)
|
|
58
60
|
|
|
61
|
+
|
|
59
62
|
def unix_now_ms() -> float:
|
|
60
63
|
"""Return the current time in milliseconds since the Unix epoch."""
|
|
61
64
|
return time.time() * 1000
|
|
62
65
|
|
|
66
|
+
|
|
63
67
|
def slow_down(seconds: float) -> float:
|
|
64
68
|
"""
|
|
65
69
|
Toil jobs that have completed are not allowed to have taken 0 seconds, but
|
|
@@ -77,9 +81,25 @@ def slow_down(seconds: float) -> float:
|
|
|
77
81
|
|
|
78
82
|
return max(seconds, sys.float_info.epsilon)
|
|
79
83
|
|
|
80
|
-
|
|
84
|
+
|
|
85
|
+
def printq(msg: str, quiet: bool, log: bool = False) -> None:
|
|
86
|
+
"""
|
|
87
|
+
This is for functions used simultaneously in Toil proper and in the admin scripts.
|
|
88
|
+
|
|
89
|
+
Our admin scripts "print" to stdout, while Toil proper uses logging. For a script that,
|
|
90
|
+
for example, cleans up IAM, EC2, etc. cruft leftover after failed CI runs, we can call
|
|
91
|
+
an AWS delete IAM role function, and this prints or logs progress (unless quiet is True),
|
|
92
|
+
depending on whether the function is called in, say, the jobstore or a script.
|
|
93
|
+
|
|
94
|
+
:param msg: The string to print or log to stdout.
|
|
95
|
+
:param quiet: Silent output to stdout.
|
|
96
|
+
:param log: Use logging (else "print" to the screen).
|
|
97
|
+
"""
|
|
81
98
|
if not quiet:
|
|
82
|
-
|
|
99
|
+
if not log:
|
|
100
|
+
print(msg)
|
|
101
|
+
else:
|
|
102
|
+
logger.debug(msg)
|
|
83
103
|
|
|
84
104
|
|
|
85
105
|
def truncExpBackoff() -> Iterator[float]:
|
|
@@ -102,12 +122,23 @@ class CalledProcessErrorStderr(subprocess.CalledProcessError):
|
|
|
102
122
|
if (self.returncode < 0) or (self.stderr is None):
|
|
103
123
|
return str(super())
|
|
104
124
|
else:
|
|
105
|
-
err =
|
|
125
|
+
err = (
|
|
126
|
+
self.stderr
|
|
127
|
+
if isinstance(self.stderr, str)
|
|
128
|
+
else self.stderr.decode("ascii", errors="replace")
|
|
129
|
+
)
|
|
106
130
|
return "Command '%s' exit status %d: %s" % (self.cmd, self.returncode, err)
|
|
107
131
|
|
|
108
132
|
|
|
109
|
-
def call_command(
|
|
110
|
-
|
|
133
|
+
def call_command(
|
|
134
|
+
cmd: list[str],
|
|
135
|
+
*args: str,
|
|
136
|
+
input: Optional[str] = None,
|
|
137
|
+
timeout: Optional[float] = None,
|
|
138
|
+
useCLocale: bool = True,
|
|
139
|
+
env: Optional[dict[str, str]] = None,
|
|
140
|
+
quiet: Optional[bool] = False
|
|
141
|
+
) -> str:
|
|
111
142
|
"""
|
|
112
143
|
Simplified calling of external commands.
|
|
113
144
|
|
|
@@ -138,14 +169,30 @@ def call_command(cmd: List[str], *args: str, input: Optional[str] = None, timeou
|
|
|
138
169
|
|
|
139
170
|
logger.debug("run command: {}".format(" ".join(cmd)))
|
|
140
171
|
start_time = datetime.datetime.now()
|
|
141
|
-
proc = subprocess.Popen(
|
|
142
|
-
|
|
172
|
+
proc = subprocess.Popen(
|
|
173
|
+
cmd,
|
|
174
|
+
stdout=subprocess.PIPE,
|
|
175
|
+
stderr=subprocess.PIPE,
|
|
176
|
+
encoding="utf-8",
|
|
177
|
+
errors="replace",
|
|
178
|
+
env=env,
|
|
179
|
+
)
|
|
143
180
|
stdout, stderr = proc.communicate(input=input, timeout=timeout)
|
|
144
181
|
end_time = datetime.datetime.now()
|
|
145
182
|
runtime = (end_time - start_time).total_seconds()
|
|
146
183
|
sys.stderr.write(stderr)
|
|
147
184
|
if proc.returncode != 0:
|
|
148
|
-
logger.debug(
|
|
149
|
-
|
|
150
|
-
|
|
185
|
+
logger.debug(
|
|
186
|
+
"command failed in {}s: {}: {}".format(
|
|
187
|
+
runtime, " ".join(cmd), stderr.rstrip()
|
|
188
|
+
)
|
|
189
|
+
)
|
|
190
|
+
raise CalledProcessErrorStderr(
|
|
191
|
+
proc.returncode, cmd, output=stdout, stderr=stderr
|
|
192
|
+
)
|
|
193
|
+
logger.debug(
|
|
194
|
+
"command succeeded in {}s: {}{}".format(
|
|
195
|
+
runtime, " ".join(cmd), (": " + stdout.rstrip()) if not quiet else ""
|
|
196
|
+
)
|
|
197
|
+
)
|
|
151
198
|
return stdout
|
toil/lib/objects.py
CHANGED
|
@@ -126,10 +126,10 @@ class InnerClass:
|
|
|
126
126
|
if instance is None:
|
|
127
127
|
return self.inner_class
|
|
128
128
|
else:
|
|
129
|
-
return self._bind(
|
|
129
|
+
return self._bind(instance)
|
|
130
130
|
|
|
131
131
|
@sync_memoize
|
|
132
|
-
def _bind(
|
|
132
|
+
def _bind(self, _outer):
|
|
133
133
|
class BoundInner(self.inner_class):
|
|
134
134
|
outer = _outer
|
|
135
135
|
|
toil/lib/resources.py
CHANGED
|
@@ -12,11 +12,11 @@
|
|
|
12
12
|
# See the License for the specific language governing permissions and
|
|
13
13
|
# limitations under the License.
|
|
14
14
|
import fnmatch
|
|
15
|
-
import os
|
|
16
15
|
import math
|
|
17
|
-
import
|
|
16
|
+
import os
|
|
18
17
|
import resource
|
|
19
|
-
|
|
18
|
+
import sys
|
|
19
|
+
|
|
20
20
|
|
|
21
21
|
class ResourceMonitor:
|
|
22
22
|
"""
|
|
@@ -52,14 +52,20 @@ class ResourceMonitor:
|
|
|
52
52
|
cls._extra_cpu_seconds += seconds
|
|
53
53
|
|
|
54
54
|
@classmethod
|
|
55
|
-
def get_total_cpu_time_and_memory_usage(cls) ->
|
|
55
|
+
def get_total_cpu_time_and_memory_usage(cls) -> tuple[float, int]:
|
|
56
56
|
"""
|
|
57
57
|
Gives the total cpu time of itself and all its children, and the maximum RSS memory usage of
|
|
58
58
|
itself and its single largest child (in kibibytes).
|
|
59
59
|
"""
|
|
60
60
|
me = resource.getrusage(resource.RUSAGE_SELF)
|
|
61
61
|
children = resource.getrusage(resource.RUSAGE_CHILDREN)
|
|
62
|
-
total_cpu_time =
|
|
62
|
+
total_cpu_time = (
|
|
63
|
+
me.ru_utime
|
|
64
|
+
+ me.ru_stime
|
|
65
|
+
+ children.ru_utime
|
|
66
|
+
+ children.ru_stime
|
|
67
|
+
+ cls._extra_cpu_seconds
|
|
68
|
+
)
|
|
63
69
|
total_memory_usage = me.ru_maxrss + children.ru_maxrss
|
|
64
70
|
if sys.platform == "darwin":
|
|
65
71
|
# On Linux, getrusage works in "kilobytes" (really kibibytes), but on
|
|
@@ -74,10 +80,16 @@ class ResourceMonitor:
|
|
|
74
80
|
"""Gives the total cpu time, including the children."""
|
|
75
81
|
me = resource.getrusage(resource.RUSAGE_SELF)
|
|
76
82
|
childs = resource.getrusage(resource.RUSAGE_CHILDREN)
|
|
77
|
-
return
|
|
83
|
+
return (
|
|
84
|
+
me.ru_utime
|
|
85
|
+
+ me.ru_stime
|
|
86
|
+
+ childs.ru_utime
|
|
87
|
+
+ childs.ru_stime
|
|
88
|
+
+ cls._extra_cpu_seconds
|
|
89
|
+
)
|
|
78
90
|
|
|
79
91
|
|
|
80
|
-
def glob(glob_pattern: str, directoryname: str) ->
|
|
92
|
+
def glob(glob_pattern: str, directoryname: str) -> list[str]:
|
|
81
93
|
"""
|
|
82
94
|
Walks through a directory and its subdirectories looking for files matching
|
|
83
95
|
the glob_pattern and returns a list=[].
|
toil/lib/retry.py
CHANGED
|
@@ -131,35 +131,30 @@ import sqlite3
|
|
|
131
131
|
import time
|
|
132
132
|
import traceback
|
|
133
133
|
import urllib.error
|
|
134
|
+
from collections.abc import Generator, Iterable, Sequence
|
|
134
135
|
from contextlib import contextmanager
|
|
135
|
-
from typing import
|
|
136
|
-
Callable,
|
|
137
|
-
ContextManager,
|
|
138
|
-
Generator,
|
|
139
|
-
Iterable,
|
|
140
|
-
List,
|
|
141
|
-
Optional,
|
|
142
|
-
Sequence,
|
|
143
|
-
Tuple,
|
|
144
|
-
Type,
|
|
145
|
-
Union, TypeVar)
|
|
136
|
+
from typing import Any, Callable, ContextManager, Optional, TypeVar, Union
|
|
146
137
|
|
|
147
138
|
import requests.exceptions
|
|
148
139
|
import urllib3.exceptions
|
|
149
140
|
|
|
150
|
-
SUPPORTED_HTTP_ERRORS = [
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
141
|
+
SUPPORTED_HTTP_ERRORS = [
|
|
142
|
+
http.client.HTTPException,
|
|
143
|
+
urllib.error.HTTPError,
|
|
144
|
+
urllib3.exceptions.HTTPError,
|
|
145
|
+
requests.exceptions.HTTPError,
|
|
146
|
+
]
|
|
154
147
|
|
|
155
148
|
try:
|
|
156
149
|
import kubernetes.client.rest
|
|
150
|
+
|
|
157
151
|
SUPPORTED_HTTP_ERRORS.append(kubernetes.client.rest.ApiException)
|
|
158
152
|
except ModuleNotFoundError:
|
|
159
153
|
kubernetes = None
|
|
160
154
|
|
|
161
155
|
try:
|
|
162
156
|
import botocore.exceptions
|
|
157
|
+
|
|
163
158
|
SUPPORTED_HTTP_ERRORS.append(botocore.exceptions.ClientError)
|
|
164
159
|
except ModuleNotFoundError:
|
|
165
160
|
botocore = None
|
|
@@ -175,12 +170,14 @@ class ErrorCondition:
|
|
|
175
170
|
whether to retry.
|
|
176
171
|
"""
|
|
177
172
|
|
|
178
|
-
def __init__(
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
173
|
+
def __init__(
|
|
174
|
+
self,
|
|
175
|
+
error: Optional[Any] = None,
|
|
176
|
+
error_codes: list[int] = None,
|
|
177
|
+
boto_error_codes: list[str] = None,
|
|
178
|
+
error_message_must_include: str = None,
|
|
179
|
+
retry_on_this_condition: bool = True,
|
|
180
|
+
):
|
|
184
181
|
"""
|
|
185
182
|
Initialize this ErrorCondition.
|
|
186
183
|
|
|
@@ -227,12 +224,14 @@ class ErrorCondition:
|
|
|
227
224
|
# There is a better way to type hint this with python 3.10
|
|
228
225
|
# https://stackoverflow.com/a/68290080
|
|
229
226
|
RT = TypeVar("RT")
|
|
227
|
+
|
|
228
|
+
|
|
230
229
|
def retry(
|
|
231
|
-
intervals: Optional[
|
|
230
|
+
intervals: Optional[list] = None,
|
|
232
231
|
infinite_retries: bool = False,
|
|
233
|
-
errors: Optional[Sequence[Union[ErrorCondition,
|
|
234
|
-
log_message: Optional[
|
|
235
|
-
prepare: Optional[
|
|
232
|
+
errors: Optional[Sequence[Union[ErrorCondition, type[Exception]]]] = None,
|
|
233
|
+
log_message: Optional[tuple[Callable, str]] = None,
|
|
234
|
+
prepare: Optional[list[Callable]] = None,
|
|
236
235
|
) -> Callable[[Callable[..., RT]], Callable[..., RT]]:
|
|
237
236
|
"""
|
|
238
237
|
Retry a function if it fails with any Exception defined in "errors".
|
|
@@ -266,7 +265,9 @@ def retry(
|
|
|
266
265
|
errors = errors if errors else [Exception]
|
|
267
266
|
|
|
268
267
|
error_conditions = {error for error in errors if isinstance(error, ErrorCondition)}
|
|
269
|
-
retriable_errors = {
|
|
268
|
+
retriable_errors = {
|
|
269
|
+
error for error in errors if not isinstance(error, ErrorCondition)
|
|
270
|
+
}
|
|
270
271
|
|
|
271
272
|
if log_message:
|
|
272
273
|
post_message_function = log_message[0]
|
|
@@ -275,7 +276,10 @@ def retry(
|
|
|
275
276
|
# if a generic error exists (with no restrictions),
|
|
276
277
|
# delete more specific error_condition instances of it
|
|
277
278
|
for error_condition in error_conditions:
|
|
278
|
-
if
|
|
279
|
+
if (
|
|
280
|
+
error_condition.retry_on_this_condition
|
|
281
|
+
and error_condition.error in retriable_errors
|
|
282
|
+
):
|
|
279
283
|
error_conditions.remove(error_condition)
|
|
280
284
|
|
|
281
285
|
# if a more specific error exists that isn't in the general set,
|
|
@@ -306,13 +310,17 @@ def retry(
|
|
|
306
310
|
raise
|
|
307
311
|
|
|
308
312
|
interval = intervals_remaining.pop(0)
|
|
309
|
-
logger.warning(
|
|
313
|
+
logger.warning(
|
|
314
|
+
f"Error in {func}: {e}. Retrying after {interval} s..."
|
|
315
|
+
)
|
|
310
316
|
time.sleep(interval)
|
|
311
317
|
if prepare is not None:
|
|
312
318
|
for prep_function in prepare:
|
|
313
319
|
# Reset state for next attempt
|
|
314
320
|
prep_function(*args, **kwargs)
|
|
321
|
+
|
|
315
322
|
return call
|
|
323
|
+
|
|
316
324
|
return decorate
|
|
317
325
|
|
|
318
326
|
|
|
@@ -323,17 +331,18 @@ def return_status_code(e):
|
|
|
323
331
|
|
|
324
332
|
if botocore:
|
|
325
333
|
if isinstance(e, botocore.exceptions.ClientError):
|
|
326
|
-
return e.response.get(
|
|
334
|
+
return e.response.get("ResponseMetadata", {}).get("HTTPStatusCode")
|
|
327
335
|
|
|
328
336
|
if isinstance(e, requests.exceptions.HTTPError):
|
|
329
337
|
return e.response.status_code
|
|
330
|
-
elif isinstance(e, http.client.HTTPException) or
|
|
331
|
-
|
|
338
|
+
elif isinstance(e, http.client.HTTPException) or isinstance(
|
|
339
|
+
e, urllib3.exceptions.HTTPError
|
|
340
|
+
):
|
|
332
341
|
return e.status
|
|
333
342
|
elif isinstance(e, urllib.error.HTTPError):
|
|
334
343
|
return e.code
|
|
335
344
|
else:
|
|
336
|
-
raise ValueError(f
|
|
345
|
+
raise ValueError(f"Unsupported error type; cannot grok status code: {e}.")
|
|
337
346
|
|
|
338
347
|
|
|
339
348
|
def get_error_code(e: Exception) -> str:
|
|
@@ -342,21 +351,21 @@ def get_error_code(e: Exception) -> str:
|
|
|
342
351
|
|
|
343
352
|
Returns empty string for other errors.
|
|
344
353
|
"""
|
|
345
|
-
if hasattr(e,
|
|
354
|
+
if hasattr(e, "error_code") and isinstance(e.error_code, str):
|
|
346
355
|
# A Boto 2 error
|
|
347
356
|
return e.error_code
|
|
348
|
-
if hasattr(e,
|
|
357
|
+
if hasattr(e, "code") and isinstance(e.code, str):
|
|
349
358
|
# A (different?) Boto 2 error
|
|
350
359
|
return e.code
|
|
351
|
-
elif hasattr(e,
|
|
360
|
+
elif hasattr(e, "response") and hasattr(e.response, "get"):
|
|
352
361
|
# A Boto 3 error
|
|
353
|
-
code = e.response.get(
|
|
362
|
+
code = e.response.get("Error", {}).get("Code")
|
|
354
363
|
if isinstance(code, str):
|
|
355
364
|
return code
|
|
356
365
|
else:
|
|
357
|
-
return
|
|
366
|
+
return ""
|
|
358
367
|
else:
|
|
359
|
-
return
|
|
368
|
+
return ""
|
|
360
369
|
|
|
361
370
|
|
|
362
371
|
def get_error_message(e: Exception) -> str:
|
|
@@ -366,18 +375,18 @@ def get_error_message(e: Exception) -> str:
|
|
|
366
375
|
Note that error message conditions also check more than this; this function
|
|
367
376
|
does not fall back to the traceback for incompatible types.
|
|
368
377
|
"""
|
|
369
|
-
if hasattr(e,
|
|
378
|
+
if hasattr(e, "error_message") and isinstance(e.error_message, str):
|
|
370
379
|
# A Boto 2 error
|
|
371
380
|
return e.error_message
|
|
372
|
-
elif hasattr(e,
|
|
381
|
+
elif hasattr(e, "response") and hasattr(e.response, "get"):
|
|
373
382
|
# A Boto 3 error
|
|
374
|
-
message = e.response.get(
|
|
383
|
+
message = e.response.get("Error", {}).get("Message")
|
|
375
384
|
if isinstance(message, str):
|
|
376
385
|
return message
|
|
377
386
|
else:
|
|
378
|
-
return
|
|
387
|
+
return ""
|
|
379
388
|
else:
|
|
380
|
-
return
|
|
389
|
+
return ""
|
|
381
390
|
|
|
382
391
|
|
|
383
392
|
def get_error_status(e: Exception) -> int:
|
|
@@ -391,22 +400,23 @@ def get_error_status(e: Exception) -> int:
|
|
|
391
400
|
|
|
392
401
|
Returns 0 from other errors.
|
|
393
402
|
"""
|
|
403
|
+
|
|
394
404
|
def numify(x):
|
|
395
405
|
"""Make sure a value is an integer."""
|
|
396
406
|
return int(str(x).strip())
|
|
397
407
|
|
|
398
|
-
if hasattr(e,
|
|
408
|
+
if hasattr(e, "status"):
|
|
399
409
|
# A Boto 2 error, kubernetes.client.rest.ApiException,
|
|
400
410
|
# http.client.HTTPException, or urllib3.exceptions.HTTPError
|
|
401
411
|
return numify(e.status)
|
|
402
|
-
elif hasattr(e,
|
|
403
|
-
if hasattr(e.response,
|
|
412
|
+
elif hasattr(e, "response"):
|
|
413
|
+
if hasattr(e.response, "status_code"):
|
|
404
414
|
# A requests.exceptions.HTTPError
|
|
405
415
|
return numify(e.response.status_code)
|
|
406
|
-
elif hasattr(e.response,
|
|
416
|
+
elif hasattr(e.response, "get"):
|
|
407
417
|
# A Boto 3 error
|
|
408
|
-
return numify(e.response.get(
|
|
409
|
-
elif hasattr(e,
|
|
418
|
+
return numify(e.response.get("ResponseMetadata", {}).get("HTTPStatusCode"))
|
|
419
|
+
elif hasattr(e, "code"):
|
|
410
420
|
# A urllib.error.HTTPError
|
|
411
421
|
return numify(e.code)
|
|
412
422
|
else:
|
|
@@ -419,16 +429,17 @@ def get_error_body(e: Exception) -> str:
|
|
|
419
429
|
|
|
420
430
|
Returns the code and message if the error does not have a body.
|
|
421
431
|
"""
|
|
422
|
-
if hasattr(e,
|
|
432
|
+
if hasattr(e, "body"):
|
|
423
433
|
# A Boto 2 error
|
|
424
434
|
if isinstance(e.body, bytes):
|
|
425
435
|
# Decode the body first
|
|
426
|
-
return e.body.decode(
|
|
436
|
+
return e.body.decode("utf-8")
|
|
427
437
|
elif isinstance(e.body, str):
|
|
428
438
|
return e.body
|
|
429
439
|
|
|
430
440
|
# Anything else
|
|
431
|
-
return f
|
|
441
|
+
return f"{get_error_code(e)}: {get_error_message(e)}"
|
|
442
|
+
|
|
432
443
|
|
|
433
444
|
def meets_error_message_condition(e: Exception, error_message: Optional[str]):
|
|
434
445
|
if error_message:
|
|
@@ -440,7 +451,9 @@ def meets_error_message_condition(e: Exception, error_message: Optional[str]):
|
|
|
440
451
|
if isinstance(e, botocore.exceptions.ClientError):
|
|
441
452
|
return error_message in str(e)
|
|
442
453
|
|
|
443
|
-
if isinstance(e, http.client.HTTPException) or isinstance(
|
|
454
|
+
if isinstance(e, http.client.HTTPException) or isinstance(
|
|
455
|
+
e, urllib3.exceptions.HTTPError
|
|
456
|
+
):
|
|
444
457
|
return error_message in e.reason
|
|
445
458
|
elif isinstance(e, sqlite3.OperationalError):
|
|
446
459
|
return error_message in str(e)
|
|
@@ -448,7 +461,7 @@ def meets_error_message_condition(e: Exception, error_message: Optional[str]):
|
|
|
448
461
|
return error_message in e.msg
|
|
449
462
|
elif isinstance(e, requests.exceptions.HTTPError):
|
|
450
463
|
return error_message in e.raw
|
|
451
|
-
elif hasattr(e,
|
|
464
|
+
elif hasattr(e, "msg"):
|
|
452
465
|
return error_message in e.msg
|
|
453
466
|
else:
|
|
454
467
|
return error_message in traceback.format_exc()
|
|
@@ -456,7 +469,7 @@ def meets_error_message_condition(e: Exception, error_message: Optional[str]):
|
|
|
456
469
|
return True
|
|
457
470
|
|
|
458
471
|
|
|
459
|
-
def meets_error_code_condition(e: Exception, error_codes: Optional[
|
|
472
|
+
def meets_error_code_condition(e: Exception, error_codes: Optional[list[int]]):
|
|
460
473
|
"""These are expected to be normal HTTP error codes, like 404 or 500."""
|
|
461
474
|
if error_codes:
|
|
462
475
|
status_code = get_error_status(e)
|
|
@@ -465,7 +478,9 @@ def meets_error_code_condition(e: Exception, error_codes: Optional[List[int]]):
|
|
|
465
478
|
return True
|
|
466
479
|
|
|
467
480
|
|
|
468
|
-
def meets_boto_error_code_condition(
|
|
481
|
+
def meets_boto_error_code_condition(
|
|
482
|
+
e: Exception, boto_error_codes: Optional[list[str]]
|
|
483
|
+
):
|
|
469
484
|
"""These are expected to be AWS's custom error aliases, like 'BucketNotFound' or 'AccessDenied'."""
|
|
470
485
|
if boto_error_codes:
|
|
471
486
|
status_code = get_error_code(e)
|
|
@@ -478,21 +493,37 @@ def error_meets_conditions(e, error_conditions):
|
|
|
478
493
|
condition_met = False
|
|
479
494
|
for error in error_conditions:
|
|
480
495
|
if isinstance(e, error.error):
|
|
481
|
-
if
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
496
|
+
if (
|
|
497
|
+
error.error_codes
|
|
498
|
+
or error.boto_error_codes
|
|
499
|
+
or error.error_message_must_include
|
|
500
|
+
):
|
|
501
|
+
error_message_condition_met = meets_error_message_condition(
|
|
502
|
+
e, error.error_message_must_include
|
|
503
|
+
)
|
|
504
|
+
error_code_condition_met = meets_error_code_condition(
|
|
505
|
+
e, error.error_codes
|
|
506
|
+
)
|
|
507
|
+
boto_error_code_condition_met = meets_boto_error_code_condition(
|
|
508
|
+
e, error.boto_error_codes
|
|
509
|
+
)
|
|
510
|
+
if (
|
|
511
|
+
error_message_condition_met
|
|
512
|
+
and error_code_condition_met
|
|
513
|
+
and boto_error_code_condition_met
|
|
514
|
+
):
|
|
486
515
|
if not error.retry_on_this_condition:
|
|
487
516
|
return False
|
|
488
517
|
condition_met = True
|
|
489
518
|
return condition_met
|
|
490
519
|
|
|
520
|
+
|
|
491
521
|
DEFAULT_DELAYS = (0, 1, 1, 4, 16, 64)
|
|
492
522
|
DEFAULT_TIMEOUT = 300
|
|
493
523
|
|
|
494
524
|
E = TypeVar("E", bound=Exception) # so mypy understands passed through types
|
|
495
525
|
|
|
526
|
+
|
|
496
527
|
# TODO: Replace the use of this with retry()
|
|
497
528
|
# The aws provisioner and jobstore need a large refactoring to be boto3 compliant, so this is
|
|
498
529
|
# still used there to avoid the duplication of future work
|
|
@@ -575,38 +606,45 @@ def old_retry(
|
|
|
575
606
|
if timeout is None:
|
|
576
607
|
timeout = DEFAULT_TIMEOUT
|
|
577
608
|
if timeout > 0:
|
|
578
|
-
go = [
|
|
609
|
+
go = [None]
|
|
579
610
|
|
|
580
611
|
@contextmanager
|
|
581
|
-
def repeated_attempt(
|
|
612
|
+
def repeated_attempt(delay):
|
|
582
613
|
try:
|
|
583
614
|
yield
|
|
584
615
|
except Exception as e:
|
|
585
|
-
if time.time(
|
|
586
|
-
if predicate(
|
|
587
|
-
logger.info(
|
|
588
|
-
time.sleep(
|
|
616
|
+
if time.time() + delay < expiration:
|
|
617
|
+
if predicate(e):
|
|
618
|
+
logger.info("Got %s, trying again in %is.", e, delay)
|
|
619
|
+
time.sleep(delay)
|
|
589
620
|
else:
|
|
590
|
-
logger.error(
|
|
621
|
+
logger.error(
|
|
622
|
+
"Got a %s: %s which is not retriable according to %s",
|
|
623
|
+
type(e),
|
|
624
|
+
e,
|
|
625
|
+
predicate,
|
|
626
|
+
)
|
|
591
627
|
raise
|
|
592
628
|
else:
|
|
593
|
-
logger.error(
|
|
629
|
+
logger.error("Got %s and no time is left to retry", e)
|
|
594
630
|
raise
|
|
595
631
|
else:
|
|
596
|
-
go.pop(
|
|
632
|
+
go.pop()
|
|
597
633
|
|
|
598
|
-
delays = iter(
|
|
599
|
-
expiration = time.time(
|
|
600
|
-
delay = next(
|
|
634
|
+
delays = iter(delays)
|
|
635
|
+
expiration = time.time() + timeout
|
|
636
|
+
delay = next(delays)
|
|
601
637
|
while go:
|
|
602
|
-
yield repeated_attempt(
|
|
603
|
-
delay = next(
|
|
638
|
+
yield repeated_attempt(delay)
|
|
639
|
+
delay = next(delays, delay)
|
|
604
640
|
else:
|
|
641
|
+
|
|
605
642
|
@contextmanager
|
|
606
|
-
def single_attempt(
|
|
643
|
+
def single_attempt():
|
|
607
644
|
yield
|
|
608
645
|
|
|
609
|
-
yield single_attempt(
|
|
646
|
+
yield single_attempt()
|
|
647
|
+
|
|
610
648
|
|
|
611
649
|
# Decorator to retry tests that fail. Needs to be called with
|
|
612
650
|
# prepare=[tearDown, setUp] if the test class has tear down and set up that
|