toil 6.1.0a1__py3-none-any.whl → 8.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (193) hide show
  1. toil/__init__.py +122 -315
  2. toil/batchSystems/__init__.py +1 -0
  3. toil/batchSystems/abstractBatchSystem.py +173 -89
  4. toil/batchSystems/abstractGridEngineBatchSystem.py +272 -148
  5. toil/batchSystems/awsBatch.py +244 -135
  6. toil/batchSystems/cleanup_support.py +26 -16
  7. toil/batchSystems/contained_executor.py +31 -28
  8. toil/batchSystems/gridengine.py +86 -50
  9. toil/batchSystems/htcondor.py +166 -89
  10. toil/batchSystems/kubernetes.py +632 -382
  11. toil/batchSystems/local_support.py +20 -15
  12. toil/batchSystems/lsf.py +134 -81
  13. toil/batchSystems/lsfHelper.py +13 -11
  14. toil/batchSystems/mesos/__init__.py +41 -29
  15. toil/batchSystems/mesos/batchSystem.py +290 -151
  16. toil/batchSystems/mesos/executor.py +79 -50
  17. toil/batchSystems/mesos/test/__init__.py +31 -23
  18. toil/batchSystems/options.py +46 -28
  19. toil/batchSystems/registry.py +53 -19
  20. toil/batchSystems/singleMachine.py +296 -125
  21. toil/batchSystems/slurm.py +603 -138
  22. toil/batchSystems/torque.py +47 -33
  23. toil/bus.py +186 -76
  24. toil/common.py +664 -368
  25. toil/cwl/__init__.py +1 -1
  26. toil/cwl/cwltoil.py +1136 -483
  27. toil/cwl/utils.py +17 -22
  28. toil/deferred.py +63 -42
  29. toil/exceptions.py +5 -3
  30. toil/fileStores/__init__.py +5 -5
  31. toil/fileStores/abstractFileStore.py +140 -60
  32. toil/fileStores/cachingFileStore.py +717 -269
  33. toil/fileStores/nonCachingFileStore.py +116 -87
  34. toil/job.py +1225 -368
  35. toil/jobStores/abstractJobStore.py +416 -266
  36. toil/jobStores/aws/jobStore.py +863 -477
  37. toil/jobStores/aws/utils.py +201 -120
  38. toil/jobStores/conftest.py +3 -2
  39. toil/jobStores/fileJobStore.py +292 -154
  40. toil/jobStores/googleJobStore.py +140 -74
  41. toil/jobStores/utils.py +36 -15
  42. toil/leader.py +668 -272
  43. toil/lib/accelerators.py +115 -18
  44. toil/lib/aws/__init__.py +74 -31
  45. toil/lib/aws/ami.py +122 -87
  46. toil/lib/aws/iam.py +284 -108
  47. toil/lib/aws/s3.py +31 -0
  48. toil/lib/aws/session.py +214 -39
  49. toil/lib/aws/utils.py +287 -231
  50. toil/lib/bioio.py +13 -5
  51. toil/lib/compatibility.py +11 -6
  52. toil/lib/conversions.py +104 -47
  53. toil/lib/docker.py +131 -103
  54. toil/lib/ec2.py +361 -199
  55. toil/lib/ec2nodes.py +174 -106
  56. toil/lib/encryption/_dummy.py +5 -3
  57. toil/lib/encryption/_nacl.py +10 -6
  58. toil/lib/encryption/conftest.py +1 -0
  59. toil/lib/exceptions.py +26 -7
  60. toil/lib/expando.py +5 -3
  61. toil/lib/ftp_utils.py +217 -0
  62. toil/lib/generatedEC2Lists.py +127 -19
  63. toil/lib/humanize.py +6 -2
  64. toil/lib/integration.py +341 -0
  65. toil/lib/io.py +141 -15
  66. toil/lib/iterables.py +4 -2
  67. toil/lib/memoize.py +12 -8
  68. toil/lib/misc.py +66 -21
  69. toil/lib/objects.py +2 -2
  70. toil/lib/resources.py +68 -15
  71. toil/lib/retry.py +126 -81
  72. toil/lib/threading.py +299 -82
  73. toil/lib/throttle.py +16 -15
  74. toil/options/common.py +843 -409
  75. toil/options/cwl.py +175 -90
  76. toil/options/runner.py +50 -0
  77. toil/options/wdl.py +73 -17
  78. toil/provisioners/__init__.py +117 -46
  79. toil/provisioners/abstractProvisioner.py +332 -157
  80. toil/provisioners/aws/__init__.py +70 -33
  81. toil/provisioners/aws/awsProvisioner.py +1145 -715
  82. toil/provisioners/clusterScaler.py +541 -279
  83. toil/provisioners/gceProvisioner.py +282 -179
  84. toil/provisioners/node.py +155 -79
  85. toil/realtimeLogger.py +34 -22
  86. toil/resource.py +137 -75
  87. toil/server/app.py +128 -62
  88. toil/server/celery_app.py +3 -1
  89. toil/server/cli/wes_cwl_runner.py +82 -53
  90. toil/server/utils.py +54 -28
  91. toil/server/wes/abstract_backend.py +64 -26
  92. toil/server/wes/amazon_wes_utils.py +21 -15
  93. toil/server/wes/tasks.py +121 -63
  94. toil/server/wes/toil_backend.py +142 -107
  95. toil/server/wsgi_app.py +4 -3
  96. toil/serviceManager.py +58 -22
  97. toil/statsAndLogging.py +224 -70
  98. toil/test/__init__.py +282 -183
  99. toil/test/batchSystems/batchSystemTest.py +460 -210
  100. toil/test/batchSystems/batch_system_plugin_test.py +90 -0
  101. toil/test/batchSystems/test_gridengine.py +173 -0
  102. toil/test/batchSystems/test_lsf_helper.py +67 -58
  103. toil/test/batchSystems/test_slurm.py +110 -49
  104. toil/test/cactus/__init__.py +0 -0
  105. toil/test/cactus/test_cactus_integration.py +56 -0
  106. toil/test/cwl/cwlTest.py +496 -287
  107. toil/test/cwl/measure_default_memory.cwl +12 -0
  108. toil/test/cwl/not_run_required_input.cwl +29 -0
  109. toil/test/cwl/scatter_duplicate_outputs.cwl +40 -0
  110. toil/test/cwl/seqtk_seq.cwl +1 -1
  111. toil/test/docs/scriptsTest.py +69 -46
  112. toil/test/jobStores/jobStoreTest.py +427 -264
  113. toil/test/lib/aws/test_iam.py +118 -50
  114. toil/test/lib/aws/test_s3.py +16 -9
  115. toil/test/lib/aws/test_utils.py +5 -6
  116. toil/test/lib/dockerTest.py +118 -141
  117. toil/test/lib/test_conversions.py +113 -115
  118. toil/test/lib/test_ec2.py +58 -50
  119. toil/test/lib/test_integration.py +104 -0
  120. toil/test/lib/test_misc.py +12 -5
  121. toil/test/mesos/MesosDataStructuresTest.py +23 -10
  122. toil/test/mesos/helloWorld.py +7 -6
  123. toil/test/mesos/stress.py +25 -20
  124. toil/test/options/__init__.py +13 -0
  125. toil/test/options/options.py +42 -0
  126. toil/test/provisioners/aws/awsProvisionerTest.py +320 -150
  127. toil/test/provisioners/clusterScalerTest.py +440 -250
  128. toil/test/provisioners/clusterTest.py +166 -44
  129. toil/test/provisioners/gceProvisionerTest.py +174 -100
  130. toil/test/provisioners/provisionerTest.py +25 -13
  131. toil/test/provisioners/restartScript.py +5 -4
  132. toil/test/server/serverTest.py +188 -141
  133. toil/test/sort/restart_sort.py +137 -68
  134. toil/test/sort/sort.py +134 -66
  135. toil/test/sort/sortTest.py +91 -49
  136. toil/test/src/autoDeploymentTest.py +141 -101
  137. toil/test/src/busTest.py +20 -18
  138. toil/test/src/checkpointTest.py +8 -2
  139. toil/test/src/deferredFunctionTest.py +49 -35
  140. toil/test/src/dockerCheckTest.py +32 -24
  141. toil/test/src/environmentTest.py +135 -0
  142. toil/test/src/fileStoreTest.py +539 -272
  143. toil/test/src/helloWorldTest.py +7 -4
  144. toil/test/src/importExportFileTest.py +61 -31
  145. toil/test/src/jobDescriptionTest.py +46 -21
  146. toil/test/src/jobEncapsulationTest.py +2 -0
  147. toil/test/src/jobFileStoreTest.py +74 -50
  148. toil/test/src/jobServiceTest.py +187 -73
  149. toil/test/src/jobTest.py +121 -71
  150. toil/test/src/miscTests.py +19 -18
  151. toil/test/src/promisedRequirementTest.py +82 -36
  152. toil/test/src/promisesTest.py +7 -6
  153. toil/test/src/realtimeLoggerTest.py +10 -6
  154. toil/test/src/regularLogTest.py +71 -37
  155. toil/test/src/resourceTest.py +80 -49
  156. toil/test/src/restartDAGTest.py +36 -22
  157. toil/test/src/resumabilityTest.py +9 -2
  158. toil/test/src/retainTempDirTest.py +45 -14
  159. toil/test/src/systemTest.py +12 -8
  160. toil/test/src/threadingTest.py +44 -25
  161. toil/test/src/toilContextManagerTest.py +10 -7
  162. toil/test/src/userDefinedJobArgTypeTest.py +8 -5
  163. toil/test/src/workerTest.py +73 -23
  164. toil/test/utils/toilDebugTest.py +103 -33
  165. toil/test/utils/toilKillTest.py +4 -5
  166. toil/test/utils/utilsTest.py +245 -106
  167. toil/test/wdl/wdltoil_test.py +818 -149
  168. toil/test/wdl/wdltoil_test_kubernetes.py +91 -0
  169. toil/toilState.py +120 -35
  170. toil/utils/toilConfig.py +13 -4
  171. toil/utils/toilDebugFile.py +44 -27
  172. toil/utils/toilDebugJob.py +214 -27
  173. toil/utils/toilDestroyCluster.py +11 -6
  174. toil/utils/toilKill.py +8 -3
  175. toil/utils/toilLaunchCluster.py +256 -140
  176. toil/utils/toilMain.py +37 -16
  177. toil/utils/toilRsyncCluster.py +32 -14
  178. toil/utils/toilSshCluster.py +49 -22
  179. toil/utils/toilStats.py +356 -273
  180. toil/utils/toilStatus.py +292 -139
  181. toil/utils/toilUpdateEC2Instances.py +3 -1
  182. toil/version.py +12 -12
  183. toil/wdl/utils.py +5 -5
  184. toil/wdl/wdltoil.py +3913 -1033
  185. toil/worker.py +367 -184
  186. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/LICENSE +25 -0
  187. toil-8.0.0.dist-info/METADATA +173 -0
  188. toil-8.0.0.dist-info/RECORD +253 -0
  189. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/WHEEL +1 -1
  190. toil-6.1.0a1.dist-info/METADATA +0 -125
  191. toil-6.1.0a1.dist-info/RECORD +0 -237
  192. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/entry_points.txt +0 -0
  193. {toil-6.1.0a1.dist-info → toil-8.0.0.dist-info}/top_level.txt +0 -0
toil/lib/ec2.py CHANGED
@@ -1,22 +1,29 @@
1
1
  import logging
2
2
  import time
3
3
  from base64 import b64encode
4
- from operator import attrgetter
5
- from typing import Dict, Iterable, List, Optional, Union
6
-
7
- from boto3.resources.base import ServiceResource
8
- from boto.ec2.instance import Instance as Boto2Instance
9
- from boto.ec2.spotinstancerequest import SpotInstanceRequest
10
- from botocore.client import BaseClient
4
+ from collections.abc import Generator, Iterable, Mapping
5
+ from typing import TYPE_CHECKING, Any, Callable, Optional, Union
11
6
 
12
7
  from toil.lib.aws.session import establish_boto3_session
13
8
  from toil.lib.aws.utils import flatten_tags
14
9
  from toil.lib.exceptions import panic
15
- from toil.lib.retry import (ErrorCondition,
16
- get_error_code,
17
- get_error_message,
18
- old_retry,
19
- retry)
10
+ from toil.lib.retry import (
11
+ ErrorCondition,
12
+ get_error_code,
13
+ get_error_message,
14
+ old_retry,
15
+ retry,
16
+ )
17
+
18
+ if TYPE_CHECKING:
19
+ from mypy_boto3_autoscaling.client import AutoScalingClient
20
+ from mypy_boto3_ec2.client import EC2Client
21
+ from mypy_boto3_ec2.service_resource import EC2ServiceResource, Instance
22
+ from mypy_boto3_ec2.type_defs import (
23
+ DescribeInstancesResultTypeDef,
24
+ InstanceTypeDef,
25
+ SpotInstanceRequestTypeDef,
26
+ )
20
27
 
21
28
  a_short_time = 5
22
29
  a_long_time = 60 * 60
@@ -27,44 +34,55 @@ class UserError(RuntimeError):
27
34
  def __init__(self, message=None, cause=None):
28
35
  if (message is None) == (cause is None):
29
36
  raise RuntimeError("Must pass either message or cause.")
30
- super().__init__(
31
- message if cause is None else cause.message)
37
+ super().__init__(message if cause is None else cause.message)
32
38
 
33
39
 
34
40
  def not_found(e):
35
41
  try:
36
- return get_error_code(e).endswith('.NotFound')
42
+ return get_error_code(e).endswith(".NotFound")
37
43
  except ValueError:
38
44
  # Not the right kind of error
39
45
  return False
40
46
 
47
+
41
48
  def inconsistencies_detected(e):
42
- if get_error_code(e) == 'InvalidGroup.NotFound':
49
+ if get_error_code(e) == "InvalidGroup.NotFound":
43
50
  return True
44
51
  m = get_error_message(e).lower()
45
- matches = ('invalid iam instance profile' in m) or ('no associated iam roles' in m)
52
+ matches = ("invalid iam instance profile" in m) or ("no associated iam roles" in m)
46
53
  return matches
47
54
 
55
+
48
56
  # We also define these error categories for the new retry decorator
49
- INCONSISTENCY_ERRORS = [ErrorCondition(boto_error_codes=['InvalidGroup.NotFound']),
50
- ErrorCondition(error_message_must_include='Invalid IAM Instance Profile'),
51
- ErrorCondition(error_message_must_include='no associated IAM Roles')]
57
+ INCONSISTENCY_ERRORS = [
58
+ ErrorCondition(boto_error_codes=["InvalidGroup.NotFound"]),
59
+ ErrorCondition(error_message_must_include="Invalid IAM Instance Profile"),
60
+ ErrorCondition(error_message_must_include="no associated IAM Roles"),
61
+ ]
52
62
 
53
63
 
54
64
  def retry_ec2(t=a_short_time, retry_for=10 * a_short_time, retry_while=not_found):
55
- return old_retry(delays=(t, t, t * 2, t * 4),
56
- timeout=retry_for,
57
- predicate=retry_while)
65
+ return old_retry(
66
+ delays=(t, t, t * 2, t * 4), timeout=retry_for, predicate=retry_while
67
+ )
58
68
 
59
69
 
60
70
  class UnexpectedResourceState(Exception):
61
71
  def __init__(self, resource, to_state, state):
62
72
  super().__init__(
63
- "Expected state of %s to be '%s' but got '%s'" %
64
- (resource, to_state, state))
65
-
66
- def wait_transition(resource, from_states, to_state,
67
- state_getter=attrgetter('state')):
73
+ "Expected state of %s to be '%s' but got '%s'" % (resource, to_state, state)
74
+ )
75
+
76
+
77
+ def wait_transition(
78
+ boto3_ec2: "EC2Client",
79
+ resource: "InstanceTypeDef",
80
+ from_states: Iterable[str],
81
+ to_state: str,
82
+ state_getter: Callable[["InstanceTypeDef"], str] = lambda x: x.get("State").get(
83
+ "Name"
84
+ ),
85
+ ):
68
86
  """
69
87
  Wait until the specified EC2 resource (instance, image, volume, ...) transitions from any
70
88
  of the given 'from' states to the specified 'to' state. If the instance is found in a state
@@ -76,59 +94,84 @@ def wait_transition(resource, from_states, to_state,
76
94
  :param to_state: the state of the resource when this method returns
77
95
  """
78
96
  state = state_getter(resource)
97
+ instance_id = resource["InstanceId"]
79
98
  while state in from_states:
80
99
  time.sleep(a_short_time)
81
100
  for attempt in retry_ec2():
82
101
  with attempt:
83
- resource.update(validate=True)
102
+ described = boto3_ec2.describe_instances(InstanceIds=[instance_id])
103
+ resource = described["Reservations"][0]["Instances"][
104
+ 0
105
+ ] # there should only be one requested
84
106
  state = state_getter(resource)
85
107
  if state != to_state:
86
108
  raise UnexpectedResourceState(resource, to_state, state)
87
109
 
88
110
 
89
- def wait_instances_running(ec2, instances: Iterable[Boto2Instance]) -> Iterable[Boto2Instance]:
111
+ def wait_instances_running(
112
+ boto3_ec2: "EC2Client", instances: Iterable["InstanceTypeDef"]
113
+ ) -> Generator["InstanceTypeDef", None, None]:
90
114
  """
91
115
  Wait until no instance in the given iterable is 'pending'. Yield every instance that
92
116
  entered the running state as soon as it does.
93
117
 
94
- :param boto.ec2.connection.EC2Connection ec2: the EC2 connection to use for making requests
95
- :param Iterable[Boto2Instance] instances: the instances to wait on
96
- :rtype: Iterable[Boto2Instance]
118
+ :param boto3_ec2: the EC2 connection to use for making requests
119
+ :param instances: the instances to wait on
97
120
  """
98
121
  running_ids = set()
99
122
  other_ids = set()
100
123
  while True:
101
124
  pending_ids = set()
102
125
  for i in instances:
103
- if i.state == 'pending':
104
- pending_ids.add(i.id)
105
- elif i.state == 'running':
106
- if i.id in running_ids:
107
- raise RuntimeError("An instance was already added to the list of running instance IDs. Maybe there is a duplicate.")
108
- running_ids.add(i.id)
126
+ i: "InstanceTypeDef"
127
+ if i["State"]["Name"] == "pending":
128
+ pending_ids.add(i["InstanceId"])
129
+ elif i["State"]["Name"] == "running":
130
+ if i["InstanceId"] in running_ids:
131
+ raise RuntimeError(
132
+ "An instance was already added to the list of running instance IDs. Maybe there is a duplicate."
133
+ )
134
+ running_ids.add(i["InstanceId"])
109
135
  yield i
110
136
  else:
111
- if i.id in other_ids:
112
- raise RuntimeError("An instance was already added to the list of other instances. Maybe there is a duplicate.")
113
- other_ids.add(i.id)
137
+ if i["InstanceId"] in other_ids:
138
+ raise RuntimeError(
139
+ "An instance was already added to the list of other instances. Maybe there is a duplicate."
140
+ )
141
+ other_ids.add(i["InstanceId"])
114
142
  yield i
115
- logger.info('%i instance(s) pending, %i running, %i other.',
116
- *list(map(len, (pending_ids, running_ids, other_ids))))
143
+ logger.info(
144
+ "%i instance(s) pending, %i running, %i other.",
145
+ *list(map(len, (pending_ids, running_ids, other_ids))),
146
+ )
117
147
  if not pending_ids:
118
148
  break
119
149
  seconds = max(a_short_time, min(len(pending_ids), 10 * a_short_time))
120
- logger.info('Sleeping for %is', seconds)
150
+ logger.info("Sleeping for %is", seconds)
121
151
  time.sleep(seconds)
122
152
  for attempt in retry_ec2():
123
153
  with attempt:
124
- instances = ec2.get_only_instances(list(pending_ids))
125
-
126
-
127
- def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], timeout: float = None, tentative: bool = False) -> Iterable[List[SpotInstanceRequest]]:
154
+ described_instances = boto3_ec2.describe_instances(
155
+ InstanceIds=list(pending_ids)
156
+ )
157
+ instances = [
158
+ instance
159
+ for reservation in described_instances["Reservations"]
160
+ for instance in reservation["Instances"]
161
+ ]
162
+
163
+
164
+ def wait_spot_requests_active(
165
+ boto3_ec2: "EC2Client",
166
+ requests: Iterable["SpotInstanceRequestTypeDef"],
167
+ timeout: float = None,
168
+ tentative: bool = False,
169
+ ) -> Iterable[list["SpotInstanceRequestTypeDef"]]:
128
170
  """
129
171
  Wait until no spot request in the given iterator is in the 'open' state or, optionally,
130
172
  a timeout occurs. Yield spot requests as soon as they leave the 'open' state.
131
173
 
174
+ :param boto3_ec2: ec2 client
132
175
  :param requests: The requests to wait on.
133
176
 
134
177
  :param timeout: Maximum time in seconds to spend waiting or None to wait forever. If a
@@ -145,55 +188,68 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
145
188
  other_ids = set()
146
189
  open_ids = None
147
190
 
148
- def cancel():
149
- logger.warning('Cancelling remaining %i spot requests.', len(open_ids))
150
- ec2.cancel_spot_instance_requests(list(open_ids))
191
+ def cancel() -> None:
192
+ logger.warning("Cancelling remaining %i spot requests.", len(open_ids))
193
+ boto3_ec2.cancel_spot_instance_requests(SpotInstanceRequestIds=list(open_ids))
151
194
 
152
- def spot_request_not_found(e):
153
- return get_error_code(e) == 'InvalidSpotInstanceRequestID.NotFound'
195
+ def spot_request_not_found(e: Exception) -> bool:
196
+ return get_error_code(e) == "InvalidSpotInstanceRequestID.NotFound"
154
197
 
155
198
  try:
156
199
  while True:
157
200
  open_ids, eval_ids, fulfill_ids = set(), set(), set()
158
201
  batch = []
159
202
  for r in requests:
160
- if r.state == 'open':
161
- open_ids.add(r.id)
162
- if r.status.code == 'pending-evaluation':
163
- eval_ids.add(r.id)
164
- elif r.status.code == 'pending-fulfillment':
165
- fulfill_ids.add(r.id)
203
+ r: "SpotInstanceRequestTypeDef" # pycharm thinks it is a string
204
+ if r["State"] == "open":
205
+ open_ids.add(r["InstanceId"])
206
+ if r["Status"] == "pending-evaluation":
207
+ eval_ids.add(r["InstanceId"])
208
+ elif r["Status"] == "pending-fulfillment":
209
+ fulfill_ids.add(r["InstanceId"])
166
210
  else:
167
211
  logger.info(
168
- 'Request %s entered status %s indicating that it will not be '
169
- 'fulfilled anytime soon.', r.id, r.status.code)
170
- elif r.state == 'active':
171
- if r.id in active_ids:
172
- raise RuntimeError("A request was already added to the list of active requests. Maybe there are duplicate requests.")
173
- active_ids.add(r.id)
212
+ "Request %s entered status %s indicating that it will not be "
213
+ "fulfilled anytime soon.",
214
+ r["InstanceId"],
215
+ r["Status"],
216
+ )
217
+ elif r["State"] == "active":
218
+ if r["InstanceId"] in active_ids:
219
+ raise RuntimeError(
220
+ "A request was already added to the list of active requests. Maybe there are duplicate requests."
221
+ )
222
+ active_ids.add(r["InstanceId"])
174
223
  batch.append(r)
175
224
  else:
176
- if r.id in other_ids:
177
- raise RuntimeError("A request was already added to the list of other IDs. Maybe there are duplicate requests.")
178
- other_ids.add(r.id)
225
+ if r["InstanceId"] in other_ids:
226
+ raise RuntimeError(
227
+ "A request was already added to the list of other IDs. Maybe there are duplicate requests."
228
+ )
229
+ other_ids.add(r["InstanceId"])
179
230
  batch.append(r)
180
231
  if batch:
181
232
  yield batch
182
- logger.info('%i spot requests(s) are open (%i of which are pending evaluation and %i '
183
- 'are pending fulfillment), %i are active and %i are in another state.',
184
- *list(map(len, (open_ids, eval_ids, fulfill_ids, active_ids, other_ids))))
233
+ logger.info(
234
+ "%i spot requests(s) are open (%i of which are pending evaluation and %i "
235
+ "are pending fulfillment), %i are active and %i are in another state.",
236
+ *list(
237
+ map(len, (open_ids, eval_ids, fulfill_ids, active_ids, other_ids))
238
+ ),
239
+ )
185
240
  if not open_ids or tentative and not eval_ids and not fulfill_ids:
186
241
  break
187
242
  sleep_time = 2 * a_short_time
188
243
  if timeout is not None and time.time() + sleep_time >= timeout:
189
- logger.warning('Timed out waiting for spot requests.')
244
+ logger.warning("Timed out waiting for spot requests.")
190
245
  break
191
- logger.info('Sleeping for %is', sleep_time)
246
+ logger.info("Sleeping for %is", sleep_time)
192
247
  time.sleep(sleep_time)
193
248
  for attempt in retry_ec2(retry_while=spot_request_not_found):
194
249
  with attempt:
195
- requests = ec2.get_all_spot_instance_requests(
196
- list(open_ids))
250
+ requests = boto3_ec2.describe_spot_instance_requests(
251
+ SpotInstanceRequestIds=list(open_ids)
252
+ )
197
253
  except BaseException:
198
254
  if open_ids:
199
255
  with panic(logger):
@@ -204,73 +260,125 @@ def wait_spot_requests_active(ec2, requests: Iterable[SpotInstanceRequest], time
204
260
  cancel()
205
261
 
206
262
 
207
- def create_spot_instances(ec2, price, image_id, spec, num_instances=1, timeout=None, tentative=False, tags=None) -> Iterable[List[Boto2Instance]]:
263
+ def create_spot_instances(
264
+ boto3_ec2: "EC2Client",
265
+ price,
266
+ image_id,
267
+ spec,
268
+ num_instances=1,
269
+ timeout=None,
270
+ tentative=False,
271
+ tags=None,
272
+ ) -> Generator["DescribeInstancesResultTypeDef", None, None]:
208
273
  """
209
274
  Create instances on the spot market.
210
275
  """
211
- def spotRequestNotFound(e):
212
- return getattr(e, 'error_code', None) == "InvalidSpotInstanceRequestID.NotFound"
213
276
 
214
- for attempt in retry_ec2(retry_for=a_long_time,
215
- retry_while=inconsistencies_detected):
277
+ def spotRequestNotFound(e):
278
+ return getattr(e, "error_code", None) == "InvalidSpotInstanceRequestID.NotFound"
279
+
280
+ spec["LaunchSpecification"].update(
281
+ {"ImageId": image_id}
282
+ ) # boto3 image id is in the launch specification
283
+ for attempt in retry_ec2(
284
+ retry_for=a_long_time, retry_while=inconsistencies_detected
285
+ ):
216
286
  with attempt:
217
- requests = ec2.request_spot_instances(
218
- price, image_id, count=num_instances, **spec)
287
+ requests_dict = boto3_ec2.request_spot_instances(
288
+ SpotPrice=price, InstanceCount=num_instances, **spec
289
+ )
290
+ requests = requests_dict["SpotInstanceRequests"]
219
291
 
220
292
  if tags is not None:
221
- for requestID in (request.id for request in requests):
293
+ for requestID in (request["SpotInstanceRequestId"] for request in requests):
222
294
  for attempt in retry_ec2(retry_while=spotRequestNotFound):
223
295
  with attempt:
224
- ec2.create_tags([requestID], tags)
296
+ boto3_ec2.create_tags(Resources=[requestID], Tags=tags)
225
297
 
226
298
  num_active, num_other = 0, 0
227
299
  # noinspection PyUnboundLocalVariable,PyTypeChecker
228
300
  # request_spot_instances's type annotation is wrong
229
- for batch in wait_spot_requests_active(ec2,
230
- requests,
231
- timeout=timeout,
232
- tentative=tentative):
301
+ for batch in wait_spot_requests_active(
302
+ boto3_ec2, requests, timeout=timeout, tentative=tentative
303
+ ):
233
304
  instance_ids = []
234
305
  for request in batch:
235
- if request.state == 'active':
236
- instance_ids.append(request.instance_id)
306
+ request: "SpotInstanceRequestTypeDef"
307
+ if request["State"] == "active":
308
+ instance_ids.append(request["InstanceId"])
237
309
  num_active += 1
238
310
  else:
239
311
  logger.info(
240
- 'Request %s in unexpected state %s.',
241
- request.id,
242
- request.state)
312
+ "Request %s in unexpected state %s.",
313
+ request["InstanceId"],
314
+ request["State"],
315
+ )
243
316
  num_other += 1
244
317
  if instance_ids:
245
318
  # This next line is the reason we batch. It's so we can get multiple instances in
246
319
  # a single request.
247
- yield ec2.get_only_instances(instance_ids)
320
+ for instance_id in instance_ids:
321
+ for attempt in retry_ec2():
322
+ with attempt:
323
+ # Increase hop limit from 1 to use Instance Metadata V2
324
+ boto3_ec2.modify_instance_metadata_options(
325
+ InstanceId=instance_id, HttpPutResponseHopLimit=3
326
+ )
327
+ yield boto3_ec2.describe_instances(InstanceIds=instance_ids)
248
328
  if not num_active:
249
- message = 'None of the spot requests entered the active state'
329
+ message = "None of the spot requests entered the active state"
250
330
  if tentative:
251
- logger.warning(message + '.')
331
+ logger.warning(message + ".")
252
332
  else:
253
333
  raise RuntimeError(message)
254
334
  if num_other:
255
- logger.warning('%i request(s) entered a state other than active.', num_other)
335
+ logger.warning("%i request(s) entered a state other than active.", num_other)
256
336
 
257
337
 
258
- def create_ondemand_instances(ec2, image_id, spec, num_instances=1) -> List[Boto2Instance]:
338
+ def create_ondemand_instances(
339
+ boto3_ec2: "EC2Client",
340
+ image_id: str,
341
+ spec: Mapping[str, Any],
342
+ num_instances: int = 1,
343
+ ) -> list["InstanceTypeDef"]:
259
344
  """
260
345
  Requests the RunInstances EC2 API call but accounts for the race between recently created
261
346
  instance profiles, IAM roles and an instance creation that refers to them.
262
-
263
- :rtype: List[Boto2Instance]
264
347
  """
265
- instance_type = spec['instance_type']
266
- logger.info('Creating %s instance(s) ... ', instance_type)
267
- for attempt in retry_ec2(retry_for=a_long_time,
268
- retry_while=inconsistencies_detected):
348
+ instance_type = spec["InstanceType"]
349
+ logger.info("Creating %s instance(s) ... ", instance_type)
350
+ boto_instance_list = []
351
+ for attempt in retry_ec2(
352
+ retry_for=a_long_time, retry_while=inconsistencies_detected
353
+ ):
269
354
  with attempt:
270
- return ec2.run_instances(image_id,
271
- min_count=num_instances,
272
- max_count=num_instances,
273
- **spec).instances
355
+ boto_instance_list: list["InstanceTypeDef"] = boto3_ec2.run_instances(
356
+ ImageId=image_id, MinCount=num_instances, MaxCount=num_instances, **spec
357
+ )["Instances"]
358
+
359
+ return boto_instance_list
360
+
361
+
362
+ def increase_instance_hop_limit(
363
+ boto3_ec2: "EC2Client", boto_instance_list: list["InstanceTypeDef"]
364
+ ) -> None:
365
+ """
366
+ Increase the default HTTP hop limit, as we are running Toil and Kubernetes inside a Docker container, so the default
367
+ hop limit of 1 will not be enough when grabbing metadata information with ec2_metadata
368
+
369
+ Must be called after the instances are guaranteed to be running.
370
+
371
+ :param boto_instance_list: List of boto instances to modify
372
+ :return:
373
+ """
374
+ for boto_instance in boto_instance_list:
375
+ instance_id = boto_instance["InstanceId"]
376
+ for attempt in retry_ec2():
377
+ with attempt:
378
+ # Increase hop limit from 1 to use Instance Metadata V2
379
+ boto3_ec2.modify_instance_metadata_options(
380
+ InstanceId=instance_id, HttpPutResponseHopLimit=3
381
+ )
274
382
 
275
383
 
276
384
  def prune(bushy: dict) -> dict:
@@ -287,32 +395,37 @@ def prune(bushy: dict) -> dict:
287
395
 
288
396
  # We need a module-level client to get the dynamically-generated error types to
289
397
  # catch, and to wait on IAM items.
290
- iam_client = establish_boto3_session().client('iam')
398
+ iam_client = establish_boto3_session().client("iam")
399
+
291
400
 
292
401
  # exception is generated by a factory so we weirdly need a client instance to reference it
293
- @retry(errors=[iam_client.exceptions.NoSuchEntityException],
294
- intervals=[1, 1, 2, 4, 8, 16, 32, 64])
402
+ @retry(
403
+ errors=[iam_client.exceptions.NoSuchEntityException],
404
+ intervals=[1, 1, 2, 4, 8, 16, 32, 64],
405
+ )
295
406
  def wait_until_instance_profile_arn_exists(instance_profile_arn: str):
296
407
  # TODO: We have no guarantee that the ARN contains the name.
297
- instance_profile_name = instance_profile_arn.split(':instance-profile/')[-1]
408
+ instance_profile_name = instance_profile_arn.split(":instance-profile/")[-1]
298
409
  logger.debug("Checking for instance profile %s...", instance_profile_name)
299
410
  iam_client.get_instance_profile(InstanceProfileName=instance_profile_name)
300
411
  logger.debug("Instance profile found")
301
412
 
302
413
 
303
414
  @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
304
- def create_instances(ec2_resource: ServiceResource,
305
- image_id: str,
306
- key_name: str,
307
- instance_type: str,
308
- num_instances: int = 1,
309
- security_group_ids: Optional[List] = None,
310
- user_data: Optional[Union[str, bytes]] = None,
311
- block_device_map: Optional[List[Dict]] = None,
312
- instance_profile_arn: Optional[str] = None,
313
- placement_az: Optional[str] = None,
314
- subnet_id: str = None,
315
- tags: Optional[Dict[str, str]] = None) -> List[dict]:
415
+ def create_instances(
416
+ ec2_resource: "EC2ServiceResource",
417
+ image_id: str,
418
+ key_name: str,
419
+ instance_type: str,
420
+ num_instances: int = 1,
421
+ security_group_ids: Optional[list] = None,
422
+ user_data: Optional[Union[str, bytes]] = None,
423
+ block_device_map: Optional[list[dict]] = None,
424
+ instance_profile_arn: Optional[str] = None,
425
+ placement_az: Optional[str] = None,
426
+ subnet_id: str = None,
427
+ tags: Optional[dict[str, str]] = None,
428
+ ) -> list["Instance"]:
316
429
  """
317
430
  Replaces create_ondemand_instances. Uses boto3 and returns a list of Boto3 instance dicts.
318
431
 
@@ -323,20 +436,25 @@ def create_instances(ec2_resource: ServiceResource,
323
436
 
324
437
  Tags, if given, are applied to the instances, and all volumes.
325
438
  """
326
- logger.info('Creating %s instance(s) ... ', instance_type)
439
+ logger.info("Creating %s instance(s) ... ", instance_type)
327
440
 
328
441
  if isinstance(user_data, str):
329
- user_data = user_data.encode('utf-8')
330
-
331
- request = {'ImageId': image_id,
332
- 'MinCount': num_instances,
333
- 'MaxCount': num_instances,
334
- 'KeyName': key_name,
335
- 'SecurityGroupIds': security_group_ids,
336
- 'InstanceType': instance_type,
337
- 'UserData': user_data,
338
- 'BlockDeviceMappings': block_device_map,
339
- 'SubnetId': subnet_id}
442
+ user_data = user_data.encode("utf-8")
443
+
444
+ request = {
445
+ "ImageId": image_id,
446
+ "MinCount": num_instances,
447
+ "MaxCount": num_instances,
448
+ "KeyName": key_name,
449
+ "SecurityGroupIds": security_group_ids,
450
+ "InstanceType": instance_type,
451
+ "UserData": user_data,
452
+ "BlockDeviceMappings": block_device_map,
453
+ "SubnetId": subnet_id,
454
+ # Metadata V2 defaults hops to 1, which is an issue when running inside a docker container
455
+ # https://github.com/adamchainz/ec2-metadata?tab=readme-ov-file#instance-metadata-service-version-2
456
+ "MetadataOptions": {"HttpPutResponseHopLimit": 3},
457
+ }
340
458
 
341
459
  if instance_profile_arn:
342
460
  # We could just retry when we get an error because the ARN doesn't
@@ -344,32 +462,37 @@ def create_instances(ec2_resource: ServiceResource,
344
462
  wait_until_instance_profile_arn_exists(instance_profile_arn)
345
463
 
346
464
  # Add it to the request
347
- request['IamInstanceProfile'] = {'Arn': instance_profile_arn}
465
+ request["IamInstanceProfile"] = {"Arn": instance_profile_arn}
348
466
 
349
467
  if placement_az:
350
- request['Placement'] = {'AvailabilityZone': placement_az}
468
+ request["Placement"] = {"AvailabilityZone": placement_az}
351
469
 
352
470
  if tags:
353
471
  # Tag everything when we make it.
354
472
  flat_tags = flatten_tags(tags)
355
- request['TagSpecifications'] = [{'ResourceType': 'instance', 'Tags': flat_tags},
356
- {'ResourceType': 'volume', 'Tags': flat_tags}]
473
+ request["TagSpecifications"] = [
474
+ {"ResourceType": "instance", "Tags": flat_tags},
475
+ {"ResourceType": "volume", "Tags": flat_tags},
476
+ ]
357
477
 
358
478
  return ec2_resource.create_instances(**prune(request))
359
479
 
480
+
360
481
  @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
361
- def create_launch_template(ec2_client: BaseClient,
362
- template_name: str,
363
- image_id: str,
364
- key_name: str,
365
- instance_type: str,
366
- security_group_ids: Optional[List] = None,
367
- user_data: Optional[Union[str, bytes]] = None,
368
- block_device_map: Optional[List[Dict]] = None,
369
- instance_profile_arn: Optional[str] = None,
370
- placement_az: Optional[str] = None,
371
- subnet_id: Optional[str] = None,
372
- tags: Optional[Dict[str, str]] = None) -> str:
482
+ def create_launch_template(
483
+ ec2_client: "EC2Client",
484
+ template_name: str,
485
+ image_id: str,
486
+ key_name: str,
487
+ instance_type: str,
488
+ security_group_ids: Optional[list] = None,
489
+ user_data: Optional[Union[str, bytes]] = None,
490
+ block_device_map: Optional[list[dict]] = None,
491
+ instance_profile_arn: Optional[str] = None,
492
+ placement_az: Optional[str] = None,
493
+ subnet_id: Optional[str] = None,
494
+ tags: Optional[dict[str, str]] = None,
495
+ ) -> str:
373
496
  """
374
497
  Creates a launch template with the given name for launching instances with the given parameters.
375
498
 
@@ -385,22 +508,26 @@ def create_launch_template(ec2_client: BaseClient,
385
508
 
386
509
 
387
510
  """
388
- logger.info('Creating launch template for %s instances ... ', instance_type)
511
+ logger.info("Creating launch template for %s instances ... ", instance_type)
389
512
 
390
513
  if isinstance(user_data, str):
391
514
  # Make sure we have bytes
392
- user_data = user_data.encode('utf-8')
515
+ user_data = user_data.encode("utf-8")
393
516
 
394
517
  # Then base64 and decode back to str.
395
- user_data = b64encode(user_data).decode('utf-8')
396
-
397
- template = {'ImageId': image_id,
398
- 'KeyName': key_name,
399
- 'SecurityGroupIds': security_group_ids,
400
- 'InstanceType': instance_type,
401
- 'UserData': user_data,
402
- 'BlockDeviceMappings': block_device_map,
403
- 'SubnetId': subnet_id}
518
+ user_data = b64encode(user_data).decode("utf-8")
519
+
520
+ template = {
521
+ "ImageId": image_id,
522
+ "KeyName": key_name,
523
+ "SecurityGroupIds": security_group_ids,
524
+ "InstanceType": instance_type,
525
+ "UserData": user_data,
526
+ "BlockDeviceMappings": block_device_map,
527
+ "SubnetId": subnet_id,
528
+ # Increase hop limit from 1 to use Instance Metadata V2
529
+ "MetadataOptions": {"HttpPutResponseHopLimit": 3},
530
+ }
404
531
 
405
532
  if instance_profile_arn:
406
533
  # We could just retry when we get an error because the ARN doesn't
@@ -408,38 +535,48 @@ def create_launch_template(ec2_client: BaseClient,
408
535
  wait_until_instance_profile_arn_exists(instance_profile_arn)
409
536
 
410
537
  # Add it to the request
411
- template['IamInstanceProfile'] = {'Arn': instance_profile_arn}
538
+ template["IamInstanceProfile"] = {"Arn": instance_profile_arn}
412
539
 
413
540
  if placement_az:
414
- template['Placement'] = {'AvailabilityZone': placement_az}
541
+ template["Placement"] = {"AvailabilityZone": placement_az}
415
542
 
543
+ flat_tags = []
416
544
  if tags:
417
545
  # Tag everything when we make it.
418
546
  flat_tags = flatten_tags(tags)
419
- template['TagSpecifications'] = [{'ResourceType': 'instance', 'Tags': flat_tags},
420
- {'ResourceType': 'volume', 'Tags': flat_tags}]
547
+ template["TagSpecifications"] = [
548
+ {"ResourceType": "instance", "Tags": flat_tags},
549
+ {"ResourceType": "volume", "Tags": flat_tags},
550
+ ]
421
551
 
422
- request = {'LaunchTemplateData': prune(template),
423
- 'LaunchTemplateName': template_name}
552
+ request = {
553
+ "LaunchTemplateData": prune(template),
554
+ "LaunchTemplateName": template_name,
555
+ }
424
556
 
425
557
  if tags:
426
- request['TagSpecifications'] = [{'ResourceType': 'launch-template', 'Tags': flat_tags}]
558
+ request["TagSpecifications"] = [
559
+ {"ResourceType": "launch-template", "Tags": flat_tags}
560
+ ]
427
561
 
428
- return ec2_client.create_launch_template(**request)['LaunchTemplate']['LaunchTemplateId']
562
+ return ec2_client.create_launch_template(**request)["LaunchTemplate"][
563
+ "LaunchTemplateId"
564
+ ]
429
565
 
430
566
 
431
567
  @retry(intervals=[5, 5, 10, 20, 20, 20, 20], errors=INCONSISTENCY_ERRORS)
432
- def create_auto_scaling_group(autoscaling_client: BaseClient,
433
- asg_name: str,
434
- launch_template_ids: Dict[str, str],
435
- vpc_subnets: List[str],
436
- min_size: int,
437
- max_size: int,
438
- instance_types: Optional[List[str]] = None,
439
- spot_bid: Optional[float] = None,
440
- spot_cheapest: bool = False,
441
- tags: Optional[Dict[str, str]] = None) -> None:
442
-
568
+ def create_auto_scaling_group(
569
+ autoscaling_client: "AutoScalingClient",
570
+ asg_name: str,
571
+ launch_template_ids: dict[str, str],
572
+ vpc_subnets: list[str],
573
+ min_size: int,
574
+ max_size: int,
575
+ instance_types: Optional[Iterable[str]] = None,
576
+ spot_bid: Optional[float] = None,
577
+ spot_cheapest: bool = False,
578
+ tags: Optional[dict[str, str]] = None,
579
+ ) -> None:
443
580
  """
444
581
  Create a new Auto Scaling Group with the given name (which is also its
445
582
  unique identifier).
@@ -472,19 +609,26 @@ def create_auto_scaling_group(autoscaling_client: BaseClient,
472
609
  """
473
610
 
474
611
  if instance_types is None:
475
- instance_types = []
612
+ instance_types: list[str] = []
476
613
 
477
614
  if instance_types is not None and len(instance_types) > 20:
478
- raise RuntimeError(f"Too many instance types ({len(instance_types)}) in group; AWS supports only 20.")
615
+ raise RuntimeError(
616
+ f"Too many instance types ({len(instance_types)}) in group; AWS supports only 20."
617
+ )
479
618
 
480
619
  if len(vpc_subnets) == 0:
481
- raise RuntimeError("No VPC subnets specified to launch into; not clear where to put instances")
620
+ raise RuntimeError(
621
+ "No VPC subnets specified to launch into; not clear where to put instances"
622
+ )
482
623
 
483
624
  def get_launch_template_spec(instance_type):
484
625
  """
485
626
  Get a LaunchTemplateSpecification for the given instance type.
486
627
  """
487
- return {'LaunchTemplateId': launch_template_ids[instance_type], 'Version': '$Default'}
628
+ return {
629
+ "LaunchTemplateId": launch_template_ids[instance_type],
630
+ "Version": "$Default",
631
+ }
488
632
 
489
633
  # We always write the ASG with a MixedInstancesPolicy even when we have only one type.
490
634
  # And we use a separate launch template for every instance type, and apply it as an override.
@@ -493,24 +637,42 @@ def create_auto_scaling_group(autoscaling_client: BaseClient,
493
637
  # We need to use a launch template per instance type so that different
494
638
  # instance types with specified EBS storage size overrides will get their
495
639
  # storage.
496
- mip = {'LaunchTemplate': {'LaunchTemplateSpecification': get_launch_template_spec(next(iter(instance_types))),
497
- 'Overrides': [{'InstanceType': t, 'LaunchTemplateSpecification': get_launch_template_spec(t)} for t in instance_types]}}
640
+ mip = {
641
+ "LaunchTemplate": {
642
+ "LaunchTemplateSpecification": get_launch_template_spec(
643
+ next(iter(instance_types))
644
+ ), # noqa
645
+ "Overrides": [
646
+ {
647
+ "InstanceType": t,
648
+ "LaunchTemplateSpecification": get_launch_template_spec(t),
649
+ }
650
+ for t in instance_types
651
+ ],
652
+ }
653
+ } # noqa
498
654
 
499
655
  if spot_bid is not None:
500
656
  # Ask for spot instances by saying everything above base capacity of 0 should be spot.
501
- mip['InstancesDistribution'] = {'OnDemandPercentageAboveBaseCapacity': 0,
502
- 'SpotAllocationStrategy': 'capacity-optimized' if not spot_cheapest else 'lowest-price',
503
- 'SpotMaxPrice': str(spot_bid)}
504
-
505
- asg = {'AutoScalingGroupName': asg_name,
506
- 'MixedInstancesPolicy': prune(mip),
507
- 'MinSize': min_size,
508
- 'MaxSize': max_size,
509
- 'VPCZoneIdentifier': ','.join(vpc_subnets)}
657
+ mip["InstancesDistribution"] = {
658
+ "OnDemandPercentageAboveBaseCapacity": 0,
659
+ "SpotAllocationStrategy": (
660
+ "capacity-optimized" if not spot_cheapest else "lowest-price"
661
+ ),
662
+ "SpotMaxPrice": str(spot_bid),
663
+ }
664
+
665
+ asg = {
666
+ "AutoScalingGroupName": asg_name,
667
+ "MixedInstancesPolicy": prune(mip),
668
+ "MinSize": min_size,
669
+ "MaxSize": max_size,
670
+ "VPCZoneIdentifier": ",".join(vpc_subnets),
671
+ }
510
672
 
511
673
  if tags:
512
674
  # Tag the ASG itself.
513
- asg['Tags'] = flatten_tags(tags)
675
+ asg["Tags"] = flatten_tags(tags)
514
676
 
515
677
  logger.debug("Creating Autoscaling Group across subnets: %s", vpc_subnets)
516
678