ob-metaflow 2.11.15.3__py2.py3-none-any.whl → 2.11.16.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of ob-metaflow might be problematic.
- metaflow/__init__.py +3 -0
- metaflow/clone_util.py +6 -0
- metaflow/extension_support/plugins.py +1 -1
- metaflow/metaflow_config.py +5 -3
- metaflow/metaflow_environment.py +3 -3
- metaflow/plugins/__init__.py +4 -4
- metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +18 -14
- metaflow/plugins/datatools/s3/s3.py +1 -1
- metaflow/plugins/gcp/__init__.py +1 -1
- metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
- metaflow/plugins/kubernetes/kubernetes.py +79 -49
- metaflow/plugins/kubernetes/kubernetes_cli.py +20 -33
- metaflow/plugins/kubernetes/kubernetes_client.py +4 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +44 -61
- metaflow/plugins/kubernetes/kubernetes_job.py +217 -584
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +784 -0
- metaflow/plugins/timeout_decorator.py +2 -1
- metaflow/task.py +1 -12
- metaflow/tuple_util.py +27 -0
- metaflow/util.py +0 -15
- metaflow/version.py +1 -1
- {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.1.dist-info}/RECORD +27 -25
- {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.1.dist-info}/WHEEL +0 -0
- {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.1.dist-info}/top_level.txt +0 -0
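A quick way to confirm which of the two builds is actually installed, before or after upgrading, is a standard-library check (a hypothetical snippet, not part of this diff):

    # Hypothetical check, not part of the package diff: confirm the installed
    # ob-metaflow build using only the Python standard library.
    from importlib.metadata import version

    print(version("ob-metaflow"))  # e.g. "2.11.15.3" before, "2.11.16.1" after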
metaflow/plugins/kubernetes/kubernetes_job.py

@@ -2,18 +2,23 @@ import json
 import math
 import random
 import time
-import os
-import socket
 import copy
-
+import sys
+from metaflow.tracing import inject_tracing_vars
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS
+from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
 
 CLIENT_REFRESH_INTERVAL_SECONDS = 300
+from .kubernetes_jobsets import (
+KubernetesJobSet,  # We need this import for Kubernetes Client.
+)
+
 
 class KubernetesJobException(MetaflowException):
 headline = "Kubernetes job error"
 
+
 # Implements truncated exponential backoff from
 # https://cloud.google.com/storage/docs/retry-strategy#exponential-backoff
 def k8s_retry(deadline_seconds=60, max_backoff=32):
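The hunk above ends at `k8s_retry`, which per its comment implements truncated exponential backoff. As a rough sketch of that pattern only (the decorator's actual body is not part of this diff, so this is purely illustrative):

    # Minimal sketch of truncated exponential backoff with the same knobs as
    # k8s_retry(deadline_seconds=60, max_backoff=32). Assumption: this is not
    # the real decorator's implementation, just the idea it names.
    import random
    import time

    def retry_with_backoff(fn, deadline_seconds=60, max_backoff=32):
        deadline = time.time() + deadline_seconds
        attempt = 0
        while True:
            try:
                return fn()
            except Exception:
                if time.time() >= deadline:
                    raise
                # Wait 2^attempt seconds plus jitter, truncated at max_backoff.
                time.sleep(min(2**attempt + random.random(), max_backoff))
                attempt += 1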
@@ -56,19 +61,7 @@ class KubernetesJob(object):
 self._client = client
 self._kwargs = kwargs
 
-def
-# A discerning eye would notice and question the choice of using the
-# V1Job construct over the V1Pod construct given that we don't rely much
-# on any of the V1Job semantics. The major reasons at the moment are -
-# 1. It makes the Kubernetes UIs (Octant, Lens) a bit easier on
-# the eyes, although even that can be questioned.
-# 2. AWS Step Functions, at the moment (Apr' 22) only supports
-# executing Jobs and not Pods as part of it's publicly declared
-# API. When we ship the AWS Step Functions integration with EKS,
-# it will hopefully lessen our workload.
-#
-# Note: This implementation ensures that there is only one unique Pod
-# (unique UID) per Metaflow task attempt.
+def create_job_spec(self):
 client = self._client.get()
 
 # tmpfs variables
@@ -80,529 +73,103 @@ class KubernetesJob(object):
 if self._kwargs["shared_memory"]
 else None
 )
-
-
-
-
-
-
-
-
-
-
-
-passwordless_ssh_service_name = subdomain
-passwordless_ssh_service_selector = {
-"passwordless-ssh-jobset": "true"
-}
-else:
-passwordless_ssh_service_name = None
-passwordless_ssh_service_selector = {}
-
-fqdn_suffix = "%s.svc.cluster.local" % self._kwargs["namespace"]
-jobset_main_addr = "%s-%s-%s-%s.%s.%s" % (
-jobset_name,
-main_job_name,
-main_job_index,
-main_pod_index,
-subdomain,
-fqdn_suffix,
-)
-
-def _install_jobset(
-repo_url="https://github.com/kubernetes-sigs/jobset",
-python_sdk_path="jobset/sdk/python",
-):
-
-# TODO (Eddie): Remove this and suggest to user.
-
-import subprocess
-import tempfile
-import shutil
-import os
-
-with open(os.devnull, "wb") as devnull:
-cwd = os.getcwd()
-tmp_dir = tempfile.mkdtemp()
-os.chdir(tmp_dir)
-subprocess.check_call(
-["git", "clone", repo_url], stdout=devnull, stderr=subprocess.STDOUT
-)
-tmp_python_sdk_path = os.path.join(tmp_dir, python_sdk_path)
-os.chdir(tmp_python_sdk_path)
-subprocess.check_call(
-["pip", "install", "."], stdout=devnull, stderr=subprocess.STDOUT
-)
-os.chdir(cwd)
-shutil.rmtree(tmp_dir)
-
-def _get_passwordless_ssh_service():
-
-return client.V1Service(
-api_version="v1",
-kind="Service",
-metadata=client.V1ObjectMeta(
-name=passwordless_ssh_service_name,
-namespace=self._kwargs["namespace"]
-),
-spec=client.V1ServiceSpec(
-cluster_ip="None",
-internal_traffic_policy="Cluster",
-ip_families=["IPv4"],
-ip_family_policy="SingleStack",
-selector=passwordless_ssh_service_selector,
-session_affinity="None",
-type="ClusterIP",
-ports=[
-client.V1ServicePort(
-name="control",
-port=22,
-protocol="TCP",
-target_port=22
-)
-]
-)
-)
-
-def _get_replicated_job(job_name, parallelism, command):
-return jobset.models.jobset_v1alpha2_replicated_job.JobsetV1alpha2ReplicatedJob(
-name=job_name,
-template=client.V1JobTemplateSpec(
-metadata=client.V1ObjectMeta(
-annotations=self._kwargs.get("annotations", {}),
-labels=self._kwargs.get("labels", {}),
-namespace=self._kwargs["namespace"],
-),
-spec=client.V1JobSpec(
-parallelism=parallelism, # how many jobs can run at once
-completions=parallelism, # how many Pods the JobSet creates in total
-backoff_limit=0,
-ttl_seconds_after_finished=7
-* 60
-* 60
-* 24,
-template=client.V1PodTemplateSpec(
-metadata=client.V1ObjectMeta(
-annotations=self._kwargs.get("annotations", {}),
-labels={
-**self._kwargs.get("labels", {}),
-**passwordless_ssh_service_selector, # TODO: necessary?
-# TODO: cluster-name, app.kubernetes.io/name necessary?
-},
-namespace=self._kwargs["namespace"],
-),
-spec=client.V1PodSpec(
-active_deadline_seconds=self._kwargs[
-"timeout_in_seconds"
-],
-containers=[
-client.V1Container(
-command=command,
-ports=[client.V1ContainerPort(container_port=master_port)] if master_port and job_name=="control" else [],
-env=[
-client.V1EnvVar(name=k, value=str(v))
-for k, v in self._kwargs.get(
-"environment_variables", {}
-).items()
-]
-+ [
-client.V1EnvVar(
-name=k,
-value_from=client.V1EnvVarSource(
-field_ref=client.V1ObjectFieldSelector(
-field_path=str(v)
-)
-),
-)
-for k, v in {
-"METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-"METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-"METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-"METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-"METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-}.items()
-]
-# Mimicking the AWS Batch Multinode env vars.
-+ [
-client.V1EnvVar(
-name="MASTER_ADDR",
-value=jobset_main_addr,
-),
-client.V1EnvVar(
-name="MASTER_PORT",
-value=str(master_port),
-),
-client.V1EnvVar(
-name="RANK",
-value_from=client.V1EnvVarSource(
-field_ref=client.V1ObjectFieldSelector(
-field_path="metadata.annotations['batch.kubernetes.io/job-completion-index']"
-)
-),
-),
-client.V1EnvVar(
-name="WORLD_SIZE",
-value=str(self._kwargs["num_parallel"]),
-),
-client.V1EnvVar(
-name="PYTHONUNBUFFERED",
-value="0",
-),
-],
-env_from=[
-client.V1EnvFromSource(
-secret_ref=client.V1SecretEnvSource(
-name=str(k),
-# optional=True
-)
-)
-for k in list(
-self._kwargs.get("secrets", [])
-)
-+ KUBERNETES_SECRETS.split(",")
-if k
-],
-image=self._kwargs["image"],
-image_pull_policy=self._kwargs[
-"image_pull_policy"
-],
-name=self._kwargs["step_name"].replace(
-"_", "-"
-),
-resources=client.V1ResourceRequirements(
-requests={
-"cpu": str(self._kwargs["cpu"]),
-"memory": "%sM"
-% str(self._kwargs["memory"]),
-"ephemeral-storage": "%sM"
-% str(self._kwargs["disk"]),
-},
-limits={
-"%s.com/gpu".lower()
-% self._kwargs["gpu_vendor"]: str(
-self._kwargs["gpu"]
-)
-for k in [0]
-# Don't set GPU limits if gpu isn't specified.
-if self._kwargs["gpu"] is not None
-},
-),
-volume_mounts=(
-[
-client.V1VolumeMount(
-mount_path=self._kwargs.get(
-"tmpfs_path"
-),
-name="tmpfs-ephemeral-volume",
-)
-]
-if tmpfs_enabled
-else []
-)
-+ (
-[
-client.V1VolumeMount(
-mount_path="/dev/shm",
-name="dhsm"
-)
-]
-if shared_memory else []
-)
-+ (
-[
-client.V1VolumeMount(
-mount_path=path, name=claim
-)
-for claim, path in self._kwargs[
-"persistent_volume_claims"
-].items()
-]
-if self._kwargs["persistent_volume_claims"]
-is not None
-else []
-),
-)
-],
-node_selector=self._kwargs.get("node_selector"),
-restart_policy="Never",
-
-set_hostname_as_fqdn=True, # configure pod hostname as pod's FQDN
-share_process_namespace=False, # default
-subdomain=subdomain, # FQDN = <hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>
-
-service_account_name=self._kwargs["service_account"],
-termination_grace_period_seconds=0,
-tolerations=[
-client.V1Toleration(**toleration)
-for toleration in self._kwargs.get("tolerations")
-or []
-],
-volumes=(
-[
-client.V1Volume(
-name="tmpfs-ephemeral-volume",
-empty_dir=client.V1EmptyDirVolumeSource(
-medium="Memory",
-size_limit="{}Mi".format(tmpfs_size),
-),
-)
-]
-if tmpfs_enabled
-else []
-)
-+ (
-[
-client.V1Volume(
-name="dhsm",
-empty_dir=client.V1EmptyDirVolumeSource(
-medium="Memory",
-size_limit="{}Mi".format(shared_memory),
-)
-)
-]
-if shared_memory else []
-)
-+ (
-[
-client.V1Volume(
-name=claim,
-persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-claim_name=claim
-),
-)
-for claim in self._kwargs[
-"persistent_volume_claims"
-].keys()
-]
-if self._kwargs["persistent_volume_claims"]
-is not None
-else []
-),
-),
-),
-),
-),
-)
-
-if "num_parallel" in self._kwargs and self._kwargs["num_parallel"] >= 1:
-
-try:
-import jobset
-except ImportError:
-_install_jobset()
-import jobset
-
-main_commands = copy.copy(self._kwargs["command"])
-main_commands[-1] = main_commands[-1].replace(
-"[multinode-args]", "--split-index 0"
-)
-
-task_id = self._kwargs["attrs"]["metaflow.task_id"]
-secondary_commands = copy.copy(self._kwargs["command"])
-# RANK needs +1 because control node is not in the worker index group, yet we want global nodes.
-# Technically, control and worker could be same replicated job type, but cleaner to separate for future use cases.
-secondary_commands[-1] = secondary_commands[-1].replace(
-"[multinode-args]", "--split-index `expr $RANK + 1`"
-)
-secondary_commands[-1] = secondary_commands[-1].replace(
-"ubf_control", "ubf_task"
-)
-secondary_commands[-1] = secondary_commands[-1].replace(
-task_id,
-task_id.replace("control-", "") + "-node-`expr $RANK + 1`",
-)
-
-if passwordless_ssh:
-if not os.path.exists("/usr/sbin/sshd"):
-raise KubernetesJobException(
-"This @parallel decorator requires sshd to be installed in the container image."
-"Please install OpenSSH."
-)
-
-# run sshd in background
-main_commands[-1] = "/usr/sbin/sshd -D & %s" % main_commands[-1]
-secondary_commands[-1] = "/usr/sbin/sshd -D & %s" % secondary_commands[-1]
-
-replicated_jobs = [_get_replicated_job("control", 1, main_commands)]
-if self._kwargs["num_parallel"] > 1:
-replicated_jobs.append(
-_get_replicated_job("worker", self._kwargs["num_parallel"] - 1, secondary_commands)
-)
-
-self._jobset = jobset.models.jobset_v1alpha2_job_set.JobsetV1alpha2JobSet(
-api_version="jobset.x-k8s.io/v1alpha2",
-kind="JobSet",
+return client.V1JobSpec(
+# Retries are handled by Metaflow when it is responsible for
+# executing the flow. The responsibility is moved to Kubernetes
+# when Argo Workflows is responsible for the execution.
+backoff_limit=self._kwargs.get("retries", 0),
+completions=self._kwargs.get("completions", 1),
+ttl_seconds_after_finished=7
+* 60
+* 60 # Remove job after a week. TODO: Make this configurable
+* 24,
+template=client.V1PodTemplateSpec(
 metadata=client.V1ObjectMeta(
 annotations=self._kwargs.get("annotations", {}),
 labels=self._kwargs.get("labels", {}),
-name=jobset_name,
 namespace=self._kwargs["namespace"],
 ),
-spec=
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-* 24,
-template=client.V1PodTemplateSpec(
-metadata=client.V1ObjectMeta(
-annotations=self._kwargs.get("annotations", {}),
-labels=self._kwargs.get("labels", {}),
-namespace=self._kwargs["namespace"],
-),
-spec=client.V1PodSpec(
-# Timeout is set on the pod and not the job (important!)
-active_deadline_seconds=self._kwargs["timeout_in_seconds"],
-# TODO (savin): Enable affinities for GPU scheduling.
-# affinity=?,
-containers=[
-client.V1Container(
-command=self._kwargs["command"],
-ports=[
-client.V1ContainerPort(
-container_port=int(self._kwargs["port"])
-)
-]
-if "port" in self._kwargs and self._kwargs["port"]
-else None,
-env=[
-client.V1EnvVar(name=k, value=str(v))
-for k, v in self._kwargs.get(
-"environment_variables", {}
-).items()
-]
-# And some downward API magic. Add (key, value)
-# pairs below to make pod metadata available
-# within Kubernetes container.
-+ [
-client.V1EnvVar(
-name=k,
-value_from=client.V1EnvVarSource(
-field_ref=client.V1ObjectFieldSelector(
-field_path=str(v)
-)
-),
-)
-for k, v in {
-"METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-"METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-"METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-"METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-"METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-}.items()
-],
-env_from=[
-client.V1EnvFromSource(
-secret_ref=client.V1SecretEnvSource(
-name=str(k),
-# optional=True
-)
+spec=client.V1PodSpec(
+# Timeout is set on the pod and not the job (important!)
+active_deadline_seconds=self._kwargs["timeout_in_seconds"],
+# TODO (savin): Enable affinities for GPU scheduling.
+# affinity=?,
+containers=[
+client.V1Container(
+command=self._kwargs["command"],
+ports=[]
+if self._kwargs["port"] is None
+else [
+client.V1ContainerPort(
+container_port=int(self._kwargs["port"])
+)
+],
+env=[
+client.V1EnvVar(name=k, value=str(v))
+for k, v in self._kwargs.get(
+"environment_variables", {}
+).items()
+]
+# And some downward API magic. Add (key, value)
+# pairs below to make pod metadata available
+# within Kubernetes container.
++ [
+client.V1EnvVar(
+name=k,
+value_from=client.V1EnvVarSource(
+field_ref=client.V1ObjectFieldSelector(
+field_path=str(v)
 )
-for k in list(self._kwargs.get("secrets", []))
-+ KUBERNETES_SECRETS.split(",")
-if k
-],
-image=self._kwargs["image"],
-image_pull_policy=self._kwargs["image_pull_policy"],
-name=self._kwargs["step_name"].replace("_", "-"),
-resources=client.V1ResourceRequirements(
-requests={
-"cpu": str(self._kwargs["cpu"]),
-"memory": "%sM"
-% str(self._kwargs["memory"]),
-"ephemeral-storage": "%sM"
-% str(self._kwargs["disk"]),
-},
-limits={
-"%s.com/gpu".lower()
-% self._kwargs["gpu_vendor"]: str(
-self._kwargs["gpu"]
-)
-for k in [0]
-# Don't set GPU limits if gpu isn't specified.
-if self._kwargs["gpu"] is not None
-},
-),
-volume_mounts=(
-[
-client.V1VolumeMount(
-mount_path=self._kwargs.get(
-"tmpfs_path"
-),
-name="tmpfs-ephemeral-volume",
-)
-]
-if tmpfs_enabled
-else []
-)
-+ (
-[
-client.V1VolumeMount(
-mount_path=path, name=claim
-)
-for claim, path in self._kwargs[
-"persistent_volume_claims"
-].items()
-]
-if self._kwargs["persistent_volume_claims"]
-is not None
-else []
 ),
 )
+for k, v in {
+"METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+"METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+"METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+"METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+"METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+}.items()
+]
++ [
+client.V1EnvVar(name=k, value=str(v))
+for k, v in inject_tracing_vars({}).items()
 ],
-
-
-
-
-
-
-
-
-
-
-# and let Metaflow handle the retries.
-restart_policy="Never",
-service_account_name=self._kwargs["service_account"],
-# Terminate the container immediately on SIGTERM
-termination_grace_period_seconds=0,
-tolerations=[
-client.V1Toleration(**toleration)
-for toleration in self._kwargs.get("tolerations") or []
+env_from=[
+client.V1EnvFromSource(
+secret_ref=client.V1SecretEnvSource(
+name=str(k),
+# optional=True
+)
+)
+for k in list(self._kwargs.get("secrets", []))
++ KUBERNETES_SECRETS.split(",")
+if k
 ],
-
+image=self._kwargs["image"],
+image_pull_policy=self._kwargs["image_pull_policy"],
+name=self._kwargs["step_name"].replace("_", "-"),
+resources=client.V1ResourceRequirements(
+requests={
+"cpu": str(self._kwargs["cpu"]),
+"memory": "%sM" % str(self._kwargs["memory"]),
+"ephemeral-storage": "%sM"
+% str(self._kwargs["disk"]),
+},
+limits={
+"%s.com/gpu".lower()
+% self._kwargs["gpu_vendor"]: str(
+self._kwargs["gpu"]
+)
+for k in [0]
+# Don't set GPU limits if gpu isn't specified.
+if self._kwargs["gpu"] is not None
+},
+),
+volume_mounts=(
 [
-client.
+client.V1VolumeMount(
+mount_path=self._kwargs.get("tmpfs_path"),
 name="tmpfs-ephemeral-volume",
-empty_dir=client.V1EmptyDirVolumeSource(
-medium="Memory",
-# Add default unit as ours differs from Kubernetes default.
-size_limit="{}Mi".format(tmpfs_size),
-),
 )
 ]
 if tmpfs_enabled
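The env construction in this hunk leans on the Kubernetes downward API (`field_ref`) to expose pod metadata such as `metadata.name` and `status.hostIP` as environment variables, and on `inject_tracing_vars` for tracing variables. A standalone sketch of the downward-API pattern with the official `kubernetes` Python client (an illustration only; none of this is taken from ob-metaflow itself):

    # Sketch of the downward-API env-var pattern used above, assuming the
    # official `kubernetes` client package is installed.
    from kubernetes import client

    downward_env = [
        client.V1EnvVar(
            name=name,
            value_from=client.V1EnvVarSource(
                field_ref=client.V1ObjectFieldSelector(field_path=path)
            ),
        )
        for name, path in {
            "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
            "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
        }.items()
    ]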
@@ -610,24 +177,122 @@ class KubernetesJob(object):
 )
 + (
 [
-client.
-name=
-persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-claim_name=claim
-),
+client.V1VolumeMount(
+mount_path="/dev/shm", name="dhsm"
 )
-
+]
+if shared_memory
+else []
+)
++ (
+[
+client.V1VolumeMount(mount_path=path, name=claim)
+for claim, path in self._kwargs[
 "persistent_volume_claims"
-].
+].items()
 ]
 if self._kwargs["persistent_volume_claims"] is not None
 else []
 ),
-
-
+)
+],
+node_selector=self._kwargs.get("node_selector"),
+# TODO (savin): Support image_pull_secrets
+# image_pull_secrets=?,
+# TODO (savin): Support preemption policies
+# preemption_policy=?,
+#
+# A Container in a Pod may fail for a number of
+# reasons, such as because the process in it exited
+# with a non-zero exit code, or the Container was
+# killed due to OOM etc. If this happens, fail the pod
+# and let Metaflow handle the retries.
+restart_policy="Never",
+service_account_name=self._kwargs["service_account"],
+# Terminate the container immediately on SIGTERM
+termination_grace_period_seconds=0,
+tolerations=[
+client.V1Toleration(**toleration)
+for toleration in self._kwargs.get("tolerations") or []
+],
+volumes=(
+[
+client.V1Volume(
+name="tmpfs-ephemeral-volume",
+empty_dir=client.V1EmptyDirVolumeSource(
+medium="Memory",
+# Add default unit as ours differs from Kubernetes default.
+size_limit="{}Mi".format(tmpfs_size),
+),
+)
+]
+if tmpfs_enabled
+else []
+)
++ (
+[
+client.V1Volume(
+name="dhsm",
+empty_dir=client.V1EmptyDirVolumeSource(
+medium="Memory",
+size_limit="{}Mi".format(shared_memory),
+),
+)
+]
+if shared_memory
+else []
+)
++ (
+[
+client.V1Volume(
+name=claim,
+persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+claim_name=claim
+),
+)
+for claim in self._kwargs["persistent_volume_claims"].keys()
+]
+if self._kwargs["persistent_volume_claims"] is not None
+else []
 ),
+# TODO (savin): Set termination_message_policy
 ),
-)
+),
+)
+
+def create(self):
+# A discerning eye would notice and question the choice of using the
+# V1Job construct over the V1Pod construct given that we don't rely much
+# on any of the V1Job semantics. The major reasons at the moment are -
+# 1. It makes the Kubernetes UIs (Octant, Lens) a bit easier on
+# the eyes, although even that can be questioned.
+# 2. AWS Step Functions, at the moment (Apr' 22) only supports
+# executing Jobs and not Pods as part of it's publicly declared
+# API. When we ship the AWS Step Functions integration with EKS,
+# it will hopefully lessen our workload.
+#
+# Note: This implementation ensures that there is only one unique Pod
+# (unique UID) per Metaflow task attempt.
+client = self._client.get()
+
+# tmpfs variables
+use_tmpfs = self._kwargs["use_tmpfs"]
+tmpfs_size = self._kwargs["tmpfs_size"]
+tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
+
+self._job = client.V1Job(
+api_version="batch/v1",
+kind="Job",
+metadata=client.V1ObjectMeta(
+# Annotations are for humans
+annotations=self._kwargs.get("annotations", {}),
+# While labels are for Kubernetes
+labels=self._kwargs.get("labels", {}),
+generate_name=self._kwargs["generate_name"],
+namespace=self._kwargs["namespace"], # Defaults to `default`
+),
+spec=self.create_job_spec(),
+)
 return self
 
 def execute(self):
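The new `create()` above assembles a plain `batch/v1` Job whose spec comes from `create_job_spec()`; the JobSet machinery moved into the new `kubernetes_jobsets.py` module. A minimal, self-contained sketch of the same kind of construction with the official `kubernetes` client (illustrative values only, not ob-metaflow's defaults):

    # Minimal batch/v1 Job built along the lines of create()/create_job_spec(),
    # assuming the official `kubernetes` package; image and command are made up.
    from kubernetes import client

    job = client.V1Job(
        api_version="batch/v1",
        kind="Job",
        metadata=client.V1ObjectMeta(generate_name="demo-", namespace="default"),
        spec=client.V1JobSpec(
            backoff_limit=0,  # let the caller own retries, as Metaflow does
            ttl_seconds_after_finished=7 * 60 * 60 * 24,
            template=client.V1PodTemplateSpec(
                spec=client.V1PodSpec(
                    restart_policy="Never",
                    containers=[
                        client.V1Container(
                            name="main",
                            image="python:3.11",
                            command=["python", "-c", "print('hello')"],
                        )
                    ],
                )
            ),
        ),
    )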
@@ -638,53 +303,19 @@ class KubernetesJob(object):
 # achieve the guarantees that we are seeking.
 # https://github.com/kubernetes/enhancements/issues/1040
 # Hopefully, we will be able to get creative with kube-batch
-
-
-
-
-api_instance = client.CoreV1Api()
-api_response = api_instance.create_namespaced_service(namespace=self._kwargs['namespace'], body=self._passwordless_ssh_service)
-
-with client.ApiClient() as api_client:
-api_instance = client.CustomObjectsApi(api_client)
-
-response = api_instance.create_namespaced_custom_object(
-body=self._jobset,
-group="jobset.x-k8s.io",
-version="v1alpha2",
-namespace=self._kwargs["namespace"],
-plural="jobsets",
-)
-
-# HACK: Give K8s some time to actually create the job
-time.sleep(10)
-
-# TODO (Eddie): Remove hack and make RunningJobSet.
-# There are many jobs running that should be monitored.
-job_name = "%s-control-0" % response["metadata"]["name"]
-fake_id = 123
-return RunningJob(
-client=self._client,
-name=job_name,
-uid=fake_id,
-namespace=response["metadata"]["namespace"],
-)
-
-else:
-response = (
-client.BatchV1Api()
-.create_namespaced_job(
-body=self._job, namespace=self._kwargs["namespace"]
-)
-.to_dict()
-)
-return RunningJob(
-client=self._client,
-name=response["metadata"]["name"],
-uid=response["metadata"]["uid"],
-namespace=response["metadata"]["namespace"],
+response = (
+client.BatchV1Api()
+.create_namespaced_job(
+body=self._job, namespace=self._kwargs["namespace"]
 )
-
+.to_dict()
+)
+return RunningJob(
+client=self._client,
+name=response["metadata"]["name"],
+uid=response["metadata"]["uid"],
+namespace=response["metadata"]["namespace"],
+)
 except client.rest.ApiException as e:
 raise KubernetesJobException(
 "Unable to launch Kubernetes job.\n %s"
@@ -740,6 +371,7 @@ class KubernetesJob(object):
 
 
 class RunningJob(object):
+
 # State Machine implementation for the lifecycle behavior documented in
 # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/
 #
@@ -793,7 +425,7 @@ class RunningJob(object):
 def best_effort_kill():
 try:
 self.kill()
-except:
+except Exception as ex:
 pass
 
 atexit.register(best_effort_kill)
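The change from a bare `except:` to `except Exception as ex:` in `best_effort_kill` narrows what gets swallowed: a bare except also traps `KeyboardInterrupt` and `SystemExit`, which do not derive from `Exception`. A tiny illustration of the difference (not taken from the package):

    # `except Exception` ignores ordinary errors but lets BaseException
    # subclasses such as SystemExit and KeyboardInterrupt propagate.
    def swallow_only_errors(fn):
        try:
            fn()
        except Exception:
            pass  # best-effort: ignore ordinary errors only

    swallow_only_errors(lambda: 1 / 0)      # ZeroDivisionError is ignored
    # swallow_only_errors(lambda: exit(0))  # SystemExit would still propagate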
@@ -857,6 +489,7 @@ class RunningJob(object):
 # 3. If the pod object hasn't shown up yet, we set the parallelism to 0
 # to preempt it.
 client = self._client.get()
+
 if not self.is_done:
 if self.is_running:
 # Case 1.