ob-metaflow 2.11.0.1__py2.py3-none-any.whl → 2.11.0.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow might be problematic.
- metaflow/metaflow_config.py +2 -0
- metaflow/plugins/argo/argo_workflows.py +5 -0
- metaflow/plugins/kubernetes/kubernetes.py +43 -6
- metaflow/plugins/kubernetes/kubernetes_cli.py +40 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +73 -6
- metaflow/plugins/kubernetes/kubernetes_job.py +536 -161
- metaflow/version.py +1 -1
- {ob_metaflow-2.11.0.1.dist-info → ob_metaflow-2.11.0.2.dist-info}/METADATA +1 -1
- {ob_metaflow-2.11.0.1.dist-info → ob_metaflow-2.11.0.2.dist-info}/RECORD +13 -13
- {ob_metaflow-2.11.0.1.dist-info → ob_metaflow-2.11.0.2.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.11.0.1.dist-info → ob_metaflow-2.11.0.2.dist-info}/WHEEL +0 -0
- {ob_metaflow-2.11.0.1.dist-info → ob_metaflow-2.11.0.2.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.11.0.1.dist-info → ob_metaflow-2.11.0.2.dist-info}/top_level.txt +0 -0
metaflow/metaflow_config.py
CHANGED
@@ -296,6 +296,8 @@ KUBERNETES_CONTAINER_REGISTRY = from_conf(
 )
 # Toggle for trying to fetch EC2 instance metadata
 KUBERNETES_FETCH_EC2_METADATA = from_conf("KUBERNETES_FETCH_EC2_METADATA", False)
+# Default port number to open on the pods
+KUBERNETES_PORT = from_conf("KUBERNETES_PORT", None)

 ARGO_WORKFLOWS_KUBERNETES_SECRETS = from_conf("ARGO_WORKFLOWS_KUBERNETES_SECRETS", "")
 ARGO_WORKFLOWS_ENV_VARS_TO_SKIP = from_conf("ARGO_WORKFLOWS_ENV_VARS_TO_SKIP", "")
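For context, from_conf-backed settings such as the new KUBERNETES_PORT are resolved from the environment or the Metaflow config file before falling back to the default. A minimal sketch of that lookup order (illustrative only; the helper name from_conf_sketch, the METAFLOW_ prefix, and the config path are assumptions, not code from this package):

import json
import os

def from_conf_sketch(name, default=None, config_path="~/.metaflowconfig/config.json"):
    # Sketch: environment variable first, then a JSON config file, then the default.
    env_val = os.environ.get("METAFLOW_" + name)
    if env_val is not None:
        return env_val
    try:
        with open(os.path.expanduser(config_path)) as f:
            return json.load(f).get("METAFLOW_" + name, default)
    except (OSError, ValueError):
        return default

KUBERNETES_PORT = from_conf_sketch("KUBERNETES_PORT", None)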
metaflow/plugins/argo/argo_workflows.py
CHANGED
@@ -836,6 +836,11 @@ class ArgoWorkflows(object):
     # Visit every node and yield the uber DAGTemplate(s).
     def _dag_templates(self):
         def _visit(node, exit_node=None, templates=None, dag_tasks=None):
+            if node.parallel_foreach:
+                raise ArgoWorkflowsException(
+                    "Deploying flows with @parallel decorator(s) "
+                    "as Argo Workflows is not supported currently."
+                )
             # Every for-each node results in a separate subDAG and an equivalent
             # DAGTemplate rooted at the child of the for-each node. Each DAGTemplate
             # has a unique name - the top-level DAGTemplate is named as the name of
metaflow/plugins/kubernetes/kubernetes.py
CHANGED
@@ -4,6 +4,7 @@ import os
 import re
 import shlex
 import time
+import copy
 from typing import Dict, List, Optional
 import uuid
 from uuid import uuid4
@@ -174,6 +175,10 @@ class Kubernetes(object):
         persistent_volume_claims=None,
         tolerations=None,
         labels=None,
+        annotations=None,
+        num_parallel=0,
+        attrs={},
+        port=None,
     ):
         if env is None:
             env = {}
@@ -213,6 +218,9 @@ class Kubernetes(object):
                 tmpfs_size=tmpfs_size,
                 tmpfs_path=tmpfs_path,
                 persistent_volume_claims=persistent_volume_claims,
+                num_parallel=num_parallel,
+                attrs=attrs,
+                port=port,
             )
             .environment_variable("METAFLOW_CODE_SHA", code_package_sha)
             .environment_variable("METAFLOW_CODE_URL", code_package_url)
@@ -266,6 +274,7 @@ class Kubernetes(object):
             # see get_datastore_root_from_config in datastore/local.py).
         )

+        self.num_parallel = num_parallel
         # Temporary passing of *some* environment variables. Do not rely on this
         # mechanism as it will be removed in the near future
         for k, v in config_values():
@@ -341,7 +350,7 @@ class Kubernetes(object):
            sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
            return 0.5 + sigmoid * 30.0

-        def wait_for_launch(job):
+        def wait_for_launch(job, child_jobs):
            status = job.status
            echo(
                "Task is starting (%s)..." % status,
@@ -351,11 +360,38 @@ class Kubernetes(object):
            t = time.time()
            start_time = time.time()
            while job.is_waiting:
-                new_status = job.status
-                if status !=
-
+                # new_status = job.status
+                if status != job.status or (time.time() - t) > 30:
+                    if not child_jobs:
+                        child_statuses = ""
+                    else:
+                        status_keys = set(
+                            [child_job.status for child_job in child_jobs]
+                        )
+                        status_counts = [
+                            (
+                                status,
+                                len(
+                                    [
+                                        child_job.status == status
+                                        for child_job in child_jobs
+                                    ]
+                                ),
+                            )
+                            for status in status_keys
+                        ]
+                        child_statuses = " (parallel node status: [{}])".format(
+                            ", ".join(
+                                [
+                                    "{}:{}".format(status, num)
+                                    for (status, num) in sorted(status_counts)
+                                ]
+                            )
+                        )
+
+                    status = job.status
                    echo(
-                        "Task is starting (%s)..." % status,
+                        "Task is starting (status %s)... %s" % (status, child_statuses),
                        "stderr",
                        job_id=job.id,
                    )
@@ -367,8 +403,9 @@ class Kubernetes(object):
        stdout_tail = get_log_tailer(stdout_location, self._datastore.TYPE)
        stderr_tail = get_log_tailer(stderr_location, self._datastore.TYPE)

+        child_jobs = []
        # 1) Loop until the job has started
-        wait_for_launch(self._job)
+        wait_for_launch(self._job, child_jobs)

        # 2) Tail logs until the job has finished
        tail_logs(
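The new child_jobs handling in wait_for_launch boils down to counting child job states for a human-readable summary. A standalone sketch of the same idea using collections.Counter (illustrative only, not the shipped code; note that the shipped len([child_job.status == status ...]) counts every child for each status key, whereas Counter tallies per status):

from collections import Counter

def summarize_child_statuses(child_jobs):
    # child_jobs: objects exposing a .status attribute, as in the diff above
    if not child_jobs:
        return ""
    counts = Counter(job.status for job in child_jobs)
    return " (parallel node status: [{}])".format(
        ", ".join("{}:{}".format(status, num) for status, num in sorted(counts.items()))
    )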
metaflow/plugins/kubernetes/kubernetes_cli.py
CHANGED
@@ -107,6 +107,26 @@ def kubernetes():
     type=JSONTypeClass(),
     multiple=False,
 )
+@click.option(
+    "--labels",
+    default=None,
+    type=JSONTypeClass(),
+    multiple=False,
+)
+@click.option(
+    "--annotations",
+    default=None,
+    type=JSONTypeClass(),
+    multiple=False,
+)
+@click.option("--ubf-context", default=None, type=click.Choice([None, "ubf_control"]))
+@click.option(
+    "--num-parallel",
+    default=0,
+    type=int,
+    help="Number of parallel nodes to run as a multi-node job.",
+)
+@click.option("--port", default=None, help="port number")
 @click.pass_context
 def step(
     ctx,
@@ -132,6 +152,10 @@ def step(
     run_time_limit=None,
     persistent_volume_claims=None,
     tolerations=None,
+    labels=None,
+    annotations=None,
+    num_parallel=None,
+    port=None,
     **kwargs
 ):
     def echo(msg, stream="stderr", job_id=None, **kwargs):
@@ -177,11 +201,17 @@ def step(
         )
         time.sleep(minutes_between_retries * 60)

+    step_args = " ".join(util.dict_to_cli_options(kwargs))
+    num_parallel = num_parallel or 0
+    if num_parallel and num_parallel > 1:
+        # For multinode, we need to add a placeholder that can be mutated by the caller
+        step_args += " [multinode-args]"
+
     step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
         entrypoint="%s -u %s" % (executable, os.path.basename(sys.argv[0])),
         top_args=" ".join(util.dict_to_cli_options(ctx.parent.parent.params)),
         step=step_name,
-        step_args=
+        step_args=step_args,
     )

     # Set log tailing.
@@ -207,6 +237,10 @@ def step(
         ),
     )

+    attrs = {
+        "metaflow.task_id": kwargs["task_id"],
+        "requires_passwordless_ssh": any([getattr(deco, "requires_passwordless_ssh", False) for deco in node.decorators]),
+    }
     try:
         kubernetes = Kubernetes(
             datastore=ctx.obj.flow_datastore,
@@ -245,6 +279,11 @@ def step(
             env=env,
             persistent_volume_claims=persistent_volume_claims,
             tolerations=tolerations,
+            labels=labels,
+            annotations=annotations,
+            num_parallel=num_parallel,
+            port=port,
+            attrs=attrs,
         )
     except Exception as e:
         traceback.print_exc(chain=False)
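The "[multinode-args]" placeholder appended to step_args above is rewritten per node later in kubernetes_job.py: the control replica gets "--split-index 0" and workers derive their index from RANK at runtime. A small sketch of that substitution (the command string here is made up for illustration):

command = "python flow.py step train --task-id control-123 [multinode-args]"

control_cmd = command.replace("[multinode-args]", "--split-index 0")
# Workers expand RANK in the shell when the container runs:
worker_cmd = command.replace("[multinode-args]", "--split-index `expr $RANK + 1`")

print(control_cmd)
print(worker_cmd)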
metaflow/plugins/kubernetes/kubernetes_decorator.py
CHANGED
@@ -2,6 +2,7 @@ import json
 import os
 import platform
 import sys
+import time

 from metaflow import current
 from metaflow.decorators import StepDecorator
@@ -20,10 +21,12 @@ from metaflow.metaflow_config import (
     KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
     KUBERNETES_TOLERATIONS,
     KUBERNETES_SERVICE_ACCOUNT,
+    KUBERNETES_PORT,
 )
 from metaflow.plugins.resources_decorator import ResourcesDecorator
 from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
 from metaflow.sidecar import Sidecar
+from metaflow.unbounded_foreach import UBF_CONTROL

 from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
 from .kubernetes import KubernetesException, parse_kube_keyvalue_list
@@ -88,6 +91,8 @@ class KubernetesDecorator(StepDecorator):
     persistent_volume_claims: Dict[str, str], optional
         A map (dictionary) of persistent volumes to be mounted to the pod for this step. The map is from persistent
         volumes to the path to which the volume is to be mounted, e.g., `{'pvc-name': '/path/to/mount/on'}`.
+    port: int, optional
+        Number of the port to specify in the Kubernetes job object
     """

     name = "kubernetes"
@@ -110,6 +115,7 @@ class KubernetesDecorator(StepDecorator):
         "tmpfs_size": None,
         "tmpfs_path": "/metaflow_temp",
         "persistent_volume_claims": None,  # e.g., {"pvc-name": "/mnt/vol", "another-pvc": "/mnt/vol2"}
+        "port": None,
     }
     package_url = None
     package_sha = None
@@ -195,6 +201,8 @@ class KubernetesDecorator(StepDecorator):
         if not self.attributes["tmpfs_size"]:
             # default tmpfs behavior - https://man7.org/linux/man-pages/man5/tmpfs.5.html
             self.attributes["tmpfs_size"] = int(self.attributes["memory"]) // 2
+        if not self.attributes["port"]:
+            self.attributes["port"] = KUBERNETES_PORT

     # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
     def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
@@ -216,12 +224,6 @@ class KubernetesDecorator(StepDecorator):
                 "Kubernetes. Please use one or the other.".format(step=step)
             )

-        for deco in decos:
-            if getattr(deco, "IS_PARALLEL", False):
-                raise KubernetesException(
-                    "@kubernetes does not support parallel execution currently."
-                )
-
         # Set run time limit for the Kubernetes job.
         self.run_time_limit = get_run_time_limit_for_task(decos)
         if self.run_time_limit < 60:
@@ -432,6 +434,27 @@ class KubernetesDecorator(StepDecorator):
         self._save_logs_sidecar = Sidecar("save_logs_periodically")
         self._save_logs_sidecar.start()

+        num_parallel = int(os.environ.get("WORLD_SIZE", 0))
+        if num_parallel >= 1:
+            if ubf_context == UBF_CONTROL:
+                control_task_id = current.task_id
+                top_task_id = control_task_id.replace("control-", "")
+                mapper_task_ids = [control_task_id] + [
+                    "%s-node-%d" % (top_task_id, node_idx)
+                    for node_idx in range(1, num_parallel)
+                ]
+                flow._control_mapper_tasks = [
+                    "%s/%s/%s" % (run_id, step_name, mapper_task_id)
+                    for mapper_task_id in mapper_task_ids
+                ]
+                flow._control_task_is_mapper_zero = True
+            else:
+                worker_job_rank = int(os.environ["RANK"])
+                os.environ["RANK"] = str(worker_job_rank + 1)
+
+        if num_parallel >= 1:
+            _setup_multinode_environment()
+
     def task_finished(
         self, step_name, flow, graph, is_task_ok, retry_count, max_retries
     ):
@@ -459,9 +482,53 @@ class KubernetesDecorator(StepDecorator):
             # Best effort kill
             pass

+        if is_task_ok and len(getattr(flow, "_control_mapper_tasks", [])) > 1:
+            self._wait_for_mapper_tasks(flow, step_name)
+
+    def _wait_for_mapper_tasks(self, flow, step_name):
+        """
+        When launching multinode task with UBF, need to wait for the secondary
+        tasks to finish cleanly and produce their output before exiting the
+        main task. Otherwise, the main task finishing will cause secondary nodes
+        to terminate immediately, and possibly prematurely.
+        """
+        from metaflow import Step  # avoid circular dependency
+
+        TIMEOUT = 600
+        last_completion_timeout = time.time() + TIMEOUT
+        print("Waiting for batch secondary tasks to finish")
+        while last_completion_timeout > time.time():
+            time.sleep(2)
+            try:
+                step_path = "%s/%s/%s" % (flow.name, current.run_id, step_name)
+                tasks = [task for task in Step(step_path)]
+                if len(tasks) == len(flow._control_mapper_tasks):
+                    if all(
+                        task.finished_at is not None for task in tasks
+                    ):  # for some reason task.finished fails
+                        return True
+                else:
+                    print(
+                        "Waiting for all parallel tasks to finish. Finished: {}/{}".format(
+                            len(tasks),
+                            len(flow._control_mapper_tasks),
+                        )
+                    )
+            except Exception as e:
+                pass
+        raise Exception(
+            "Batch secondary workers did not finish in %s seconds" % TIMEOUT
+        )
+
     @classmethod
     def _save_package_once(cls, flow_datastore, package):
         if cls.package_url is None:
             cls.package_url, cls.package_sha = flow_datastore.save_data(
                 [package.blob], len_hint=1
             )[0]
+
+def _setup_multinode_environment():
+    import socket
+    os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(os.environ["MASTER_ADDR"])
+    os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["WORLD_SIZE"]
+    os.environ["MF_PARALLEL_NODE_INDEX"] = os.environ["RANK"]
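The control/worker task-id scheme used in task_pre_step above can be seen in isolation. A short sketch with made-up values, showing how the control id "control-<id>" maps to the worker ids "<id>-node-1", "<id>-node-2", and so on:

num_parallel = 4
control_task_id = "control-42"
top_task_id = control_task_id.replace("control-", "")
mapper_task_ids = [control_task_id] + [
    "%s-node-%d" % (top_task_id, node_idx) for node_idx in range(1, num_parallel)
]
print(mapper_task_ids)  # ['control-42', '42-node-1', '42-node-2', '42-node-3']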
metaflow/plugins/kubernetes/kubernetes_job.py
CHANGED
@@ -2,20 +2,18 @@ import json
 import math
 import random
 import time
-
-
-
+import os
+import socket
+import copy

 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS

 CLIENT_REFRESH_INTERVAL_SECONDS = 300

-
 class KubernetesJobException(MetaflowException):
     headline = "Kubernetes job error"

-
 # Implements truncated exponential backoff from
 # https://cloud.google.com/storage/docs/retry-strategy#exponential-backoff
 def k8s_retry(deadline_seconds=60, max_backoff=32):
@@ -78,107 +76,260 @@ class KubernetesJob(object):
         tmpfs_size = self._kwargs["tmpfs_size"]
         tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)

- [removed old lines 81-102: content not shown in the source view]
+        jobset_name = "js-%s" % self._kwargs["attrs"]["metaflow.task_id"].split('-')[-1]
+        main_job_name = "control"
+        main_job_index = 0
+        main_pod_index = 0
+        subdomain = jobset_name
+        master_port = int(self._kwargs['port']) if self._kwargs['port'] else None
+
+        passwordless_ssh = self._kwargs["attrs"]["requires_passwordless_ssh"]
+        if passwordless_ssh:
+            passwordless_ssh_service_name = subdomain
+            passwordless_ssh_service_selector = {
+                "passwordless-ssh-jobset": "true"
+            }
+        else:
+            passwordless_ssh_service_name = None
+            passwordless_ssh_service_selector = {}
+
+        fqdn_suffix = "%s.svc.cluster.local" % self._kwargs["namespace"]
+        jobset_main_addr = "%s-%s-%s-%s.%s.%s" % (
+            jobset_name,
+            main_job_name,
+            main_job_index,
+            main_pod_index,
+            subdomain,
+            fqdn_suffix,
+        )
+
+        def _install_jobset(
+            repo_url="https://github.com/kubernetes-sigs/jobset",
+            python_sdk_path="jobset/sdk/python",
+        ):
+
+            # TODO (Eddie): Remove this and suggest to user.
+
+            import subprocess
+            import tempfile
+            import shutil
+            import os
+
+            with open(os.devnull, "wb") as devnull:
+                cwd = os.getcwd()
+                tmp_dir = tempfile.mkdtemp()
+                os.chdir(tmp_dir)
+                subprocess.check_call(
+                    ["git", "clone", repo_url], stdout=devnull, stderr=subprocess.STDOUT
+                )
+                tmp_python_sdk_path = os.path.join(tmp_dir, python_sdk_path)
+                os.chdir(tmp_python_sdk_path)
+                subprocess.check_call(
+                    ["pip", "install", "."], stdout=devnull, stderr=subprocess.STDOUT
+                )
+                os.chdir(cwd)
+                shutil.rmtree(tmp_dir)
+
+        def _get_passwordless_ssh_service():
+
+            return client.V1Service(
+                api_version="v1",
+                kind="Service",
+                metadata=client.V1ObjectMeta(
+                    name=passwordless_ssh_service_name,
+                    namespace=self._kwargs["namespace"]
+                ),
+                spec=client.V1ServiceSpec(
+                    cluster_ip="None",
+                    internal_traffic_policy="Cluster",
+                    ip_families=["IPv4"],
+                    ip_family_policy="SingleStack",
+                    selector=passwordless_ssh_service_selector,
+                    session_affinity="None",
+                    type="ClusterIP",
+                    ports=[
+                        client.V1ServicePort(
+                            name="control",
+                            port=22,
+                            protocol="TCP",
+                            target_port=22
+                        )
+                    ]
+                )
+            )
+
+        def _get_replicated_job(job_name, parallelism, command):
+            return jobset.models.jobset_v1alpha2_replicated_job.JobsetV1alpha2ReplicatedJob(
+                name=job_name,
+                template=client.V1JobTemplateSpec(
                     metadata=client.V1ObjectMeta(
                         annotations=self._kwargs.get("annotations", {}),
                         labels=self._kwargs.get("labels", {}),
                         namespace=self._kwargs["namespace"],
                     ),
-                    spec=client.
-                    #
- [removed old lines 110-130: content not shown in the source view]
+                    spec=client.V1JobSpec(
+                        parallelism=parallelism,  # how many jobs can run at once
+                        completions=parallelism,  # how many Pods the JobSet creates in total
+                        backoff_limit=0,
+                        ttl_seconds_after_finished=7
+                        * 60
+                        * 60
+                        * 24,
+                        template=client.V1PodTemplateSpec(
+                            metadata=client.V1ObjectMeta(
+                                annotations=self._kwargs.get("annotations", {}),
+                                labels={
+                                    **self._kwargs.get("labels", {}),
+                                    **passwordless_ssh_service_selector,  # TODO: necessary?
+                                    # TODO: cluster-name, app.kubernetes.io/name necessary?
+                                },
+                                namespace=self._kwargs["namespace"],
+                            ),
+                            spec=client.V1PodSpec(
+                                active_deadline_seconds=self._kwargs[
+                                    "timeout_in_seconds"
+                                ],
+                                containers=[
+                                    client.V1Container(
+                                        command=command,
+                                        ports=[client.V1ContainerPort(container_port=master_port)] if master_port and job_name=="control" else [],
+                                        env=[
+                                            client.V1EnvVar(name=k, value=str(v))
+                                            for k, v in self._kwargs.get(
+                                                "environment_variables", {}
+                                            ).items()
+                                        ]
+                                        + [
+                                            client.V1EnvVar(
+                                                name=k,
+                                                value_from=client.V1EnvVarSource(
+                                                    field_ref=client.V1ObjectFieldSelector(
+                                                        field_path=str(v)
+                                                    )
+                                                ),
+                                            )
+                                            for k, v in {
+                                                "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                                "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                                "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                                "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                                "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                            }.items()
+                                        ]
+                                        # Mimicking the AWS Batch Multinode env vars.
+                                        + [
+                                            client.V1EnvVar(
+                                                name="MASTER_ADDR",
+                                                value=jobset_main_addr,
+                                            ),
+                                            client.V1EnvVar(
+                                                name="MASTER_PORT",
+                                                value=str(master_port),
+                                            ),
+                                            client.V1EnvVar(
+                                                name="RANK",
+                                                value_from=client.V1EnvVarSource(
+                                                    field_ref=client.V1ObjectFieldSelector(
+                                                        field_path="metadata.annotations['batch.kubernetes.io/job-completion-index']"
+                                                    )
+                                                ),
+                                            ),
+                                            client.V1EnvVar(
+                                                name="WORLD_SIZE",
+                                                value=str(self._kwargs["num_parallel"]),
+                                            ),
+                                            client.V1EnvVar(
+                                                name="PYTHONUNBUFFERED",
+                                                value="0",
+                                            ),
+                                        ],
+                                        env_from=[
+                                            client.V1EnvFromSource(
+                                                secret_ref=client.V1SecretEnvSource(
+                                                    name=str(k),
+                                                    # optional=True
+                                                )
                                             )
+                                            for k in list(
+                                                self._kwargs.get("secrets", [])
+                                            )
+                                            + KUBERNETES_SECRETS.split(",")
+                                            if k
+                                        ],
+                                        image=self._kwargs["image"],
+                                        image_pull_policy=self._kwargs[
+                                            "image_pull_policy"
+                                        ],
+                                        name=self._kwargs["step_name"].replace(
+                                            "_", "-"
                                         ),
- [removed old lines 133-150: content not shown in the source view]
+                                        resources=client.V1ResourceRequirements(
+                                            requests={
+                                                "cpu": str(self._kwargs["cpu"]),
+                                                "memory": "%sM"
+                                                % str(self._kwargs["memory"]),
+                                                "ephemeral-storage": "%sM"
+                                                % str(self._kwargs["disk"]),
+                                            },
+                                            limits={
+                                                "%s.com/gpu".lower()
+                                                % self._kwargs["gpu_vendor"]: str(
+                                                    self._kwargs["gpu"]
+                                                )
+                                                for k in [0]
+                                                # Don't set GPU limits if gpu isn't specified.
+                                                if self._kwargs["gpu"] is not None
+                                            },
+                                        ),
+                                        volume_mounts=(
+                                            [
+                                                client.V1VolumeMount(
+                                                    mount_path=self._kwargs.get(
+                                                        "tmpfs_path"
+                                                    ),
+                                                    name="tmpfs-ephemeral-volume",
+                                                )
+                                            ]
+                                            if tmpfs_enabled
+                                            else []
                                         )
+                                        + (
+                                            [
+                                                client.V1VolumeMount(
+                                                    mount_path=path, name=claim
+                                                )
+                                                for claim, path in self._kwargs[
+                                                    "persistent_volume_claims"
+                                                ].items()
+                                            ]
+                                            if self._kwargs["persistent_volume_claims"]
+                                            is not None
+                                            else []
+                                        ),
                                     )
-                                    for k in list(self._kwargs.get("secrets", []))
-                                    + KUBERNETES_SECRETS.split(",")
-                                    if k
                                 ],
- [removed old lines 157-171: content not shown in the source view]
-                                    for k in [0]
-                                    # Don't set GPU limits if gpu isn't specified.
-                                    if self._kwargs["gpu"] is not None
-                                },
-                            ),
-                            volume_mounts=(
+                                node_selector=self._kwargs.get("node_selector"),
+                                restart_policy="Never",
+
+                                set_hostname_as_fqdn=True,  # configure pod hostname as pod's FQDN
+                                share_process_namespace=False,  # default
+                                subdomain=subdomain,  # FQDN = <hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>
+
+                                service_account_name=self._kwargs["service_account"],
+                                termination_grace_period_seconds=0,
+                                tolerations=[
+                                    client.V1Toleration(**toleration)
+                                    for toleration in self._kwargs.get("tolerations")
+                                    or []
+                                ],
+                                volumes=(
                                     [
-                                        client.
-                                        mount_path=self._kwargs.get("tmpfs_path"),
+                                        client.V1Volume(
                                             name="tmpfs-ephemeral-volume",
+                                            empty_dir=client.V1EmptyDirVolumeSource(
+                                                medium="Memory",
+                                                size_limit="{}Mi".format(tmpfs_size),
+                                            ),
                                         )
                                     ]
                                     if tmpfs_enabled
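The jobset_main_addr built near the top of this hunk follows the JobSet pod DNS convention: <jobset>-<replicatedJob>-<jobIndex>-<podIndex>.<subdomain>.<namespace>.svc.cluster.local. A sketch of the same string assembly with placeholder values:

jobset_name = "js-abc123"
namespace = "default"
jobset_main_addr = "%s-%s-%s-%s.%s.%s.svc.cluster.local" % (
    jobset_name, "control", 0, 0, jobset_name, namespace
)
print(jobset_main_addr)  # js-abc123-control-0-0.js-abc123.default.svc.cluster.local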
@@ -186,72 +337,264 @@ class KubernetesJob(object):
                                 )
                                 + (
                                     [
-                                        client.
-
+                                        client.V1Volume(
+                                            name=claim,
+                                            persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                                claim_name=claim
+                                            ),
                                         )
-                                        for claim
+                                        for claim in self._kwargs[
                                             "persistent_volume_claims"
-                                        ].
+                                        ].keys()
                                     ]
                                     if self._kwargs["persistent_volume_claims"]
                                     is not None
                                     else []
                                 ),
-                            )
- [removed old lines 201-228: content not shown in the source view]
+                            ),
+                        ),
+                    ),
+                ),
+            )
+
+        if "num_parallel" in self._kwargs and self._kwargs["num_parallel"] >= 1:
+
+            try:
+                import jobset
+            except ImportError:
+                _install_jobset()
+                import jobset
+
+            main_commands = copy.copy(self._kwargs["command"])
+            main_commands[-1] = main_commands[-1].replace(
+                "[multinode-args]", "--split-index 0"
+            )
+
+            task_id = self._kwargs["attrs"]["metaflow.task_id"]
+            secondary_commands = copy.copy(self._kwargs["command"])
+            # RANK needs +1 because control node is not in the worker index group, yet we want global nodes.
+            # Technically, control and worker could be same replicated job type, but cleaner to separate for future use cases.
+            secondary_commands[-1] = secondary_commands[-1].replace(
+                "[multinode-args]", "--split-index `expr $RANK + 1`"
+            )
+            secondary_commands[-1] = secondary_commands[-1].replace(
+                "ubf_control", "ubf_task"
+            )
+            secondary_commands[-1] = secondary_commands[-1].replace(
+                task_id,
+                task_id.replace("control-", "") + "-node-`expr $RANK + 1`",
+            )
+
+            if passwordless_ssh:
+                if not os.path.exists("/usr/sbin/sshd"):
+                    raise KubernetesJobException(
+                        "This @parallel decorator requires sshd to be installed in the container image."
+                        "Please install OpenSSH."
+                    )
+
+                # run sshd in background
+                main_commands[-1] = "/usr/sbin/sshd -D & %s" % main_commands[-1]
+                secondary_commands[-1] = "/usr/sbin/sshd -D & %s" % secondary_commands[-1]
+
+            self._jobset = jobset.models.jobset_v1alpha2_job_set.JobsetV1alpha2JobSet(
+                api_version="jobset.x-k8s.io/v1alpha2",
+                kind="JobSet",
+                metadata=client.V1ObjectMeta(
+                    annotations=self._kwargs.get("annotations", {}),
+                    labels=self._kwargs.get("labels", {}),
+                    name=jobset_name,
+                    namespace=self._kwargs["namespace"],
+                ),
+                spec=jobset.models.jobset_v1alpha2_job_set_spec.JobsetV1alpha2JobSetSpec(
+                    network=jobset.models.jobset_v1alpha2_network.JobsetV1alpha2Network(
+                        enable_dns_hostnames=True if not self._kwargs['attrs']['requires_passwordless_ssh'] else False,
+                        subdomain=subdomain
+                    ),
+                    replicated_jobs=[
+                        _get_replicated_job("control", 1, main_commands),
+                        _get_replicated_job(
+                            "worker",
+                            self._kwargs["num_parallel"] - 1,
+                            secondary_commands,
+                        ),
+                    ],
+                ),
+            )
+            self._passwordless_ssh_service = _get_passwordless_ssh_service()
+        else:
+            self._job = client.V1Job(
+                api_version="batch/v1",
+                kind="Job",
+                metadata=client.V1ObjectMeta(
+                    # Annotations are for humans
+                    annotations=self._kwargs.get("annotations", {}),
+                    # While labels are for Kubernetes
+                    labels=self._kwargs.get("labels", {}),
+                    generate_name=self._kwargs["generate_name"],
+                    namespace=self._kwargs["namespace"],  # Defaults to `default`
+                ),
+                spec=client.V1JobSpec(
+                    # Retries are handled by Metaflow when it is responsible for
+                    # executing the flow. The responsibility is moved to Kubernetes
+                    # when Argo Workflows is responsible for the execution.
+                    backoff_limit=self._kwargs.get("retries", 0),
+                    completions=1,  # A single non-indexed pod job
+                    ttl_seconds_after_finished=7
+                    * 60
+                    * 60  # Remove job after a week. TODO: Make this configurable
+                    * 24,
+                    template=client.V1PodTemplateSpec(
+                        metadata=client.V1ObjectMeta(
+                            annotations=self._kwargs.get("annotations", {}),
+                            labels=self._kwargs.get("labels", {}),
+                            namespace=self._kwargs["namespace"],
+                        ),
+                        spec=client.V1PodSpec(
+                            # Timeout is set on the pod and not the job (important!)
+                            active_deadline_seconds=self._kwargs["timeout_in_seconds"],
+                            # TODO (savin): Enable affinities for GPU scheduling.
+                            # affinity=?,
+                            containers=[
+                                client.V1Container(
+                                    command=self._kwargs["command"],
+                                    env=[
+                                        client.V1EnvVar(name=k, value=str(v))
+                                        for k, v in self._kwargs.get(
+                                            "environment_variables", {}
+                                        ).items()
+                                    ]
+                                    # And some downward API magic. Add (key, value)
+                                    # pairs below to make pod metadata available
+                                    # within Kubernetes container.
+                                    + [
+                                        client.V1EnvVar(
+                                            name=k,
+                                            value_from=client.V1EnvVarSource(
+                                                field_ref=client.V1ObjectFieldSelector(
+                                                    field_path=str(v)
+                                                )
+                                            ),
+                                        )
+                                        for k, v in {
+                                            "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                            "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                            "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                            "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                            "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                        }.items()
+                                    ],
+                                    env_from=[
+                                        client.V1EnvFromSource(
+                                            secret_ref=client.V1SecretEnvSource(
+                                                name=str(k),
+                                                # optional=True
+                                            )
+                                        )
+                                        for k in list(self._kwargs.get("secrets", []))
+                                        + KUBERNETES_SECRETS.split(",")
+                                        if k
+                                    ],
+                                    image=self._kwargs["image"],
+                                    image_pull_policy=self._kwargs["image_pull_policy"],
+                                    name=self._kwargs["step_name"].replace("_", "-"),
+                                    resources=client.V1ResourceRequirements(
+                                        requests={
+                                            "cpu": str(self._kwargs["cpu"]),
+                                            "memory": "%sM"
+                                            % str(self._kwargs["memory"]),
+                                            "ephemeral-storage": "%sM"
+                                            % str(self._kwargs["disk"]),
+                                        },
+                                        limits={
+                                            "%s.com/gpu".lower()
+                                            % self._kwargs["gpu_vendor"]: str(
+                                                self._kwargs["gpu"]
+                                            )
+                                            for k in [0]
+                                            # Don't set GPU limits if gpu isn't specified.
+                                            if self._kwargs["gpu"] is not None
+                                        },
                                     ),
- [removed old lines 230-240: content not shown in the source view]
+                                    volume_mounts=(
+                                        [
+                                            client.V1VolumeMount(
+                                                mount_path=self._kwargs.get(
+                                                    "tmpfs_path"
+                                                ),
+                                                name="tmpfs-ephemeral-volume",
+                                            )
+                                        ]
+                                        if tmpfs_enabled
+                                        else []
+                                    )
+                                    + (
+                                        [
+                                            client.V1VolumeMount(
+                                                mount_path=path, name=claim
+                                            )
+                                            for claim, path in self._kwargs[
+                                                "persistent_volume_claims"
+                                            ].items()
+                                        ]
+                                        if self._kwargs["persistent_volume_claims"]
+                                        is not None
+                                        else []
                                     ),
                                 )
- [removed old lines 243-248: content not shown in the source view]
+                            ],
+                            node_selector=self._kwargs.get("node_selector"),
+                            # TODO (savin): Support image_pull_secrets
+                            # image_pull_secrets=?,
+                            # TODO (savin): Support preemption policies
+                            # preemption_policy=?,
+                            #
+                            # A Container in a Pod may fail for a number of
+                            # reasons, such as because the process in it exited
+                            # with a non-zero exit code, or the Container was
+                            # killed due to OOM etc. If this happens, fail the pod
+                            # and let Metaflow handle the retries.
+                            restart_policy="Never",
+                            service_account_name=self._kwargs["service_account"],
+                            # Terminate the container immediately on SIGTERM
+                            termination_grace_period_seconds=0,
+                            tolerations=[
+                                client.V1Toleration(**toleration)
+                                for toleration in self._kwargs.get("tolerations") or []
+                            ],
+                            volumes=(
+                                [
+                                    client.V1Volume(
+                                        name="tmpfs-ephemeral-volume",
+                                        empty_dir=client.V1EmptyDirVolumeSource(
+                                            medium="Memory",
+                                            # Add default unit as ours differs from Kubernetes default.
+                                            size_limit="{}Mi".format(tmpfs_size),
+                                        ),
+                                    )
+                                ]
+                                if tmpfs_enabled
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1Volume(
+                                        name=claim,
+                                        persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                            claim_name=claim
+                                        ),
+                                    )
+                                    for claim in self._kwargs[
+                                        "persistent_volume_claims"
+                                    ].keys()
+                                ]
+                                if self._kwargs["persistent_volume_claims"] is not None
+                                else []
+                            ),
+                            # TODO (savin): Set termination_message_policy
                         ),
-                        # TODO (savin): Set termination_message_policy
                     ),
                 ),
-            )
-        )
+            )
         return self

     def execute(self):
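Both the JobSet and the plain Job specs set GPU limits with a dict comprehension over [0], which yields either one entry or none, so no GPU limit is emitted when gpu is unset. A standalone sketch of that pattern (values are made up for illustration):

gpu = None
gpu_vendor = "nvidia"
limits = {
    "%s.com/gpu".lower() % gpu_vendor: str(gpu)
    for _ in [0]
    if gpu is not None
}
print(limits)  # {} when gpu is None; {'nvidia.com/gpu': '2'} when gpu = 2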
@@ -262,19 +605,53 @@ class KubernetesJob(object):
            # achieve the guarantees that we are seeking.
            # https://github.com/kubernetes/enhancements/issues/1040
            # Hopefully, we will be able to get creative with kube-batch
- [removed old lines 265-268: content not fully shown in the source view]
+
+            if "num_parallel" in self._kwargs and self._kwargs["num_parallel"] >= 1:
+                # TODO (Eddie): this is kinda gross. fix it.
+                if self._kwargs["attrs"]["requires_passwordless_ssh"]:
+                    api_instance = client.CoreV1Api()
+                    api_response = api_instance.create_namespaced_service(namespace=self._kwargs['namespace'], body=self._passwordless_ssh_service)
+
+                with client.ApiClient() as api_client:
+                    api_instance = client.CustomObjectsApi(api_client)
+
+                    response = api_instance.create_namespaced_custom_object(
+                        body=self._jobset,
+                        group="jobset.x-k8s.io",
+                        version="v1alpha2",
+                        namespace=self._kwargs["namespace"],
+                        plural="jobsets",
                     )
- [removed old lines 270-277: content not shown in the source view]
+
+                # HACK: Give K8s some time to actually create the job
+                time.sleep(10)
+
+                # TODO (Eddie): Remove hack and make RunningJobSet.
+                # There are many jobs running that should be monitored.
+                job_name = "%s-control-0" % response["metadata"]["name"]
+                fake_id = 123
+                return RunningJob(
+                    client=self._client,
+                    name=job_name,
+                    uid=fake_id,
+                    namespace=response["metadata"]["namespace"],
+                )
+
+            else:
+                response = (
+                    client.BatchV1Api()
+                    .create_namespaced_job(
+                        body=self._job, namespace=self._kwargs["namespace"]
+                    )
+                    .to_dict()
+                )
+                return RunningJob(
+                    client=self._client,
+                    name=response["metadata"]["name"],
+                    uid=response["metadata"]["uid"],
+                    namespace=response["metadata"]["namespace"],
+                )
+
        except client.rest.ApiException as e:
            raise KubernetesJobException(
                "Unable to launch Kubernetes job.\n %s"
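The JobSet branch above submits the custom object through the generic CustomObjectsApi with group jobset.x-k8s.io, version v1alpha2, and plural jobsets. A minimal self-contained sketch of such a call outside Metaflow (the manifest contents and the kubeconfig handling here are assumptions for illustration only):

from kubernetes import client, config

config.load_kube_config()  # assumption: a local kubeconfig with access to a cluster running the JobSet CRD
jobset_manifest = {
    # minimal illustrative body; the shipped code passes a generated JobSet model instead
    "apiVersion": "jobset.x-k8s.io/v1alpha2",
    "kind": "JobSet",
    "metadata": {"name": "js-example", "namespace": "default"},
    "spec": {"replicatedJobs": []},
}
api = client.CustomObjectsApi()
response = api.create_namespaced_custom_object(
    group="jobset.x-k8s.io",
    version="v1alpha2",
    namespace="default",
    plural="jobsets",
    body=jobset_manifest,
)
print(response["metadata"]["name"])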
@@ -330,7 +707,6 @@ class KubernetesJob(object):


 class RunningJob(object):
-
     # State Machine implementation for the lifecycle behavior documented in
     # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/
     #
@@ -450,7 +826,6 @@ class RunningJob(object):
         client = self._client.get()
         if not self.is_done:
             if self.is_running:
-
                 # Case 1.
                 from kubernetes.stream import stream

metaflow/version.py
CHANGED
@@ -1 +1 @@
-metaflow_version = "2.11.0.1"
+metaflow_version = "2.11.0.2"
{ob_metaflow-2.11.0.1.dist-info → ob_metaflow-2.11.0.2.dist-info}/RECORD
CHANGED
@@ -15,7 +15,7 @@ metaflow/graph.py,sha256=ZPxyG8uwVMk5YYgX4pQEQaPZtZM5Wy-G4NtJK73IEuA,11818
 metaflow/includefile.py,sha256=BVQLYTLZN7m3ibFnsTU70dPj9YskxZeQb1FosV3k4-o,19721
 metaflow/integrations.py,sha256=LlsaoePRg03DjENnmLxZDYto3NwWc9z_PtU6nJxLldg,1480
 metaflow/lint.py,sha256=_kYAbAtsP7IG1Rd0FqNbo8I8Zs66_0WXbaZJFARO3dE,10394
-metaflow/metaflow_config.py,sha256=
+metaflow/metaflow_config.py,sha256=LBEDdQskwtstZxhtSP9ONInccjZAjB7nWBrBce_Fpg0,19081
 metaflow/metaflow_config_funcs.py,sha256=pCaiQ2ez9wXixJI3ehmf3QiW9lUqFrZnBZx1my_0wIg,4874
 metaflow/metaflow_environment.py,sha256=JdsmQsYp1SDQniQ0-q1mKRrmzSFfYuzrf6jLEHmyaiM,7352
 metaflow/metaflow_profile.py,sha256=jKPEW-hmAQO-htSxb9hXaeloLacAh41A35rMZH6G8pA,418
@@ -33,7 +33,7 @@ metaflow/task.py,sha256=yGNU3T3giKiG--vE0DUj_K-8jur2TclCS45XjPVLcq4,25314
 metaflow/unbounded_foreach.py,sha256=p184WMbrMJ3xKYHwewj27ZhRUsSj_kw1jlye5gA9xJk,387
 metaflow/util.py,sha256=jbMJ17rK-dFTBCjimWqxkfcr3v__bHa3tZtX0g8iS2c,13257
 metaflow/vendor.py,sha256=LZgXrh7ZSDmD32D1T5jj3OKKpXIqqxKzdMAOc5V0SD4,5162
-metaflow/version.py,sha256=
+metaflow/version.py,sha256=gXS_wIDHs2sEK4Lt7UOfOM6t13X5UUPilPOmvUUcpgA,30
 metaflow/_vendor/__init__.py,sha256=y_CiwUD3l4eAKvTVDZeqgVujMy31cAM1qjAB-HfI-9s,353
 metaflow/_vendor/click/__init__.py,sha256=FkyGDQ-cbiQxP_lxgUspyFYS48f2S_pTcfKPz-d_RMo,2463
 metaflow/_vendor/click/_bashcomplete.py,sha256=9J98IHQYmCAr2Jup6TDshUr5FJEen-AoQCZR0K5nKxQ,12309
@@ -145,7 +145,7 @@ metaflow/plugins/airflow/sensors/s3_sensor.py,sha256=zym4mUm_f_gBsvHHVqGtX_OOxRj
 metaflow/plugins/argo/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 metaflow/plugins/argo/argo_client.py,sha256=MKKhMCbWOPzf6z5zQQiyDRHHkAXcO7ipboDZDqAAvOk,15849
 metaflow/plugins/argo/argo_events.py,sha256=TIEOHrvUQ079YzzpzpFFtZjeU2x7hoofyZ6ytCorm2c,5911
-metaflow/plugins/argo/argo_workflows.py,sha256=
+metaflow/plugins/argo/argo_workflows.py,sha256=h-zXFauJce-44eKA8vh9UaW5kIdaN7irz4QwfrfFCNQ,119978
 metaflow/plugins/argo/argo_workflows_cli.py,sha256=sZTpgfmc50eT3e0qIxpVqUgWhTcYlO1HM4gU6Oaya8g,33259
 metaflow/plugins/argo/argo_workflows_decorator.py,sha256=CfKVoHCOsCCQMghhPE30xw15gacwp3hR23HCo9ZZFVg,6580
 metaflow/plugins/argo/process_input_paths.py,sha256=LjUSP8PVU-DRGEPxjas99nzyAO-fI82Bxxbr_QETE88,565
@@ -243,11 +243,11 @@ metaflow/plugins/gcp/gs_tail.py,sha256=Jl_wvnzU7dub07A-DOAuP5FeccNIrPM-CeL1xKFs1
 metaflow/plugins/gcp/gs_utils.py,sha256=ZmIGFse1qYyvAVrwga23PQUzF6dXEDLLsZ2F-YRmvow,2030
 metaflow/plugins/gcp/includefile_support.py,sha256=vIDeR-MiJuUh-2S2pV7Z7FBkhIWwtHXaRrj76MWGRiY,3869
 metaflow/plugins/kubernetes/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-metaflow/plugins/kubernetes/kubernetes.py,sha256=
-metaflow/plugins/kubernetes/kubernetes_cli.py,sha256=
+metaflow/plugins/kubernetes/kubernetes.py,sha256=ePh4vzHDJFkooJKT75zJgipjQlwslyRs1VFcZVYkabE,18834
+metaflow/plugins/kubernetes/kubernetes_cli.py,sha256=RugVe3UHWFGd03OM76fSzxSt3QYAT8KHQ5-iiKzQrGA,10092
 metaflow/plugins/kubernetes/kubernetes_client.py,sha256=dV3TEGQMBbljmv6Gs1EKfmHTorKt21lhSiYsNx0To08,1901
-metaflow/plugins/kubernetes/kubernetes_decorator.py,sha256=
-metaflow/plugins/kubernetes/kubernetes_job.py,sha256=
+metaflow/plugins/kubernetes/kubernetes_decorator.py,sha256=Rs2KGy0yInQmMq9W2jEockiq2eOrrnd1TAMmpu1Q9pA,24103
+metaflow/plugins/kubernetes/kubernetes_job.py,sha256=8LNMwZSz1afbQXrPNJnDo_nTMIA0SQza6yjxkf2N2_k,50853
 metaflow/plugins/metadata/__init__.py,sha256=AbpHGcgLb-kRsJGnwFEktk7uzpZOCcBY74-YBdrKVGs,1
 metaflow/plugins/metadata/local.py,sha256=YhLJC5zjVJrvQFIyQ92ZBByiUmhCC762RUX7ITX12O8,22428
 metaflow/plugins/metadata/service.py,sha256=ihq5F7KQZlxvYwzH_-jyP2aWN_I96i2vp92j_d697s8,20204
@@ -295,9 +295,9 @@ metaflow/tutorials/07-worldview/README.md,sha256=5vQTrFqulJ7rWN6r20dhot9lI2sVj9W
 metaflow/tutorials/07-worldview/worldview.ipynb,sha256=ztPZPI9BXxvW1QdS2Tfe7LBuVzvFvv0AToDnsDJhLdE,2237
 metaflow/tutorials/08-autopilot/README.md,sha256=GnePFp_q76jPs991lMUqfIIh5zSorIeWznyiUxzeUVE,1039
 metaflow/tutorials/08-autopilot/autopilot.ipynb,sha256=DQoJlILV7Mq9vfPBGW-QV_kNhWPjS5n6SJLqePjFYLY,3191
-ob_metaflow-2.11.0.
-ob_metaflow-2.11.0.
-ob_metaflow-2.11.0.
-ob_metaflow-2.11.0.
-ob_metaflow-2.11.0.
-ob_metaflow-2.11.0.
+ob_metaflow-2.11.0.2.dist-info/LICENSE,sha256=nl_Lt5v9VvJ-5lWJDT4ddKAG-VZ-2IaLmbzpgYDz2hU,11343
+ob_metaflow-2.11.0.2.dist-info/METADATA,sha256=K4SO4xxndoBOLUCcxWPSyonbeAL6FnLqqTLCOq0CSZU,5061
+ob_metaflow-2.11.0.2.dist-info/WHEEL,sha256=-G_t0oGuE7UD0DrSpVZnq1hHMBV9DD2XkS5v7XpmTnk,110
+ob_metaflow-2.11.0.2.dist-info/entry_points.txt,sha256=IKwTN1T3I5eJL3uo_vnkyxVffcgnRdFbKwlghZfn27k,57
+ob_metaflow-2.11.0.2.dist-info/top_level.txt,sha256=v1pDHoWaSaKeuc5fKTRSfsXCKSdW1zvNVmvA-i0if3o,9
+ob_metaflow-2.11.0.2.dist-info/RECORD,,

Files without changes: LICENSE, WHEEL, entry_points.txt, top_level.txt