ob-metaflow 2.11.15.2__py2.py3-none-any.whl → 2.11.16.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow might be problematic. Click here for more details.
- metaflow/__init__.py +3 -0
- metaflow/clone_util.py +6 -0
- metaflow/extension_support/plugins.py +1 -1
- metaflow/metaflow_config.py +6 -4
- metaflow/metaflow_environment.py +3 -3
- metaflow/plugins/__init__.py +4 -4
- metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +18 -14
- metaflow/plugins/datatools/s3/s3.py +1 -1
- metaflow/plugins/gcp/__init__.py +1 -1
- metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
- metaflow/plugins/kubernetes/kubernetes.py +79 -49
- metaflow/plugins/kubernetes/kubernetes_cli.py +20 -33
- metaflow/plugins/kubernetes/kubernetes_client.py +4 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +44 -61
- metaflow/plugins/kubernetes/kubernetes_job.py +217 -584
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +784 -0
- metaflow/plugins/timeout_decorator.py +2 -1
- metaflow/task.py +1 -12
- metaflow/tuple_util.py +27 -0
- metaflow/util.py +0 -15
- metaflow/version.py +1 -1
- {ob_metaflow-2.11.15.2.dist-info → ob_metaflow-2.11.16.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.11.15.2.dist-info → ob_metaflow-2.11.16.1.dist-info}/RECORD +27 -25
- {ob_metaflow-2.11.15.2.dist-info → ob_metaflow-2.11.16.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.11.15.2.dist-info → ob_metaflow-2.11.16.1.dist-info}/WHEEL +0 -0
- {ob_metaflow-2.11.15.2.dist-info → ob_metaflow-2.11.16.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.11.15.2.dist-info → ob_metaflow-2.11.16.1.dist-info}/top_level.txt +0 -0
|
@@ -2,7 +2,6 @@ import json
|
|
|
2
2
|
import os
|
|
3
3
|
import platform
|
|
4
4
|
import sys
|
|
5
|
-
import time
|
|
6
5
|
|
|
7
6
|
from metaflow import current
|
|
8
7
|
from metaflow.decorators import StepDecorator
|
|
@@ -21,7 +20,6 @@ from metaflow.metaflow_config import (
|
|
|
21
20
|
KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
|
|
22
21
|
KUBERNETES_TOLERATIONS,
|
|
23
22
|
KUBERNETES_SERVICE_ACCOUNT,
|
|
24
|
-
KUBERNETES_PORT,
|
|
25
23
|
KUBERNETES_SHARED_MEMORY,
|
|
26
24
|
KUBERNETES_PORT,
|
|
27
25
|
KUBERNETES_CPU,
|
|
@@ -31,10 +29,11 @@ from metaflow.metaflow_config import (
|
|
|
31
29
|
from metaflow.plugins.resources_decorator import ResourcesDecorator
|
|
32
30
|
from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
|
|
33
31
|
from metaflow.sidecar import Sidecar
|
|
34
|
-
from metaflow.unbounded_foreach import UBF_CONTROL
|
|
35
32
|
|
|
36
33
|
from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
|
|
37
34
|
from .kubernetes import KubernetesException, parse_kube_keyvalue_list
|
|
35
|
+
from metaflow.unbounded_foreach import UBF_CONTROL
|
|
36
|
+
from .kubernetes_jobsets import TaskIdConstructor
|
|
38
37
|
|
|
39
38
|
from metaflow.metaflow_config import MAX_MEMORY_PER_TASK, MAX_CPU_PER_TASK
|
|
40
39
|
|
|
@@ -244,6 +243,16 @@ class KubernetesDecorator(StepDecorator):
|
|
|
244
243
|
"Kubernetes. Please use one or the other.".format(step=step)
|
|
245
244
|
)
|
|
246
245
|
|
|
246
|
+
if any([deco.name == "parallel" for deco in decos]) and any(
|
|
247
|
+
[deco.name == "catch" for deco in decos]
|
|
248
|
+
):
|
|
249
|
+
raise MetaflowException(
|
|
250
|
+
"Step *{step}* contains a @parallel decorator "
|
|
251
|
+
"with the @catch decorator. @catch is not supported with @parallel on Kubernetes.".format(
|
|
252
|
+
step=step
|
|
253
|
+
)
|
|
254
|
+
)
|
|
255
|
+
|
|
247
256
|
# Set run time limit for the Kubernetes job.
|
|
248
257
|
self.run_time_limit = get_run_time_limit_for_task(decos)
|
|
249
258
|
if self.run_time_limit < 60:
|
|
@@ -327,7 +336,7 @@ class KubernetesDecorator(StepDecorator):
|
|
|
327
336
|
|
|
328
337
|
if self.attributes["shared_memory"]:
|
|
329
338
|
if not (
|
|
330
|
-
isinstance(self.attributes["shared_memory"],
|
|
339
|
+
isinstance(self.attributes["shared_memory"], int)
|
|
331
340
|
and int(self.attributes["shared_memory"]) > 0
|
|
332
341
|
):
|
|
333
342
|
raise KubernetesException(
|
|
@@ -433,6 +442,10 @@ class KubernetesDecorator(StepDecorator):
|
|
|
433
442
|
"METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"
|
|
434
443
|
]
|
|
435
444
|
meta["kubernetes-node-ip"] = os.environ["METAFLOW_KUBERNETES_NODE_IP"]
|
|
445
|
+
if os.environ.get("METAFLOW_KUBERNETES_JOBSET_NAME"):
|
|
446
|
+
meta["kubernetes-jobset-name"] = os.environ[
|
|
447
|
+
"METAFLOW_KUBERNETES_JOBSET_NAME"
|
|
448
|
+
]
|
|
436
449
|
|
|
437
450
|
# TODO (savin): Introduce equivalent support for Microsoft Azure and
|
|
438
451
|
# Google Cloud Platform
|
|
@@ -465,25 +478,22 @@ class KubernetesDecorator(StepDecorator):
|
|
|
465
478
|
self._save_logs_sidecar = Sidecar("save_logs_periodically")
|
|
466
479
|
self._save_logs_sidecar.start()
|
|
467
480
|
|
|
468
|
-
num_parallel =
|
|
469
|
-
if
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
else:
|
|
483
|
-
worker_job_rank = int(os.environ["RANK"])
|
|
484
|
-
os.environ["RANK"] = str(worker_job_rank + 1)
|
|
481
|
+
num_parallel = None
|
|
482
|
+
if hasattr(flow, "_parallel_ubf_iter"):
|
|
483
|
+
num_parallel = flow._parallel_ubf_iter.num_parallel
|
|
484
|
+
|
|
485
|
+
if num_parallel and num_parallel >= 1 and ubf_context == UBF_CONTROL:
|
|
486
|
+
control_task_id, worker_task_ids = TaskIdConstructor.join_step_task_ids(
|
|
487
|
+
num_parallel
|
|
488
|
+
)
|
|
489
|
+
mapper_task_ids = [control_task_id] + worker_task_ids
|
|
490
|
+
flow._control_mapper_tasks = [
|
|
491
|
+
"%s/%s/%s" % (run_id, step_name, mapper_task_id)
|
|
492
|
+
for mapper_task_id in mapper_task_ids
|
|
493
|
+
]
|
|
494
|
+
flow._control_task_is_mapper_zero = True
|
|
485
495
|
|
|
486
|
-
if num_parallel
|
|
496
|
+
if num_parallel and num_parallel > 1:
|
|
487
497
|
_setup_multinode_environment()
|
|
488
498
|
|
|
489
499
|
def task_finished(
|
|
@@ -513,44 +523,6 @@ class KubernetesDecorator(StepDecorator):
|
|
|
513
523
|
# Best effort kill
|
|
514
524
|
pass
|
|
515
525
|
|
|
516
|
-
if is_task_ok and len(getattr(flow, "_control_mapper_tasks", [])) > 1:
|
|
517
|
-
self._wait_for_mapper_tasks(flow, step_name)
|
|
518
|
-
|
|
519
|
-
def _wait_for_mapper_tasks(self, flow, step_name):
|
|
520
|
-
"""
|
|
521
|
-
When launching multinode task with UBF, need to wait for the secondary
|
|
522
|
-
tasks to finish cleanly and produce their output before exiting the
|
|
523
|
-
main task. Otherwise, the main task finishing will cause secondary nodes
|
|
524
|
-
to terminate immediately, and possibly prematurely.
|
|
525
|
-
"""
|
|
526
|
-
from metaflow import Step # avoid circular dependency
|
|
527
|
-
|
|
528
|
-
TIMEOUT = 600
|
|
529
|
-
last_completion_timeout = time.time() + TIMEOUT
|
|
530
|
-
print("Waiting for batch secondary tasks to finish")
|
|
531
|
-
while last_completion_timeout > time.time():
|
|
532
|
-
time.sleep(2)
|
|
533
|
-
try:
|
|
534
|
-
step_path = "%s/%s/%s" % (flow.name, current.run_id, step_name)
|
|
535
|
-
tasks = [task for task in Step(step_path)]
|
|
536
|
-
if len(tasks) == len(flow._control_mapper_tasks):
|
|
537
|
-
if all(
|
|
538
|
-
task.finished_at is not None for task in tasks
|
|
539
|
-
): # for some reason task.finished fails
|
|
540
|
-
return True
|
|
541
|
-
else:
|
|
542
|
-
print(
|
|
543
|
-
"Waiting for all parallel tasks to finish. Finished: {}/{}".format(
|
|
544
|
-
len(tasks),
|
|
545
|
-
len(flow._control_mapper_tasks),
|
|
546
|
-
)
|
|
547
|
-
)
|
|
548
|
-
except Exception as e:
|
|
549
|
-
pass
|
|
550
|
-
raise Exception(
|
|
551
|
-
"Batch secondary workers did not finish in %s seconds" % TIMEOUT
|
|
552
|
-
)
|
|
553
|
-
|
|
554
526
|
@classmethod
|
|
555
527
|
def _save_package_once(cls, flow_datastore, package):
|
|
556
528
|
if cls.package_url is None:
|
|
@@ -558,8 +530,19 @@ class KubernetesDecorator(StepDecorator):
|
|
|
558
530
|
[package.blob], len_hint=1
|
|
559
531
|
)[0]
|
|
560
532
|
|
|
533
|
+
|
|
561
534
|
def _setup_multinode_environment():
|
|
562
535
|
import socket
|
|
536
|
+
|
|
563
537
|
os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(os.environ["MASTER_ADDR"])
|
|
564
538
|
os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["WORLD_SIZE"]
|
|
565
|
-
os.environ
|
|
539
|
+
if os.environ.get("CONTROL_INDEX") is not None:
|
|
540
|
+
os.environ["MF_PARALLEL_NODE_INDEX"] = str(0)
|
|
541
|
+
elif os.environ.get("WORKER_REPLICA_INDEX") is not None:
|
|
542
|
+
os.environ["MF_PARALLEL_NODE_INDEX"] = str(
|
|
543
|
+
int(os.environ["WORKER_REPLICA_INDEX"]) + 1
|
|
544
|
+
)
|
|
545
|
+
else:
|
|
546
|
+
raise MetaflowException(
|
|
547
|
+
"Jobset related ENV vars called $CONTROL_INDEX or $WORKER_REPLICA_INDEX not found"
|
|
548
|
+
)
|