ob-metaflow 2.12.7.1__py2.py3-none-any.whl → 2.12.9.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow might be problematic. Click here for more details.
- metaflow/__init__.py +2 -0
- metaflow/cli.py +12 -4
- metaflow/extension_support/plugins.py +1 -0
- metaflow/flowspec.py +8 -1
- metaflow/lint.py +13 -0
- metaflow/metaflow_current.py +0 -8
- metaflow/plugins/__init__.py +12 -0
- metaflow/plugins/argo/argo_workflows.py +463 -42
- metaflow/plugins/argo/argo_workflows_cli.py +60 -3
- metaflow/plugins/argo/argo_workflows_decorator.py +38 -7
- metaflow/plugins/argo/argo_workflows_deployer.py +290 -0
- metaflow/plugins/argo/jobset_input_paths.py +16 -0
- metaflow/plugins/aws/batch/batch_decorator.py +16 -13
- metaflow/plugins/aws/step_functions/step_functions_cli.py +45 -3
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +251 -0
- metaflow/plugins/cards/card_cli.py +1 -1
- metaflow/plugins/kubernetes/kubernetes.py +279 -52
- metaflow/plugins/kubernetes/kubernetes_cli.py +26 -8
- metaflow/plugins/kubernetes/kubernetes_client.py +0 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +56 -44
- metaflow/plugins/kubernetes/kubernetes_job.py +6 -6
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +510 -272
- metaflow/plugins/parallel_decorator.py +108 -8
- metaflow/plugins/pypi/bootstrap.py +1 -1
- metaflow/plugins/pypi/micromamba.py +1 -1
- metaflow/plugins/secrets/secrets_decorator.py +12 -3
- metaflow/plugins/test_unbounded_foreach_decorator.py +39 -4
- metaflow/runner/deployer.py +386 -0
- metaflow/runner/metaflow_runner.py +1 -20
- metaflow/runner/nbdeploy.py +130 -0
- metaflow/runner/nbrun.py +4 -28
- metaflow/runner/utils.py +49 -0
- metaflow/runtime.py +246 -134
- metaflow/version.py +1 -1
- {ob_metaflow-2.12.7.1.dist-info → ob_metaflow-2.12.9.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.12.7.1.dist-info → ob_metaflow-2.12.9.1.dist-info}/RECORD +40 -34
- {ob_metaflow-2.12.7.1.dist-info → ob_metaflow-2.12.9.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.12.7.1.dist-info → ob_metaflow-2.12.9.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.12.7.1.dist-info → ob_metaflow-2.12.9.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.12.7.1.dist-info → ob_metaflow-2.12.9.1.dist-info}/top_level.txt +0 -0
|
@@ -3,20 +3,20 @@ import sys
|
|
|
3
3
|
import time
|
|
4
4
|
import traceback
|
|
5
5
|
|
|
6
|
+
import metaflow.tracing as tracing
|
|
6
7
|
from metaflow import JSONTypeClass, util
|
|
7
8
|
from metaflow._vendor import click
|
|
8
9
|
from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY, CommandException
|
|
9
10
|
from metaflow.metadata.util import sync_local_metadata_from_datastore
|
|
10
|
-
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
|
11
11
|
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR, KUBERNETES_LABELS
|
|
12
12
|
from metaflow.mflog import TASK_LOG_SOURCE
|
|
13
|
-
|
|
13
|
+
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
|
14
14
|
|
|
15
15
|
from .kubernetes import (
|
|
16
16
|
Kubernetes,
|
|
17
|
+
KubernetesException,
|
|
17
18
|
KubernetesKilledException,
|
|
18
19
|
parse_kube_keyvalue_list,
|
|
19
|
-
KubernetesException,
|
|
20
20
|
)
|
|
21
21
|
from .kubernetes_decorator import KubernetesDecorator
|
|
22
22
|
|
|
@@ -185,8 +185,8 @@ def step(
|
|
|
185
185
|
|
|
186
186
|
if num_parallel is not None and num_parallel <= 1:
|
|
187
187
|
raise KubernetesException(
|
|
188
|
-
"Using @parallel with `num_parallel` <= 1 is not supported with
|
|
189
|
-
"Please set the value of `num_parallel` to be greater than 1."
|
|
188
|
+
"Using @parallel with `num_parallel` <= 1 is not supported with "
|
|
189
|
+
"@kubernetes. Please set the value of `num_parallel` to be greater than 1."
|
|
190
190
|
)
|
|
191
191
|
|
|
192
192
|
# Set retry policy.
|
|
@@ -203,19 +203,37 @@ def step(
|
|
|
203
203
|
)
|
|
204
204
|
time.sleep(minutes_between_retries * 60)
|
|
205
205
|
|
|
206
|
+
# Explicitly Remove `ubf_context` from `kwargs` so that it's not passed as a commandline option
|
|
207
|
+
# If an underlying step command is executing a vanilla Kubernetes job, then it should never need
|
|
208
|
+
# to know about the UBF context.
|
|
209
|
+
# If it is a jobset which is executing a multi-node job, then the UBF context is set based on the
|
|
210
|
+
# `ubf_context` parameter passed to the jobset.
|
|
211
|
+
kwargs.pop("ubf_context", None)
|
|
212
|
+
# `task_id` is also need to be removed from `kwargs` as it needs to be dynamically
|
|
213
|
+
# set in the downstream code IF num_parallel is > 1
|
|
214
|
+
task_id = kwargs["task_id"]
|
|
215
|
+
if num_parallel:
|
|
216
|
+
kwargs.pop("task_id")
|
|
217
|
+
|
|
206
218
|
step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
|
|
207
219
|
entrypoint="%s -u %s" % (executable, os.path.basename(sys.argv[0])),
|
|
208
220
|
top_args=" ".join(util.dict_to_cli_options(ctx.parent.parent.params)),
|
|
209
221
|
step=step_name,
|
|
210
222
|
step_args=" ".join(util.dict_to_cli_options(kwargs)),
|
|
211
223
|
)
|
|
224
|
+
# Since it is a parallel step there are some parts of the step_cli that need to be modified
|
|
225
|
+
# based on the type of worker in the JobSet. This is why we will create a placeholder string
|
|
226
|
+
# in the template which will be replaced based on the type of worker.
|
|
227
|
+
|
|
228
|
+
if num_parallel:
|
|
229
|
+
step_cli = "%s {METAFLOW_PARALLEL_STEP_CLI_OPTIONS_TEMPLATE}" % step_cli
|
|
212
230
|
|
|
213
231
|
# Set log tailing.
|
|
214
232
|
ds = ctx.obj.flow_datastore.get_task_datastore(
|
|
215
233
|
mode="w",
|
|
216
234
|
run_id=kwargs["run_id"],
|
|
217
235
|
step_name=step_name,
|
|
218
|
-
task_id=kwargs["task_id"],
|
|
236
|
+
task_id=task_id,
|
|
219
237
|
attempt=int(retry_count),
|
|
220
238
|
)
|
|
221
239
|
stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
|
|
@@ -229,7 +247,7 @@ def step(
|
|
|
229
247
|
sync_local_metadata_from_datastore(
|
|
230
248
|
DATASTORE_LOCAL_DIR,
|
|
231
249
|
ctx.obj.flow_datastore.get_task_datastore(
|
|
232
|
-
kwargs["run_id"], step_name, kwargs["task_id"]
|
|
250
|
+
kwargs["run_id"], step_name, task_id
|
|
233
251
|
),
|
|
234
252
|
)
|
|
235
253
|
|
|
@@ -245,7 +263,7 @@ def step(
|
|
|
245
263
|
flow_name=ctx.obj.flow.name,
|
|
246
264
|
run_id=kwargs["run_id"],
|
|
247
265
|
step_name=step_name,
|
|
248
|
-
task_id=kwargs["task_id"],
|
|
266
|
+
task_id=task_id,
|
|
249
267
|
attempt=str(retry_count),
|
|
250
268
|
user=util.get_username(),
|
|
251
269
|
code_package_sha=code_package_sha,
|
|
@@ -12,28 +12,27 @@ from metaflow.metaflow_config import (
|
|
|
12
12
|
DATASTORE_LOCAL_DIR,
|
|
13
13
|
KUBERNETES_CONTAINER_IMAGE,
|
|
14
14
|
KUBERNETES_CONTAINER_REGISTRY,
|
|
15
|
+
KUBERNETES_CPU,
|
|
16
|
+
KUBERNETES_DISK,
|
|
15
17
|
KUBERNETES_FETCH_EC2_METADATA,
|
|
16
|
-
KUBERNETES_IMAGE_PULL_POLICY,
|
|
17
18
|
KUBERNETES_GPU_VENDOR,
|
|
19
|
+
KUBERNETES_IMAGE_PULL_POLICY,
|
|
20
|
+
KUBERNETES_MEMORY,
|
|
18
21
|
KUBERNETES_NAMESPACE,
|
|
19
22
|
KUBERNETES_NODE_SELECTOR,
|
|
20
23
|
KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
|
|
21
|
-
|
|
24
|
+
KUBERNETES_PORT,
|
|
22
25
|
KUBERNETES_SERVICE_ACCOUNT,
|
|
23
26
|
KUBERNETES_SHARED_MEMORY,
|
|
24
|
-
|
|
25
|
-
KUBERNETES_CPU,
|
|
26
|
-
KUBERNETES_MEMORY,
|
|
27
|
-
KUBERNETES_DISK,
|
|
27
|
+
KUBERNETES_TOLERATIONS,
|
|
28
28
|
)
|
|
29
29
|
from metaflow.plugins.resources_decorator import ResourcesDecorator
|
|
30
30
|
from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
|
|
31
31
|
from metaflow.sidecar import Sidecar
|
|
32
|
+
from metaflow.unbounded_foreach import UBF_CONTROL
|
|
32
33
|
|
|
33
34
|
from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
|
|
34
35
|
from .kubernetes import KubernetesException, parse_kube_keyvalue_list
|
|
35
|
-
from metaflow.unbounded_foreach import UBF_CONTROL
|
|
36
|
-
from .kubernetes_jobsets import TaskIdConstructor
|
|
37
36
|
|
|
38
37
|
from metaflow.metaflow_config import MAX_MEMORY_PER_TASK, MAX_CPU_PER_TASK
|
|
39
38
|
|
|
@@ -431,8 +430,8 @@ class KubernetesDecorator(StepDecorator):
|
|
|
431
430
|
# check for the existence of METAFLOW_KUBERNETES_WORKLOAD environment
|
|
432
431
|
# variable.
|
|
433
432
|
|
|
433
|
+
meta = {}
|
|
434
434
|
if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
|
|
435
|
-
meta = {}
|
|
436
435
|
meta["kubernetes-pod-name"] = os.environ["METAFLOW_KUBERNETES_POD_NAME"]
|
|
437
436
|
meta["kubernetes-pod-namespace"] = os.environ[
|
|
438
437
|
"METAFLOW_KUBERNETES_POD_NAMESPACE"
|
|
@@ -442,15 +441,15 @@ class KubernetesDecorator(StepDecorator):
|
|
|
442
441
|
"METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"
|
|
443
442
|
]
|
|
444
443
|
meta["kubernetes-node-ip"] = os.environ["METAFLOW_KUBERNETES_NODE_IP"]
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
444
|
+
|
|
445
|
+
meta["kubernetes-jobset-name"] = os.environ.get(
|
|
446
|
+
"METAFLOW_KUBERNETES_JOBSET_NAME"
|
|
447
|
+
)
|
|
449
448
|
|
|
450
449
|
# TODO (savin): Introduce equivalent support for Microsoft Azure and
|
|
451
450
|
# Google Cloud Platform
|
|
452
|
-
# TODO: Introduce a way to detect Cloud Provider, so unnecessary requests
|
|
453
|
-
# can be avoided by not having to try out all providers.
|
|
451
|
+
# TODO: Introduce a way to detect Cloud Provider, so unnecessary requests
|
|
452
|
+
# (and delays) can be avoided by not having to try out all providers.
|
|
454
453
|
if KUBERNETES_FETCH_EC2_METADATA:
|
|
455
454
|
instance_meta = get_ec2_instance_metadata()
|
|
456
455
|
meta.update(instance_meta)
|
|
@@ -466,14 +465,6 @@ class KubernetesDecorator(StepDecorator):
|
|
|
466
465
|
# "METAFLOW_KUBERNETES_POD_NAME"
|
|
467
466
|
# ].rpartition("-")[0]
|
|
468
467
|
|
|
469
|
-
entries = [
|
|
470
|
-
MetaDatum(field=k, value=v, type=k, tags=[])
|
|
471
|
-
for k, v in meta.items()
|
|
472
|
-
if v is not None
|
|
473
|
-
]
|
|
474
|
-
# Register book-keeping metadata for debugging.
|
|
475
|
-
metadata.register_metadata(run_id, step_name, task_id, entries)
|
|
476
|
-
|
|
477
468
|
# Start MFLog sidecar to collect task logs.
|
|
478
469
|
self._save_logs_sidecar = Sidecar("save_logs_periodically")
|
|
479
470
|
self._save_logs_sidecar.start()
|
|
@@ -482,19 +473,34 @@ class KubernetesDecorator(StepDecorator):
|
|
|
482
473
|
if hasattr(flow, "_parallel_ubf_iter"):
|
|
483
474
|
num_parallel = flow._parallel_ubf_iter.num_parallel
|
|
484
475
|
|
|
485
|
-
if num_parallel and num_parallel >= 1 and ubf_context == UBF_CONTROL:
|
|
486
|
-
control_task_id, worker_task_ids = TaskIdConstructor.join_step_task_ids(
|
|
487
|
-
num_parallel
|
|
488
|
-
)
|
|
489
|
-
mapper_task_ids = [control_task_id] + worker_task_ids
|
|
490
|
-
flow._control_mapper_tasks = [
|
|
491
|
-
"%s/%s/%s" % (run_id, step_name, mapper_task_id)
|
|
492
|
-
for mapper_task_id in mapper_task_ids
|
|
493
|
-
]
|
|
494
|
-
flow._control_task_is_mapper_zero = True
|
|
495
|
-
|
|
496
476
|
if num_parallel and num_parallel > 1:
|
|
497
477
|
_setup_multinode_environment()
|
|
478
|
+
# current.parallel.node_index will be correctly available over here.
|
|
479
|
+
meta.update({"parallel-node-index": current.parallel.node_index})
|
|
480
|
+
if ubf_context == UBF_CONTROL:
|
|
481
|
+
flow._control_mapper_tasks = [
|
|
482
|
+
"{}/{}/{}".format(run_id, step_name, task_id)
|
|
483
|
+
for task_id in [task_id]
|
|
484
|
+
+ [
|
|
485
|
+
"%s-worker-%d" % (task_id, idx)
|
|
486
|
+
for idx in range(num_parallel - 1)
|
|
487
|
+
]
|
|
488
|
+
]
|
|
489
|
+
flow._control_task_is_mapper_zero = True
|
|
490
|
+
|
|
491
|
+
if len(meta) > 0:
|
|
492
|
+
entries = [
|
|
493
|
+
MetaDatum(
|
|
494
|
+
field=k,
|
|
495
|
+
value=v,
|
|
496
|
+
type=k,
|
|
497
|
+
tags=["attempt_id:{0}".format(retry_count)],
|
|
498
|
+
)
|
|
499
|
+
for k, v in meta.items()
|
|
500
|
+
if v is not None
|
|
501
|
+
]
|
|
502
|
+
# Register book-keeping metadata for debugging.
|
|
503
|
+
metadata.register_metadata(run_id, step_name, task_id, entries)
|
|
498
504
|
|
|
499
505
|
def task_finished(
|
|
500
506
|
self, step_name, flow, graph, is_task_ok, retry_count, max_retries
|
|
@@ -531,18 +537,24 @@ class KubernetesDecorator(StepDecorator):
|
|
|
531
537
|
)[0]
|
|
532
538
|
|
|
533
539
|
|
|
540
|
+
# TODO: Unify this method with the multi-node setup in @batch
|
|
534
541
|
def _setup_multinode_environment():
|
|
542
|
+
# FIXME: what about MF_MASTER_PORT
|
|
535
543
|
import socket
|
|
536
544
|
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
540
|
-
os.environ["MF_PARALLEL_NODE_INDEX"] = str(0)
|
|
541
|
-
elif os.environ.get("WORKER_REPLICA_INDEX") is not None:
|
|
542
|
-
os.environ["MF_PARALLEL_NODE_INDEX"] = str(
|
|
543
|
-
int(os.environ["WORKER_REPLICA_INDEX"]) + 1
|
|
545
|
+
try:
|
|
546
|
+
os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(
|
|
547
|
+
os.environ["MF_MASTER_ADDR"]
|
|
544
548
|
)
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
549
|
+
os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["MF_WORLD_SIZE"]
|
|
550
|
+
os.environ["MF_PARALLEL_NODE_INDEX"] = (
|
|
551
|
+
str(0)
|
|
552
|
+
if "MF_CONTROL_INDEX" in os.environ
|
|
553
|
+
else str(int(os.environ["MF_WORKER_REPLICA_INDEX"]) + 1)
|
|
548
554
|
)
|
|
555
|
+
except KeyError as e:
|
|
556
|
+
raise MetaflowException("Environment variable {} is missing.".format(e))
|
|
557
|
+
except socket.gaierror:
|
|
558
|
+
raise MetaflowException("Failed to get host by name for MF_MASTER_ADDR.")
|
|
559
|
+
except ValueError:
|
|
560
|
+
raise MetaflowException("Invalid value for MF_WORKER_REPLICA_INDEX.")
|
|
@@ -1,18 +1,19 @@
|
|
|
1
|
+
import copy
|
|
1
2
|
import json
|
|
2
3
|
import math
|
|
3
4
|
import random
|
|
4
|
-
import time
|
|
5
|
-
import copy
|
|
6
5
|
import sys
|
|
7
|
-
|
|
6
|
+
import time
|
|
7
|
+
|
|
8
8
|
from metaflow.exception import MetaflowException
|
|
9
9
|
from metaflow.metaflow_config import KUBERNETES_SECRETS
|
|
10
|
+
from metaflow.tracing import inject_tracing_vars
|
|
10
11
|
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
|
11
12
|
|
|
12
13
|
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
|
13
14
|
from .kubernetes_jobsets import (
|
|
14
|
-
KubernetesJobSet,
|
|
15
|
-
)
|
|
15
|
+
KubernetesJobSet,
|
|
16
|
+
) # We need this import for Kubernetes Client.
|
|
16
17
|
|
|
17
18
|
|
|
18
19
|
class KubernetesJobException(MetaflowException):
|
|
@@ -366,7 +367,6 @@ class KubernetesJob(object):
|
|
|
366
367
|
|
|
367
368
|
|
|
368
369
|
class RunningJob(object):
|
|
369
|
-
|
|
370
370
|
# State Machine implementation for the lifecycle behavior documented in
|
|
371
371
|
# https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/
|
|
372
372
|
#
|