metaflow 2.12.7__py2.py3-none-any.whl → 2.12.9__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow/__init__.py +2 -0
- metaflow/cli.py +12 -4
- metaflow/extension_support/plugins.py +1 -0
- metaflow/flowspec.py +8 -1
- metaflow/lint.py +13 -0
- metaflow/metaflow_current.py +0 -8
- metaflow/plugins/__init__.py +12 -0
- metaflow/plugins/argo/argo_workflows.py +462 -42
- metaflow/plugins/argo/argo_workflows_cli.py +60 -3
- metaflow/plugins/argo/argo_workflows_decorator.py +38 -7
- metaflow/plugins/argo/argo_workflows_deployer.py +290 -0
- metaflow/plugins/argo/jobset_input_paths.py +16 -0
- metaflow/plugins/aws/batch/batch_decorator.py +16 -13
- metaflow/plugins/aws/step_functions/step_functions_cli.py +45 -3
- metaflow/plugins/aws/step_functions/step_functions_deployer.py +251 -0
- metaflow/plugins/cards/card_cli.py +1 -1
- metaflow/plugins/kubernetes/kubernetes.py +279 -52
- metaflow/plugins/kubernetes/kubernetes_cli.py +26 -8
- metaflow/plugins/kubernetes/kubernetes_client.py +0 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +56 -44
- metaflow/plugins/kubernetes/kubernetes_job.py +6 -6
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +510 -272
- metaflow/plugins/parallel_decorator.py +108 -8
- metaflow/plugins/pypi/bootstrap.py +1 -1
- metaflow/plugins/pypi/micromamba.py +1 -1
- metaflow/plugins/secrets/secrets_decorator.py +12 -3
- metaflow/plugins/test_unbounded_foreach_decorator.py +39 -4
- metaflow/runner/deployer.py +386 -0
- metaflow/runner/metaflow_runner.py +1 -20
- metaflow/runner/nbdeploy.py +130 -0
- metaflow/runner/nbrun.py +4 -28
- metaflow/runner/utils.py +49 -0
- metaflow/runtime.py +246 -134
- metaflow/version.py +1 -1
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/METADATA +2 -2
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/RECORD +40 -34
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/WHEEL +1 -1
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/LICENSE +0 -0
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/entry_points.txt +0 -0
- {metaflow-2.12.7.dist-info → metaflow-2.12.9.dist-info}/top_level.txt +0 -0
@@ -3,20 +3,20 @@ import sys
|
|
3
3
|
import time
|
4
4
|
import traceback
|
5
5
|
|
6
|
+
import metaflow.tracing as tracing
|
6
7
|
from metaflow import JSONTypeClass, util
|
7
8
|
from metaflow._vendor import click
|
8
9
|
from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY, CommandException
|
9
10
|
from metaflow.metadata.util import sync_local_metadata_from_datastore
|
10
|
-
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
11
11
|
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR, KUBERNETES_LABELS
|
12
12
|
from metaflow.mflog import TASK_LOG_SOURCE
|
13
|
-
|
13
|
+
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
14
14
|
|
15
15
|
from .kubernetes import (
|
16
16
|
Kubernetes,
|
17
|
+
KubernetesException,
|
17
18
|
KubernetesKilledException,
|
18
19
|
parse_kube_keyvalue_list,
|
19
|
-
KubernetesException,
|
20
20
|
)
|
21
21
|
from .kubernetes_decorator import KubernetesDecorator
|
22
22
|
|
@@ -185,8 +185,8 @@ def step(
|
|
185
185
|
|
186
186
|
if num_parallel is not None and num_parallel <= 1:
|
187
187
|
raise KubernetesException(
|
188
|
-
"Using @parallel with `num_parallel` <= 1 is not supported with Kubernetes. "
|
189
|
-
"Please set the value of `num_parallel` to be greater than 1."
|
188
|
+
"Using @parallel with `num_parallel` <= 1 is not supported with "
|
189
|
+
"@kubernetes. Please set the value of `num_parallel` to be greater than 1."
|
190
190
|
)
|
191
191
|
|
192
192
|
# Set retry policy.
|
@@ -203,19 +203,37 @@ def step(
|
|
203
203
|
)
|
204
204
|
time.sleep(minutes_between_retries * 60)
|
205
205
|
|
206
|
+
# Explicitly Remove `ubf_context` from `kwargs` so that it's not passed as a commandline option
|
207
|
+
# If an underlying step command is executing a vanilla Kubernetes job, then it should never need
|
208
|
+
# to know about the UBF context.
|
209
|
+
# If it is a jobset which is executing a multi-node job, then the UBF context is set based on the
|
210
|
+
# `ubf_context` parameter passed to the jobset.
|
211
|
+
kwargs.pop("ubf_context", None)
|
212
|
+
# `task_id` also needs to be removed from `kwargs` as it needs to be dynamically
|
213
|
+
# set in the downstream code IF num_parallel is > 1
|
214
|
+
task_id = kwargs["task_id"]
|
215
|
+
if num_parallel:
|
216
|
+
kwargs.pop("task_id")
|
217
|
+
|
206
218
|
step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
|
207
219
|
entrypoint="%s -u %s" % (executable, os.path.basename(sys.argv[0])),
|
208
220
|
top_args=" ".join(util.dict_to_cli_options(ctx.parent.parent.params)),
|
209
221
|
step=step_name,
|
210
222
|
step_args=" ".join(util.dict_to_cli_options(kwargs)),
|
211
223
|
)
|
224
|
+
# Since it is a parallel step there are some parts of the step_cli that need to be modified
|
225
|
+
# based on the type of worker in the JobSet. This is why we will create a placeholder string
|
226
|
+
# in the template which will be replaced based on the type of worker.
|
227
|
+
|
228
|
+
if num_parallel:
|
229
|
+
step_cli = "%s {METAFLOW_PARALLEL_STEP_CLI_OPTIONS_TEMPLATE}" % step_cli
|
212
230
|
|
213
231
|
# Set log tailing.
|
214
232
|
ds = ctx.obj.flow_datastore.get_task_datastore(
|
215
233
|
mode="w",
|
216
234
|
run_id=kwargs["run_id"],
|
217
235
|
step_name=step_name,
|
218
|
-
task_id=kwargs["task_id"],
|
236
|
+
task_id=task_id,
|
219
237
|
attempt=int(retry_count),
|
220
238
|
)
|
221
239
|
stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
|
@@ -229,7 +247,7 @@ def step(
|
|
229
247
|
sync_local_metadata_from_datastore(
|
230
248
|
DATASTORE_LOCAL_DIR,
|
231
249
|
ctx.obj.flow_datastore.get_task_datastore(
|
232
|
-
kwargs["run_id"], step_name, kwargs["task_id"]
|
250
|
+
kwargs["run_id"], step_name, task_id
|
233
251
|
),
|
234
252
|
)
|
235
253
|
|
@@ -245,7 +263,7 @@ def step(
|
|
245
263
|
flow_name=ctx.obj.flow.name,
|
246
264
|
run_id=kwargs["run_id"],
|
247
265
|
step_name=step_name,
|
248
|
-
task_id=kwargs["task_id"],
|
266
|
+
task_id=task_id,
|
249
267
|
attempt=str(retry_count),
|
250
268
|
user=util.get_username(),
|
251
269
|
code_package_sha=code_package_sha,
|
@@ -12,28 +12,27 @@ from metaflow.metaflow_config import (
|
|
12
12
|
DATASTORE_LOCAL_DIR,
|
13
13
|
KUBERNETES_CONTAINER_IMAGE,
|
14
14
|
KUBERNETES_CONTAINER_REGISTRY,
|
15
|
+
KUBERNETES_CPU,
|
16
|
+
KUBERNETES_DISK,
|
15
17
|
KUBERNETES_FETCH_EC2_METADATA,
|
16
|
-
KUBERNETES_IMAGE_PULL_POLICY,
|
17
18
|
KUBERNETES_GPU_VENDOR,
|
19
|
+
KUBERNETES_IMAGE_PULL_POLICY,
|
20
|
+
KUBERNETES_MEMORY,
|
18
21
|
KUBERNETES_NAMESPACE,
|
19
22
|
KUBERNETES_NODE_SELECTOR,
|
20
23
|
KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
|
21
|
-
|
24
|
+
KUBERNETES_PORT,
|
22
25
|
KUBERNETES_SERVICE_ACCOUNT,
|
23
26
|
KUBERNETES_SHARED_MEMORY,
|
24
|
-
|
25
|
-
KUBERNETES_CPU,
|
26
|
-
KUBERNETES_MEMORY,
|
27
|
-
KUBERNETES_DISK,
|
27
|
+
KUBERNETES_TOLERATIONS,
|
28
28
|
)
|
29
29
|
from metaflow.plugins.resources_decorator import ResourcesDecorator
|
30
30
|
from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
|
31
31
|
from metaflow.sidecar import Sidecar
|
32
|
+
from metaflow.unbounded_foreach import UBF_CONTROL
|
32
33
|
|
33
34
|
from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
|
34
35
|
from .kubernetes import KubernetesException, parse_kube_keyvalue_list
|
35
|
-
from metaflow.unbounded_foreach import UBF_CONTROL
|
36
|
-
from .kubernetes_jobsets import TaskIdConstructor
|
37
36
|
|
38
37
|
try:
|
39
38
|
unicode
|
@@ -416,8 +415,8 @@ class KubernetesDecorator(StepDecorator):
|
|
416
415
|
# check for the existence of METAFLOW_KUBERNETES_WORKLOAD environment
|
417
416
|
# variable.
|
418
417
|
|
418
|
+
meta = {}
|
419
419
|
if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
|
420
|
-
meta = {}
|
421
420
|
meta["kubernetes-pod-name"] = os.environ["METAFLOW_KUBERNETES_POD_NAME"]
|
422
421
|
meta["kubernetes-pod-namespace"] = os.environ[
|
423
422
|
"METAFLOW_KUBERNETES_POD_NAMESPACE"
|
@@ -427,15 +426,15 @@ class KubernetesDecorator(StepDecorator):
|
|
427
426
|
"METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"
|
428
427
|
]
|
429
428
|
meta["kubernetes-node-ip"] = os.environ["METAFLOW_KUBERNETES_NODE_IP"]
|
430
|
-
|
431
|
-
|
432
|
-
|
433
|
-
|
429
|
+
|
430
|
+
meta["kubernetes-jobset-name"] = os.environ.get(
|
431
|
+
"METAFLOW_KUBERNETES_JOBSET_NAME"
|
432
|
+
)
|
434
433
|
|
435
434
|
# TODO (savin): Introduce equivalent support for Microsoft Azure and
|
436
435
|
# Google Cloud Platform
|
437
|
-
# TODO: Introduce a way to detect Cloud Provider, so unnecessary requests
|
438
|
-
# can be avoided by not having to try out all providers.
|
436
|
+
# TODO: Introduce a way to detect Cloud Provider, so unnecessary requests
|
437
|
+
# (and delays) can be avoided by not having to try out all providers.
|
439
438
|
if KUBERNETES_FETCH_EC2_METADATA:
|
440
439
|
instance_meta = get_ec2_instance_metadata()
|
441
440
|
meta.update(instance_meta)
|
@@ -451,14 +450,6 @@ class KubernetesDecorator(StepDecorator):
|
|
451
450
|
# "METAFLOW_KUBERNETES_POD_NAME"
|
452
451
|
# ].rpartition("-")[0]
|
453
452
|
|
454
|
-
entries = [
|
455
|
-
MetaDatum(field=k, value=v, type=k, tags=[])
|
456
|
-
for k, v in meta.items()
|
457
|
-
if v is not None
|
458
|
-
]
|
459
|
-
# Register book-keeping metadata for debugging.
|
460
|
-
metadata.register_metadata(run_id, step_name, task_id, entries)
|
461
|
-
|
462
453
|
# Start MFLog sidecar to collect task logs.
|
463
454
|
self._save_logs_sidecar = Sidecar("save_logs_periodically")
|
464
455
|
self._save_logs_sidecar.start()
|
@@ -467,19 +458,34 @@ class KubernetesDecorator(StepDecorator):
|
|
467
458
|
if hasattr(flow, "_parallel_ubf_iter"):
|
468
459
|
num_parallel = flow._parallel_ubf_iter.num_parallel
|
469
460
|
|
470
|
-
if num_parallel and num_parallel >= 1 and ubf_context == UBF_CONTROL:
|
471
|
-
control_task_id, worker_task_ids = TaskIdConstructor.join_step_task_ids(
|
472
|
-
num_parallel
|
473
|
-
)
|
474
|
-
mapper_task_ids = [control_task_id] + worker_task_ids
|
475
|
-
flow._control_mapper_tasks = [
|
476
|
-
"%s/%s/%s" % (run_id, step_name, mapper_task_id)
|
477
|
-
for mapper_task_id in mapper_task_ids
|
478
|
-
]
|
479
|
-
flow._control_task_is_mapper_zero = True
|
480
|
-
|
481
461
|
if num_parallel and num_parallel > 1:
|
482
462
|
_setup_multinode_environment()
|
463
|
+
# current.parallel.node_index will be correctly available over here.
|
464
|
+
meta.update({"parallel-node-index": current.parallel.node_index})
|
465
|
+
if ubf_context == UBF_CONTROL:
|
466
|
+
flow._control_mapper_tasks = [
|
467
|
+
"{}/{}/{}".format(run_id, step_name, task_id)
|
468
|
+
for task_id in [task_id]
|
469
|
+
+ [
|
470
|
+
"%s-worker-%d" % (task_id, idx)
|
471
|
+
for idx in range(num_parallel - 1)
|
472
|
+
]
|
473
|
+
]
|
474
|
+
flow._control_task_is_mapper_zero = True
|
475
|
+
|
476
|
+
if len(meta) > 0:
|
477
|
+
entries = [
|
478
|
+
MetaDatum(
|
479
|
+
field=k,
|
480
|
+
value=v,
|
481
|
+
type=k,
|
482
|
+
tags=["attempt_id:{0}".format(retry_count)],
|
483
|
+
)
|
484
|
+
for k, v in meta.items()
|
485
|
+
if v is not None
|
486
|
+
]
|
487
|
+
# Register book-keeping metadata for debugging.
|
488
|
+
metadata.register_metadata(run_id, step_name, task_id, entries)
|
483
489
|
|
484
490
|
def task_finished(
|
485
491
|
self, step_name, flow, graph, is_task_ok, retry_count, max_retries
|
@@ -516,18 +522,24 @@ class KubernetesDecorator(StepDecorator):
|
|
516
522
|
)[0]
|
517
523
|
|
518
524
|
|
525
|
+
# TODO: Unify this method with the multi-node setup in @batch
|
519
526
|
def _setup_multinode_environment():
|
527
|
+
# FIXME: what about MF_MASTER_PORT
|
520
528
|
import socket
|
521
529
|
|
522
|
-
|
523
|
-
|
524
|
-
|
525
|
-
os.environ["MF_PARALLEL_NODE_INDEX"] = str(0)
|
526
|
-
elif os.environ.get("WORKER_REPLICA_INDEX") is not None:
|
527
|
-
os.environ["MF_PARALLEL_NODE_INDEX"] = str(
|
528
|
-
int(os.environ["WORKER_REPLICA_INDEX"]) + 1
|
530
|
+
try:
|
531
|
+
os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(
|
532
|
+
os.environ["MF_MASTER_ADDR"]
|
529
533
|
)
|
530
|
-
|
531
|
-
|
532
|
-
|
534
|
+
os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["MF_WORLD_SIZE"]
|
535
|
+
os.environ["MF_PARALLEL_NODE_INDEX"] = (
|
536
|
+
str(0)
|
537
|
+
if "MF_CONTROL_INDEX" in os.environ
|
538
|
+
else str(int(os.environ["MF_WORKER_REPLICA_INDEX"]) + 1)
|
533
539
|
)
|
540
|
+
except KeyError as e:
|
541
|
+
raise MetaflowException("Environment variable {} is missing.".format(e))
|
542
|
+
except socket.gaierror:
|
543
|
+
raise MetaflowException("Failed to get host by name for MF_MASTER_ADDR.")
|
544
|
+
except ValueError:
|
545
|
+
raise MetaflowException("Invalid value for MF_WORKER_REPLICA_INDEX.")
|
@@ -1,18 +1,19 @@
|
|
1
|
+
import copy
|
1
2
|
import json
|
2
3
|
import math
|
3
4
|
import random
|
4
|
-
import time
|
5
|
-
import copy
|
6
5
|
import sys
|
7
|
-
|
6
|
+
import time
|
7
|
+
|
8
8
|
from metaflow.exception import MetaflowException
|
9
9
|
from metaflow.metaflow_config import KUBERNETES_SECRETS
|
10
|
+
from metaflow.tracing import inject_tracing_vars
|
10
11
|
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
11
12
|
|
12
13
|
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
13
14
|
from .kubernetes_jobsets import (
|
14
|
-
KubernetesJobSet,
|
15
|
-
)
|
15
|
+
KubernetesJobSet,
|
16
|
+
) # We need this import for Kubernetes Client.
|
16
17
|
|
17
18
|
|
18
19
|
class KubernetesJobException(MetaflowException):
|
@@ -366,7 +367,6 @@ class KubernetesJob(object):
|
|
366
367
|
|
367
368
|
|
368
369
|
class RunningJob(object):
|
369
|
-
|
370
370
|
# State Machine implementation for the lifecycle behavior documented in
|
371
371
|
# https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/
|
372
372
|
#
|