metaflow 2.12.8__py2.py3-none-any.whl → 2.12.9__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. metaflow/__init__.py +2 -0
  2. metaflow/cli.py +12 -4
  3. metaflow/extension_support/plugins.py +1 -0
  4. metaflow/flowspec.py +8 -1
  5. metaflow/lint.py +13 -0
  6. metaflow/metaflow_current.py +0 -8
  7. metaflow/plugins/__init__.py +12 -0
  8. metaflow/plugins/argo/argo_workflows.py +462 -42
  9. metaflow/plugins/argo/argo_workflows_cli.py +60 -3
  10. metaflow/plugins/argo/argo_workflows_decorator.py +38 -7
  11. metaflow/plugins/argo/argo_workflows_deployer.py +290 -0
  12. metaflow/plugins/argo/jobset_input_paths.py +16 -0
  13. metaflow/plugins/aws/batch/batch_decorator.py +16 -13
  14. metaflow/plugins/aws/step_functions/step_functions_cli.py +45 -3
  15. metaflow/plugins/aws/step_functions/step_functions_deployer.py +251 -0
  16. metaflow/plugins/cards/card_cli.py +1 -1
  17. metaflow/plugins/kubernetes/kubernetes.py +279 -52
  18. metaflow/plugins/kubernetes/kubernetes_cli.py +26 -8
  19. metaflow/plugins/kubernetes/kubernetes_client.py +0 -1
  20. metaflow/plugins/kubernetes/kubernetes_decorator.py +56 -44
  21. metaflow/plugins/kubernetes/kubernetes_job.py +6 -6
  22. metaflow/plugins/kubernetes/kubernetes_jobsets.py +510 -272
  23. metaflow/plugins/parallel_decorator.py +108 -8
  24. metaflow/plugins/secrets/secrets_decorator.py +12 -3
  25. metaflow/plugins/test_unbounded_foreach_decorator.py +39 -4
  26. metaflow/runner/deployer.py +386 -0
  27. metaflow/runner/metaflow_runner.py +1 -20
  28. metaflow/runner/nbdeploy.py +130 -0
  29. metaflow/runner/nbrun.py +4 -28
  30. metaflow/runner/utils.py +49 -0
  31. metaflow/runtime.py +246 -134
  32. metaflow/version.py +1 -1
  33. {metaflow-2.12.8.dist-info → metaflow-2.12.9.dist-info}/METADATA +2 -2
  34. {metaflow-2.12.8.dist-info → metaflow-2.12.9.dist-info}/RECORD +38 -32
  35. {metaflow-2.12.8.dist-info → metaflow-2.12.9.dist-info}/WHEEL +1 -1
  36. {metaflow-2.12.8.dist-info → metaflow-2.12.9.dist-info}/LICENSE +0 -0
  37. {metaflow-2.12.8.dist-info → metaflow-2.12.9.dist-info}/entry_points.txt +0 -0
  38. {metaflow-2.12.8.dist-info → metaflow-2.12.9.dist-info}/top_level.txt +0 -0
@@ -3,20 +3,20 @@ import sys
3
3
  import time
4
4
  import traceback
5
5
 
6
+ import metaflow.tracing as tracing
6
7
  from metaflow import JSONTypeClass, util
7
8
  from metaflow._vendor import click
8
9
  from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY, CommandException
9
10
  from metaflow.metadata.util import sync_local_metadata_from_datastore
10
- from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
11
11
  from metaflow.metaflow_config import DATASTORE_LOCAL_DIR, KUBERNETES_LABELS
12
12
  from metaflow.mflog import TASK_LOG_SOURCE
13
- import metaflow.tracing as tracing
13
+ from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
14
14
 
15
15
  from .kubernetes import (
16
16
  Kubernetes,
17
+ KubernetesException,
17
18
  KubernetesKilledException,
18
19
  parse_kube_keyvalue_list,
19
- KubernetesException,
20
20
  )
21
21
  from .kubernetes_decorator import KubernetesDecorator
22
22
 
@@ -185,8 +185,8 @@ def step(
185
185
 
186
186
  if num_parallel is not None and num_parallel <= 1:
187
187
  raise KubernetesException(
188
- "Using @parallel with `num_parallel` <= 1 is not supported with Kubernetes. "
189
- "Please set the value of `num_parallel` to be greater than 1."
188
+ "Using @parallel with `num_parallel` <= 1 is not supported with "
189
+ "@kubernetes. Please set the value of `num_parallel` to be greater than 1."
190
190
  )
191
191
 
192
192
  # Set retry policy.
@@ -203,19 +203,37 @@ def step(
203
203
  )
204
204
  time.sleep(minutes_between_retries * 60)
205
205
 
206
+ # Explicitly Remove `ubf_context` from `kwargs` so that it's not passed as a commandline option
207
+ # If an underlying step command is executing a vanilla Kubernetes job, then it should never need
208
+ # to know about the UBF context.
209
+ # If it is a jobset which is executing a multi-node job, then the UBF context is set based on the
210
+ # `ubf_context` parameter passed to the jobset.
211
+ kwargs.pop("ubf_context", None)
212
+ # `task_id` is also need to be removed from `kwargs` as it needs to be dynamically
213
+ # set in the downstream code IF num_parallel is > 1
214
+ task_id = kwargs["task_id"]
215
+ if num_parallel:
216
+ kwargs.pop("task_id")
217
+
206
218
  step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
207
219
  entrypoint="%s -u %s" % (executable, os.path.basename(sys.argv[0])),
208
220
  top_args=" ".join(util.dict_to_cli_options(ctx.parent.parent.params)),
209
221
  step=step_name,
210
222
  step_args=" ".join(util.dict_to_cli_options(kwargs)),
211
223
  )
224
+ # Since it is a parallel step there are some parts of the step_cli that need to be modified
225
+ # based on the type of worker in the JobSet. This is why we will create a placeholder string
226
+ # in the template which will be replaced based on the type of worker.
227
+
228
+ if num_parallel:
229
+ step_cli = "%s {METAFLOW_PARALLEL_STEP_CLI_OPTIONS_TEMPLATE}" % step_cli
212
230
 
213
231
  # Set log tailing.
214
232
  ds = ctx.obj.flow_datastore.get_task_datastore(
215
233
  mode="w",
216
234
  run_id=kwargs["run_id"],
217
235
  step_name=step_name,
218
- task_id=kwargs["task_id"],
236
+ task_id=task_id,
219
237
  attempt=int(retry_count),
220
238
  )
221
239
  stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
@@ -229,7 +247,7 @@ def step(
229
247
  sync_local_metadata_from_datastore(
230
248
  DATASTORE_LOCAL_DIR,
231
249
  ctx.obj.flow_datastore.get_task_datastore(
232
- kwargs["run_id"], step_name, kwargs["task_id"]
250
+ kwargs["run_id"], step_name, task_id
233
251
  ),
234
252
  )
235
253
 
@@ -245,7 +263,7 @@ def step(
245
263
  flow_name=ctx.obj.flow.name,
246
264
  run_id=kwargs["run_id"],
247
265
  step_name=step_name,
248
- task_id=kwargs["task_id"],
266
+ task_id=task_id,
249
267
  attempt=str(retry_count),
250
268
  user=util.get_username(),
251
269
  code_package_sha=code_package_sha,
@@ -6,7 +6,6 @@ from metaflow.exception import MetaflowException
6
6
 
7
7
  from .kubernetes_job import KubernetesJob, KubernetesJobSet
8
8
 
9
-
10
9
  CLIENT_REFRESH_INTERVAL_SECONDS = 300
11
10
 
12
11
 
@@ -12,28 +12,27 @@ from metaflow.metaflow_config import (
12
12
  DATASTORE_LOCAL_DIR,
13
13
  KUBERNETES_CONTAINER_IMAGE,
14
14
  KUBERNETES_CONTAINER_REGISTRY,
15
+ KUBERNETES_CPU,
16
+ KUBERNETES_DISK,
15
17
  KUBERNETES_FETCH_EC2_METADATA,
16
- KUBERNETES_IMAGE_PULL_POLICY,
17
18
  KUBERNETES_GPU_VENDOR,
19
+ KUBERNETES_IMAGE_PULL_POLICY,
20
+ KUBERNETES_MEMORY,
18
21
  KUBERNETES_NAMESPACE,
19
22
  KUBERNETES_NODE_SELECTOR,
20
23
  KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
21
- KUBERNETES_TOLERATIONS,
24
+ KUBERNETES_PORT,
22
25
  KUBERNETES_SERVICE_ACCOUNT,
23
26
  KUBERNETES_SHARED_MEMORY,
24
- KUBERNETES_PORT,
25
- KUBERNETES_CPU,
26
- KUBERNETES_MEMORY,
27
- KUBERNETES_DISK,
27
+ KUBERNETES_TOLERATIONS,
28
28
  )
29
29
  from metaflow.plugins.resources_decorator import ResourcesDecorator
30
30
  from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
31
31
  from metaflow.sidecar import Sidecar
32
+ from metaflow.unbounded_foreach import UBF_CONTROL
32
33
 
33
34
  from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
34
35
  from .kubernetes import KubernetesException, parse_kube_keyvalue_list
35
- from metaflow.unbounded_foreach import UBF_CONTROL
36
- from .kubernetes_jobsets import TaskIdConstructor
37
36
 
38
37
  try:
39
38
  unicode
@@ -416,8 +415,8 @@ class KubernetesDecorator(StepDecorator):
416
415
  # check for the existence of METAFLOW_KUBERNETES_WORKLOAD environment
417
416
  # variable.
418
417
 
418
+ meta = {}
419
419
  if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
420
- meta = {}
421
420
  meta["kubernetes-pod-name"] = os.environ["METAFLOW_KUBERNETES_POD_NAME"]
422
421
  meta["kubernetes-pod-namespace"] = os.environ[
423
422
  "METAFLOW_KUBERNETES_POD_NAMESPACE"
@@ -427,15 +426,15 @@ class KubernetesDecorator(StepDecorator):
427
426
  "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"
428
427
  ]
429
428
  meta["kubernetes-node-ip"] = os.environ["METAFLOW_KUBERNETES_NODE_IP"]
430
- if os.environ.get("METAFLOW_KUBERNETES_JOBSET_NAME"):
431
- meta["kubernetes-jobset-name"] = os.environ[
432
- "METAFLOW_KUBERNETES_JOBSET_NAME"
433
- ]
429
+
430
+ meta["kubernetes-jobset-name"] = os.environ.get(
431
+ "METAFLOW_KUBERNETES_JOBSET_NAME"
432
+ )
434
433
 
435
434
  # TODO (savin): Introduce equivalent support for Microsoft Azure and
436
435
  # Google Cloud Platform
437
- # TODO: Introduce a way to detect Cloud Provider, so unnecessary requests (and delays)
438
- # can be avoided by not having to try out all providers.
436
+ # TODO: Introduce a way to detect Cloud Provider, so unnecessary requests
437
+ # (and delays) can be avoided by not having to try out all providers.
439
438
  if KUBERNETES_FETCH_EC2_METADATA:
440
439
  instance_meta = get_ec2_instance_metadata()
441
440
  meta.update(instance_meta)
@@ -451,14 +450,6 @@ class KubernetesDecorator(StepDecorator):
451
450
  # "METAFLOW_KUBERNETES_POD_NAME"
452
451
  # ].rpartition("-")[0]
453
452
 
454
- entries = [
455
- MetaDatum(field=k, value=v, type=k, tags=[])
456
- for k, v in meta.items()
457
- if v is not None
458
- ]
459
- # Register book-keeping metadata for debugging.
460
- metadata.register_metadata(run_id, step_name, task_id, entries)
461
-
462
453
  # Start MFLog sidecar to collect task logs.
463
454
  self._save_logs_sidecar = Sidecar("save_logs_periodically")
464
455
  self._save_logs_sidecar.start()
@@ -467,19 +458,34 @@ class KubernetesDecorator(StepDecorator):
467
458
  if hasattr(flow, "_parallel_ubf_iter"):
468
459
  num_parallel = flow._parallel_ubf_iter.num_parallel
469
460
 
470
- if num_parallel and num_parallel >= 1 and ubf_context == UBF_CONTROL:
471
- control_task_id, worker_task_ids = TaskIdConstructor.join_step_task_ids(
472
- num_parallel
473
- )
474
- mapper_task_ids = [control_task_id] + worker_task_ids
475
- flow._control_mapper_tasks = [
476
- "%s/%s/%s" % (run_id, step_name, mapper_task_id)
477
- for mapper_task_id in mapper_task_ids
478
- ]
479
- flow._control_task_is_mapper_zero = True
480
-
481
461
  if num_parallel and num_parallel > 1:
482
462
  _setup_multinode_environment()
463
+ # current.parallel.node_index will be correctly available over here.
464
+ meta.update({"parallel-node-index": current.parallel.node_index})
465
+ if ubf_context == UBF_CONTROL:
466
+ flow._control_mapper_tasks = [
467
+ "{}/{}/{}".format(run_id, step_name, task_id)
468
+ for task_id in [task_id]
469
+ + [
470
+ "%s-worker-%d" % (task_id, idx)
471
+ for idx in range(num_parallel - 1)
472
+ ]
473
+ ]
474
+ flow._control_task_is_mapper_zero = True
475
+
476
+ if len(meta) > 0:
477
+ entries = [
478
+ MetaDatum(
479
+ field=k,
480
+ value=v,
481
+ type=k,
482
+ tags=["attempt_id:{0}".format(retry_count)],
483
+ )
484
+ for k, v in meta.items()
485
+ if v is not None
486
+ ]
487
+ # Register book-keeping metadata for debugging.
488
+ metadata.register_metadata(run_id, step_name, task_id, entries)
483
489
 
484
490
  def task_finished(
485
491
  self, step_name, flow, graph, is_task_ok, retry_count, max_retries
@@ -516,18 +522,24 @@ class KubernetesDecorator(StepDecorator):
516
522
  )[0]
517
523
 
518
524
 
525
+ # TODO: Unify this method with the multi-node setup in @batch
519
526
  def _setup_multinode_environment():
527
+ # FIXME: what about MF_MASTER_PORT
520
528
  import socket
521
529
 
522
- os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(os.environ["MASTER_ADDR"])
523
- os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["WORLD_SIZE"]
524
- if os.environ.get("CONTROL_INDEX") is not None:
525
- os.environ["MF_PARALLEL_NODE_INDEX"] = str(0)
526
- elif os.environ.get("WORKER_REPLICA_INDEX") is not None:
527
- os.environ["MF_PARALLEL_NODE_INDEX"] = str(
528
- int(os.environ["WORKER_REPLICA_INDEX"]) + 1
530
+ try:
531
+ os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(
532
+ os.environ["MF_MASTER_ADDR"]
529
533
  )
530
- else:
531
- raise MetaflowException(
532
- "Jobset related ENV vars called $CONTROL_INDEX or $WORKER_REPLICA_INDEX not found"
534
+ os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["MF_WORLD_SIZE"]
535
+ os.environ["MF_PARALLEL_NODE_INDEX"] = (
536
+ str(0)
537
+ if "MF_CONTROL_INDEX" in os.environ
538
+ else str(int(os.environ["MF_WORKER_REPLICA_INDEX"]) + 1)
533
539
  )
540
+ except KeyError as e:
541
+ raise MetaflowException("Environment variable {} is missing.".format(e))
542
+ except socket.gaierror:
543
+ raise MetaflowException("Failed to get host by name for MF_MASTER_ADDR.")
544
+ except ValueError:
545
+ raise MetaflowException("Invalid value for MF_WORKER_REPLICA_INDEX.")
@@ -1,18 +1,19 @@
1
+ import copy
1
2
  import json
2
3
  import math
3
4
  import random
4
- import time
5
- import copy
6
5
  import sys
7
- from metaflow.tracing import inject_tracing_vars
6
+ import time
7
+
8
8
  from metaflow.exception import MetaflowException
9
9
  from metaflow.metaflow_config import KUBERNETES_SECRETS
10
+ from metaflow.tracing import inject_tracing_vars
10
11
  from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
11
12
 
12
13
  CLIENT_REFRESH_INTERVAL_SECONDS = 300
13
14
  from .kubernetes_jobsets import (
14
- KubernetesJobSet, # We need this import for Kubernetes Client.
15
- )
15
+ KubernetesJobSet,
16
+ ) # We need this import for Kubernetes Client.
16
17
 
17
18
 
18
19
  class KubernetesJobException(MetaflowException):
@@ -366,7 +367,6 @@ class KubernetesJob(object):
366
367
 
367
368
 
368
369
  class RunningJob(object):
369
-
370
370
  # State Machine implementation for the lifecycle behavior documented in
371
371
  # https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/
372
372
  #