ob-metaflow 2.11.15.3__py2.py3-none-any.whl → 2.11.16.2__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This version of ob-metaflow might be problematic.

Files changed (28)
  1. metaflow/__init__.py +3 -0
  2. metaflow/clone_util.py +6 -0
  3. metaflow/extension_support/plugins.py +1 -1
  4. metaflow/metaflow_config.py +5 -3
  5. metaflow/metaflow_environment.py +3 -3
  6. metaflow/plugins/__init__.py +4 -4
  7. metaflow/plugins/argo/argo_workflows.py +8 -0
  8. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +18 -14
  9. metaflow/plugins/datatools/s3/s3.py +1 -1
  10. metaflow/plugins/gcp/__init__.py +1 -1
  11. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
  12. metaflow/plugins/kubernetes/kubernetes.py +79 -49
  13. metaflow/plugins/kubernetes/kubernetes_cli.py +20 -33
  14. metaflow/plugins/kubernetes/kubernetes_client.py +4 -1
  15. metaflow/plugins/kubernetes/kubernetes_decorator.py +44 -61
  16. metaflow/plugins/kubernetes/kubernetes_job.py +217 -584
  17. metaflow/plugins/kubernetes/kubernetes_jobsets.py +784 -0
  18. metaflow/plugins/timeout_decorator.py +2 -1
  19. metaflow/task.py +1 -12
  20. metaflow/tuple_util.py +27 -0
  21. metaflow/util.py +0 -15
  22. metaflow/version.py +1 -1
  23. {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.2.dist-info}/METADATA +2 -2
  24. {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.2.dist-info}/RECORD +28 -26
  25. {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.2.dist-info}/LICENSE +0 -0
  26. {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.2.dist-info}/WHEEL +0 -0
  27. {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.2.dist-info}/entry_points.txt +0 -0
  28. {ob_metaflow-2.11.15.3.dist-info → ob_metaflow-2.11.16.2.dist-info}/top_level.txt +0 -0

metaflow/plugins/kubernetes/kubernetes_decorator.py

@@ -2,7 +2,6 @@ import json
 import os
 import platform
 import sys
-import time
 
 from metaflow import current
 from metaflow.decorators import StepDecorator
@@ -21,7 +20,6 @@ from metaflow.metaflow_config import (
     KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
     KUBERNETES_TOLERATIONS,
     KUBERNETES_SERVICE_ACCOUNT,
-    KUBERNETES_PORT,
     KUBERNETES_SHARED_MEMORY,
     KUBERNETES_PORT,
     KUBERNETES_CPU,
@@ -31,10 +29,11 @@ from metaflow.metaflow_config import (
 from metaflow.plugins.resources_decorator import ResourcesDecorator
 from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
 from metaflow.sidecar import Sidecar
-from metaflow.unbounded_foreach import UBF_CONTROL
 
 from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
 from .kubernetes import KubernetesException, parse_kube_keyvalue_list
+from metaflow.unbounded_foreach import UBF_CONTROL
+from .kubernetes_jobsets import TaskIdConstructor
 
 from metaflow.metaflow_config import MAX_MEMORY_PER_TASK, MAX_CPU_PER_TASK
 
@@ -244,6 +243,16 @@ class KubernetesDecorator(StepDecorator):
                 "Kubernetes. Please use one or the other.".format(step=step)
             )
 
+        if any([deco.name == "parallel" for deco in decos]) and any(
+            [deco.name == "catch" for deco in decos]
+        ):
+            raise MetaflowException(
+                "Step *{step}* contains a @parallel decorator "
+                "with the @catch decorator. @catch is not supported with @parallel on Kubernetes.".format(
+                    step=step
+                )
+            )
+
         # Set run time limit for the Kubernetes job.
         self.run_time_limit = get_run_time_limit_for_task(decos)
         if self.run_time_limit < 60:
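
For context, the new guard added right after the existing Batch/Kubernetes mutual-exclusion check simply inspects the step's decorator list and refuses to combine @parallel with @catch. A minimal standalone sketch of the same check, where FakeDeco and the step name are hypothetical stand-ins, not Metaflow API:

from collections import namedtuple

# Hypothetical stand-in for Metaflow decorator objects; only .name matters here.
FakeDeco = namedtuple("FakeDeco", ["name"])

def validate_decorators(step, decos):
    # Mirrors the new check: @parallel and @catch cannot be combined on Kubernetes.
    if any(d.name == "parallel" for d in decos) and any(d.name == "catch" for d in decos):
        raise ValueError(
            "Step *{step}* contains a @parallel decorator "
            "with the @catch decorator. @catch is not supported with "
            "@parallel on Kubernetes.".format(step=step)
        )

validate_decorators("train", [FakeDeco("parallel")])  # passes silently
# validate_decorators("train", [FakeDeco("parallel"), FakeDeco("catch")])  # raises
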
@@ -327,7 +336,7 @@ class KubernetesDecorator(StepDecorator):
 
         if self.attributes["shared_memory"]:
             if not (
-                isinstance(self.attributes["shared_memory"], (int, unicode, basestring))
+                isinstance(self.attributes["shared_memory"], int)
                 and int(self.attributes["shared_memory"]) > 0
             ):
                 raise KubernetesException(
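
Note that shared_memory is now validated strictly as a positive int; the removed check referenced the Python 2-only names unicode and basestring, which raise a NameError on Python 3. A quick illustration of what passes under the tightened check:

def shared_memory_ok(value):
    # Mirrors the tightened validation: must be an int greater than zero.
    return isinstance(value, int) and int(value) > 0

print(shared_memory_ok(512))    # True
print(shared_memory_ok(0))      # False
print(shared_memory_ok("512"))  # False: strings are no longer accepted
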
@@ -433,6 +442,10 @@ class KubernetesDecorator(StepDecorator):
                 "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME"
             ]
             meta["kubernetes-node-ip"] = os.environ["METAFLOW_KUBERNETES_NODE_IP"]
+            if os.environ.get("METAFLOW_KUBERNETES_JOBSET_NAME"):
+                meta["kubernetes-jobset-name"] = os.environ[
+                    "METAFLOW_KUBERNETES_JOBSET_NAME"
+                ]
 
             # TODO (savin): Introduce equivalent support for Microsoft Azure and
             # Google Cloud Platform
@@ -465,25 +478,22 @@ class KubernetesDecorator(StepDecorator):
         self._save_logs_sidecar = Sidecar("save_logs_periodically")
         self._save_logs_sidecar.start()
 
-        num_parallel = int(os.environ.get("WORLD_SIZE", 0))
-        if num_parallel >= 1:
-            if ubf_context == UBF_CONTROL:
-                control_task_id = current.task_id
-                top_task_id = control_task_id.replace("control-", "")
-                mapper_task_ids = [control_task_id] + [
-                    "%s-node-%d" % (top_task_id, node_idx)
-                    for node_idx in range(1, num_parallel)
-                ]
-                flow._control_mapper_tasks = [
-                    "%s/%s/%s" % (run_id, step_name, mapper_task_id)
-                    for mapper_task_id in mapper_task_ids
-                ]
-                flow._control_task_is_mapper_zero = True
-            else:
-                worker_job_rank = int(os.environ["RANK"])
-                os.environ["RANK"] = str(worker_job_rank + 1)
+        num_parallel = None
+        if hasattr(flow, "_parallel_ubf_iter"):
+            num_parallel = flow._parallel_ubf_iter.num_parallel
+
+        if num_parallel and num_parallel >= 1 and ubf_context == UBF_CONTROL:
+            control_task_id, worker_task_ids = TaskIdConstructor.join_step_task_ids(
+                num_parallel
+            )
+            mapper_task_ids = [control_task_id] + worker_task_ids
+            flow._control_mapper_tasks = [
+                "%s/%s/%s" % (run_id, step_name, mapper_task_id)
+                for mapper_task_id in mapper_task_ids
+            ]
+            flow._control_task_is_mapper_zero = True
 
-        if num_parallel >= 1:
+        if num_parallel and num_parallel > 1:
             _setup_multinode_environment()
 
     def task_finished(
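
The control-task bookkeeping above now delegates task-id generation to TaskIdConstructor from the new kubernetes_jobsets module. A rough sketch of how the returned ids end up as pathspecs in flow._control_mapper_tasks; the stubbed join_step_task_ids below returns made-up values and is not the real implementation:

# Illustrative stub standing in for TaskIdConstructor.join_step_task_ids.
def stub_join_step_task_ids(num_parallel):
    control_task_id = "control-0-abc"
    worker_task_ids = ["worker-%d-abc" % idx for idx in range(num_parallel - 1)]
    return control_task_id, worker_task_ids

run_id, step_name, num_parallel = "1234", "train", 4
control_task_id, worker_task_ids = stub_join_step_task_ids(num_parallel)
mapper_task_ids = [control_task_id] + worker_task_ids

# Same shape as the pathspecs stored on flow._control_mapper_tasks.
control_mapper_tasks = [
    "%s/%s/%s" % (run_id, step_name, task_id) for task_id in mapper_task_ids
]
print(control_mapper_tasks)
# ['1234/train/control-0-abc', '1234/train/worker-0-abc',
#  '1234/train/worker-1-abc', '1234/train/worker-2-abc']
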
@@ -513,44 +523,6 @@ class KubernetesDecorator(StepDecorator):
                 # Best effort kill
                 pass
 
-        if is_task_ok and len(getattr(flow, "_control_mapper_tasks", [])) > 1:
-            self._wait_for_mapper_tasks(flow, step_name)
-
-    def _wait_for_mapper_tasks(self, flow, step_name):
-        """
-        When launching multinode task with UBF, need to wait for the secondary
-        tasks to finish cleanly and produce their output before exiting the
-        main task. Otherwise, the main task finishing will cause secondary nodes
-        to terminate immediately, and possibly prematurely.
-        """
-        from metaflow import Step  # avoid circular dependency
-
-        TIMEOUT = 600
-        last_completion_timeout = time.time() + TIMEOUT
-        print("Waiting for batch secondary tasks to finish")
-        while last_completion_timeout > time.time():
-            time.sleep(2)
-            try:
-                step_path = "%s/%s/%s" % (flow.name, current.run_id, step_name)
-                tasks = [task for task in Step(step_path)]
-                if len(tasks) == len(flow._control_mapper_tasks):
-                    if all(
-                        task.finished_at is not None for task in tasks
-                    ):  # for some reason task.finished fails
-                        return True
-                    else:
-                        print(
-                            "Waiting for all parallel tasks to finish. Finished: {}/{}".format(
-                                len(tasks),
-                                len(flow._control_mapper_tasks),
-                            )
-                        )
-            except Exception as e:
-                pass
-        raise Exception(
-            "Batch secondary workers did not finish in %s seconds" % TIMEOUT
-        )
-
     @classmethod
     def _save_package_once(cls, flow_datastore, package):
         if cls.package_url is None:
@@ -558,8 +530,19 @@ class KubernetesDecorator(StepDecorator):
                 [package.blob], len_hint=1
             )[0]
 
+
 def _setup_multinode_environment():
     import socket
+
     os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(os.environ["MASTER_ADDR"])
     os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["WORLD_SIZE"]
-    os.environ["MF_PARALLEL_NODE_INDEX"] = os.environ["RANK"]
+    if os.environ.get("CONTROL_INDEX") is not None:
+        os.environ["MF_PARALLEL_NODE_INDEX"] = str(0)
+    elif os.environ.get("WORKER_REPLICA_INDEX") is not None:
+        os.environ["MF_PARALLEL_NODE_INDEX"] = str(
+            int(os.environ["WORKER_REPLICA_INDEX"]) + 1
+        )
+    else:
+        raise MetaflowException(
+            "Jobset related ENV vars called $CONTROL_INDEX or $WORKER_REPLICA_INDEX not found"
+        )
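
The node index is no longer taken from $RANK; it is derived from the jobset-provided $CONTROL_INDEX and $WORKER_REPLICA_INDEX variables instead. A standalone sketch of the same mapping, with a plain ValueError standing in for MetaflowException:

import os

def parallel_node_index(environ=os.environ):
    # Control pod is always node 0; worker replica N maps to node N + 1.
    if environ.get("CONTROL_INDEX") is not None:
        return 0
    if environ.get("WORKER_REPLICA_INDEX") is not None:
        return int(environ["WORKER_REPLICA_INDEX"]) + 1
    raise ValueError(
        "Jobset related ENV vars called $CONTROL_INDEX or "
        "$WORKER_REPLICA_INDEX not found"
    )

# Example: a worker pod whose replica index is 2 becomes parallel node 3.
print(parallel_node_index({"WORKER_REPLICA_INDEX": "2"}))  # 3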