ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (289)
  1. metaflow/R.py +10 -7
  2. metaflow/__init__.py +40 -25
  3. metaflow/_vendor/imghdr/__init__.py +186 -0
  4. metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
  5. metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
  6. metaflow/_vendor/importlib_metadata/_collections.py +30 -0
  7. metaflow/_vendor/importlib_metadata/_compat.py +71 -0
  8. metaflow/_vendor/importlib_metadata/_functools.py +104 -0
  9. metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
  10. metaflow/_vendor/importlib_metadata/_meta.py +48 -0
  11. metaflow/_vendor/importlib_metadata/_text.py +99 -0
  12. metaflow/_vendor/importlib_metadata/py.typed +0 -0
  13. metaflow/_vendor/typeguard/__init__.py +48 -0
  14. metaflow/_vendor/typeguard/_checkers.py +1070 -0
  15. metaflow/_vendor/typeguard/_config.py +108 -0
  16. metaflow/_vendor/typeguard/_decorators.py +233 -0
  17. metaflow/_vendor/typeguard/_exceptions.py +42 -0
  18. metaflow/_vendor/typeguard/_functions.py +308 -0
  19. metaflow/_vendor/typeguard/_importhook.py +213 -0
  20. metaflow/_vendor/typeguard/_memo.py +48 -0
  21. metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
  22. metaflow/_vendor/typeguard/_suppression.py +86 -0
  23. metaflow/_vendor/typeguard/_transformer.py +1229 -0
  24. metaflow/_vendor/typeguard/_union_transformer.py +55 -0
  25. metaflow/_vendor/typeguard/_utils.py +173 -0
  26. metaflow/_vendor/typeguard/py.typed +0 -0
  27. metaflow/_vendor/typing_extensions.py +3641 -0
  28. metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
  29. metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
  30. metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
  31. metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
  32. metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
  33. metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
  34. metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
  35. metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
  36. metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
  37. metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
  38. metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
  39. metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
  40. metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
  41. metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
  42. metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
  43. metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
  44. metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
  45. metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
  46. metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
  47. metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
  48. metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
  49. metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
  50. metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
  51. metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
  52. metaflow/_vendor/yaml/__init__.py +427 -0
  53. metaflow/_vendor/yaml/composer.py +139 -0
  54. metaflow/_vendor/yaml/constructor.py +748 -0
  55. metaflow/_vendor/yaml/cyaml.py +101 -0
  56. metaflow/_vendor/yaml/dumper.py +62 -0
  57. metaflow/_vendor/yaml/emitter.py +1137 -0
  58. metaflow/_vendor/yaml/error.py +75 -0
  59. metaflow/_vendor/yaml/events.py +86 -0
  60. metaflow/_vendor/yaml/loader.py +63 -0
  61. metaflow/_vendor/yaml/nodes.py +49 -0
  62. metaflow/_vendor/yaml/parser.py +589 -0
  63. metaflow/_vendor/yaml/reader.py +185 -0
  64. metaflow/_vendor/yaml/representer.py +389 -0
  65. metaflow/_vendor/yaml/resolver.py +227 -0
  66. metaflow/_vendor/yaml/scanner.py +1435 -0
  67. metaflow/_vendor/yaml/serializer.py +111 -0
  68. metaflow/_vendor/yaml/tokens.py +104 -0
  69. metaflow/cards.py +5 -0
  70. metaflow/cli.py +331 -785
  71. metaflow/cli_args.py +17 -0
  72. metaflow/cli_components/__init__.py +0 -0
  73. metaflow/cli_components/dump_cmd.py +96 -0
  74. metaflow/cli_components/init_cmd.py +52 -0
  75. metaflow/cli_components/run_cmds.py +546 -0
  76. metaflow/cli_components/step_cmd.py +334 -0
  77. metaflow/cli_components/utils.py +140 -0
  78. metaflow/client/__init__.py +1 -0
  79. metaflow/client/core.py +467 -73
  80. metaflow/client/filecache.py +75 -35
  81. metaflow/clone_util.py +7 -1
  82. metaflow/cmd/code/__init__.py +231 -0
  83. metaflow/cmd/develop/stub_generator.py +756 -288
  84. metaflow/cmd/develop/stubs.py +12 -28
  85. metaflow/cmd/main_cli.py +6 -4
  86. metaflow/cmd/make_wrapper.py +78 -0
  87. metaflow/datastore/__init__.py +1 -0
  88. metaflow/datastore/content_addressed_store.py +41 -10
  89. metaflow/datastore/datastore_set.py +11 -2
  90. metaflow/datastore/flow_datastore.py +156 -10
  91. metaflow/datastore/spin_datastore.py +91 -0
  92. metaflow/datastore/task_datastore.py +154 -39
  93. metaflow/debug.py +5 -0
  94. metaflow/decorators.py +404 -78
  95. metaflow/exception.py +8 -2
  96. metaflow/extension_support/__init__.py +527 -376
  97. metaflow/extension_support/_empty_file.py +2 -2
  98. metaflow/extension_support/plugins.py +49 -31
  99. metaflow/flowspec.py +482 -33
  100. metaflow/graph.py +210 -42
  101. metaflow/includefile.py +84 -40
  102. metaflow/lint.py +141 -22
  103. metaflow/meta_files.py +13 -0
  104. metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
  105. metaflow/{metadata → metadata_provider}/metadata.py +86 -1
  106. metaflow/metaflow_config.py +175 -28
  107. metaflow/metaflow_config_funcs.py +51 -3
  108. metaflow/metaflow_current.py +4 -10
  109. metaflow/metaflow_environment.py +139 -53
  110. metaflow/metaflow_git.py +115 -0
  111. metaflow/metaflow_profile.py +18 -0
  112. metaflow/metaflow_version.py +150 -66
  113. metaflow/mflog/__init__.py +4 -3
  114. metaflow/mflog/save_logs.py +2 -2
  115. metaflow/multicore_utils.py +31 -14
  116. metaflow/package/__init__.py +673 -0
  117. metaflow/packaging_sys/__init__.py +880 -0
  118. metaflow/packaging_sys/backend.py +128 -0
  119. metaflow/packaging_sys/distribution_support.py +153 -0
  120. metaflow/packaging_sys/tar_backend.py +99 -0
  121. metaflow/packaging_sys/utils.py +54 -0
  122. metaflow/packaging_sys/v1.py +527 -0
  123. metaflow/parameters.py +149 -28
  124. metaflow/plugins/__init__.py +74 -5
  125. metaflow/plugins/airflow/airflow.py +40 -25
  126. metaflow/plugins/airflow/airflow_cli.py +22 -5
  127. metaflow/plugins/airflow/airflow_decorator.py +1 -1
  128. metaflow/plugins/airflow/airflow_utils.py +5 -3
  129. metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
  130. metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
  131. metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
  132. metaflow/plugins/argo/argo_client.py +78 -33
  133. metaflow/plugins/argo/argo_events.py +6 -6
  134. metaflow/plugins/argo/argo_workflows.py +2410 -527
  135. metaflow/plugins/argo/argo_workflows_cli.py +571 -121
  136. metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
  137. metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
  138. metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
  139. metaflow/plugins/argo/capture_error.py +73 -0
  140. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  141. metaflow/plugins/argo/exit_hooks.py +209 -0
  142. metaflow/plugins/argo/jobset_input_paths.py +15 -0
  143. metaflow/plugins/argo/param_val.py +19 -0
  144. metaflow/plugins/aws/aws_client.py +10 -3
  145. metaflow/plugins/aws/aws_utils.py +55 -2
  146. metaflow/plugins/aws/batch/batch.py +72 -5
  147. metaflow/plugins/aws/batch/batch_cli.py +33 -10
  148. metaflow/plugins/aws/batch/batch_client.py +4 -3
  149. metaflow/plugins/aws/batch/batch_decorator.py +102 -35
  150. metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
  151. metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
  152. metaflow/plugins/aws/step_functions/production_token.py +1 -1
  153. metaflow/plugins/aws/step_functions/step_functions.py +65 -8
  154. metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
  155. metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
  156. metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
  157. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
  158. metaflow/plugins/azure/azure_exceptions.py +1 -1
  159. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
  160. metaflow/plugins/azure/azure_tail.py +1 -1
  161. metaflow/plugins/azure/includefile_support.py +2 -0
  162. metaflow/plugins/cards/card_cli.py +66 -30
  163. metaflow/plugins/cards/card_creator.py +25 -1
  164. metaflow/plugins/cards/card_datastore.py +21 -49
  165. metaflow/plugins/cards/card_decorator.py +132 -8
  166. metaflow/plugins/cards/card_modules/basic.py +112 -17
  167. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  168. metaflow/plugins/cards/card_modules/card.py +16 -1
  169. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  170. metaflow/plugins/cards/card_modules/components.py +665 -28
  171. metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
  172. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  173. metaflow/plugins/cards/card_modules/main.css +1 -0
  174. metaflow/plugins/cards/card_modules/main.js +68 -49
  175. metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
  176. metaflow/plugins/cards/card_modules/test_cards.py +26 -12
  177. metaflow/plugins/cards/card_server.py +39 -14
  178. metaflow/plugins/cards/component_serializer.py +2 -9
  179. metaflow/plugins/cards/metadata.py +22 -0
  180. metaflow/plugins/catch_decorator.py +9 -0
  181. metaflow/plugins/datastores/azure_storage.py +10 -1
  182. metaflow/plugins/datastores/gs_storage.py +6 -2
  183. metaflow/plugins/datastores/local_storage.py +12 -6
  184. metaflow/plugins/datastores/spin_storage.py +12 -0
  185. metaflow/plugins/datatools/local.py +2 -0
  186. metaflow/plugins/datatools/s3/s3.py +126 -75
  187. metaflow/plugins/datatools/s3/s3op.py +254 -121
  188. metaflow/plugins/env_escape/__init__.py +3 -3
  189. metaflow/plugins/env_escape/client_modules.py +102 -72
  190. metaflow/plugins/env_escape/server.py +7 -0
  191. metaflow/plugins/env_escape/stub.py +24 -5
  192. metaflow/plugins/events_decorator.py +343 -185
  193. metaflow/plugins/exit_hook/__init__.py +0 -0
  194. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  195. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  196. metaflow/plugins/gcp/__init__.py +1 -1
  197. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
  198. metaflow/plugins/gcp/gs_tail.py +10 -6
  199. metaflow/plugins/gcp/includefile_support.py +3 -0
  200. metaflow/plugins/kubernetes/kube_utils.py +108 -0
  201. metaflow/plugins/kubernetes/kubernetes.py +411 -130
  202. metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
  203. metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
  204. metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
  205. metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
  206. metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
  207. metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
  208. metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
  209. metaflow/plugins/logs_cli.py +359 -0
  210. metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
  211. metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
  212. metaflow/plugins/metadata_providers/spin.py +16 -0
  213. metaflow/plugins/package_cli.py +36 -24
  214. metaflow/plugins/parallel_decorator.py +128 -11
  215. metaflow/plugins/parsers.py +16 -0
  216. metaflow/plugins/project_decorator.py +51 -5
  217. metaflow/plugins/pypi/bootstrap.py +357 -105
  218. metaflow/plugins/pypi/conda_decorator.py +82 -81
  219. metaflow/plugins/pypi/conda_environment.py +187 -52
  220. metaflow/plugins/pypi/micromamba.py +157 -47
  221. metaflow/plugins/pypi/parsers.py +268 -0
  222. metaflow/plugins/pypi/pip.py +88 -13
  223. metaflow/plugins/pypi/pypi_decorator.py +37 -1
  224. metaflow/plugins/pypi/utils.py +48 -2
  225. metaflow/plugins/resources_decorator.py +2 -2
  226. metaflow/plugins/secrets/__init__.py +3 -0
  227. metaflow/plugins/secrets/secrets_decorator.py +26 -181
  228. metaflow/plugins/secrets/secrets_func.py +49 -0
  229. metaflow/plugins/secrets/secrets_spec.py +101 -0
  230. metaflow/plugins/secrets/utils.py +74 -0
  231. metaflow/plugins/tag_cli.py +4 -7
  232. metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
  233. metaflow/plugins/timeout_decorator.py +3 -3
  234. metaflow/plugins/uv/__init__.py +0 -0
  235. metaflow/plugins/uv/bootstrap.py +128 -0
  236. metaflow/plugins/uv/uv_environment.py +72 -0
  237. metaflow/procpoll.py +1 -1
  238. metaflow/pylint_wrapper.py +5 -1
  239. metaflow/runner/__init__.py +0 -0
  240. metaflow/runner/click_api.py +717 -0
  241. metaflow/runner/deployer.py +470 -0
  242. metaflow/runner/deployer_impl.py +201 -0
  243. metaflow/runner/metaflow_runner.py +714 -0
  244. metaflow/runner/nbdeploy.py +132 -0
  245. metaflow/runner/nbrun.py +225 -0
  246. metaflow/runner/subprocess_manager.py +650 -0
  247. metaflow/runner/utils.py +335 -0
  248. metaflow/runtime.py +1078 -260
  249. metaflow/sidecar/sidecar_worker.py +1 -1
  250. metaflow/system/__init__.py +5 -0
  251. metaflow/system/system_logger.py +85 -0
  252. metaflow/system/system_monitor.py +108 -0
  253. metaflow/system/system_utils.py +19 -0
  254. metaflow/task.py +521 -225
  255. metaflow/tracing/__init__.py +7 -7
  256. metaflow/tracing/span_exporter.py +31 -38
  257. metaflow/tracing/tracing_modules.py +38 -43
  258. metaflow/tuple_util.py +27 -0
  259. metaflow/user_configs/__init__.py +0 -0
  260. metaflow/user_configs/config_options.py +563 -0
  261. metaflow/user_configs/config_parameters.py +598 -0
  262. metaflow/user_decorators/__init__.py +0 -0
  263. metaflow/user_decorators/common.py +144 -0
  264. metaflow/user_decorators/mutable_flow.py +512 -0
  265. metaflow/user_decorators/mutable_step.py +424 -0
  266. metaflow/user_decorators/user_flow_decorator.py +264 -0
  267. metaflow/user_decorators/user_step_decorator.py +749 -0
  268. metaflow/util.py +243 -27
  269. metaflow/vendor.py +23 -7
  270. metaflow/version.py +1 -1
  271. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
  272. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
  273. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
  274. ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
  275. ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
  276. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  277. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
  278. metaflow/_vendor/v3_5/__init__.py +0 -1
  279. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  280. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  281. metaflow/package.py +0 -188
  282. ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
  283. ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
  284. /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
  285. /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
  286. /metaflow/{metadata → metadata_provider}/util.py +0 -0
  287. /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
  288. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
  289. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
metaflow/runtime.py CHANGED
@@ -4,39 +4,61 @@ Local backend
 Execute the flow with a native runtime
 using local / remote processes
 """
+
 from __future__ import print_function
+import json
 import os
 import sys
 import fcntl
+import re
+import tempfile
 import time
 import subprocess
 from datetime import datetime
+from enum import Enum
 from io import BytesIO
+from itertools import chain
 from functools import partial
 from concurrent import futures
 
+from typing import Dict, Tuple
 from metaflow.datastore.exceptions import DataException
+from contextlib import contextmanager
 
 from . import get_namespace
-from .metadata import MetaDatum
-from .metaflow_config import MAX_ATTEMPTS, UI_URL
+from .client.filecache import FileCache, FileBlobCache, TaskMetadataCache
+from .metadata_provider import MetaDatum
+from .metaflow_config import (
+    FEAT_ALWAYS_UPLOAD_CODE_PACKAGE,
+    MAX_ATTEMPTS,
+    UI_URL,
+    SPIN_ALLOWED_DECORATORS,
+    SPIN_DISALLOWED_DECORATORS,
+)
+from .metaflow_profile import from_start
+from .plugins import DATASTORES
 from .exception import (
     MetaflowException,
     MetaflowInternalError,
     METAFLOW_EXIT_DISALLOW_RETRY,
 )
 from . import procpoll
-from .datastore import TaskDataStoreSet
+from .datastore import FlowDataStore, TaskDataStoreSet
 from .debug import debug
 from .decorators import flow_decorators
+from .flowspec import FlowStateItems
 from .mflog import mflog, RUNTIME_LOG_SOURCE
-from .util import to_unicode, compress_list, unicode_type
+from .util import to_unicode, compress_list, unicode_type, get_latest_task_pathspec
 from .clone_util import clone_task_helper
 from .unbounded_foreach import (
     CONTROL_TASK_TAG,
     UBF_CONTROL,
     UBF_TASK,
 )
+
+from .user_configs.config_options import ConfigInput
+from .user_configs.config_parameters import dump_config_values
+
 import metaflow.tracing as tracing
 
 MAX_WORKERS = 16
@@ -47,9 +69,24 @@ PROGRESS_INTERVAL = 300  # s
 # The following is a list of the (data) artifacts used by the runtime while
 # executing a flow. These are prefetched during the resume operation by
 # leveraging the TaskDataStoreSet.
-PREFETCH_DATA_ARTIFACTS = ["_foreach_stack", "_task_ok", "_transition"]
+PREFETCH_DATA_ARTIFACTS = [
+    "_foreach_stack",
+    "_iteration_stack",
+    "_task_ok",
+    "_transition",
+    "_control_mapper_tasks",
+    "_control_task_is_mapper_zero",
+]
 RESUME_POLL_SECONDS = 60
 
+
+class LoopBehavior(Enum):
+    NONE = "none"
+    ENTERING = "entering"
+    EXITING = "exiting"
+    LOOPING = "looping"
+
+
 # Runtime must use logsource=RUNTIME_LOG_SOURCE for all loglines that it
 # formats according to mflog. See a comment in mflog.__init__
 mflog_msg = partial(mflog.decorate, RUNTIME_LOG_SOURCE)
@@ -57,6 +94,253 @@ mflog_msg = partial(mflog.decorate, RUNTIME_LOG_SOURCE)
 # TODO option: output dot graph periodically about execution
 
 
+class SpinRuntime(object):
+    def __init__(
+        self,
+        flow,
+        graph,
+        flow_datastore,
+        metadata,
+        environment,
+        package,
+        logger,
+        entrypoint,
+        event_logger,
+        monitor,
+        step_func,
+        step_name,
+        spin_pathspec,
+        skip_decorators=False,
+        artifacts_module=None,
+        persist=True,
+        max_log_size=MAX_LOG_SIZE,
+    ):
+        from metaflow import Task
+
+        self._flow = flow
+        self._graph = graph
+        self._flow_datastore = flow_datastore
+        self._metadata = metadata
+        self._environment = environment
+        self._package = package
+        self._logger = logger
+        self._entrypoint = entrypoint
+        self._event_logger = event_logger
+        self._monitor = monitor
+
+        self._step_func = step_func
+
+        # Determine if we have a complete pathspec or need to get the task
+        if spin_pathspec:
+            parts = spin_pathspec.split("/")
+            if len(parts) == 4:
+                # Complete pathspec: flow/run/step/task_id
+                try:
+                    # If user provides whole pathspec, we do not need to check namespace
+                    task = Task(spin_pathspec, _namespace_check=False)
+                except Exception:
+                    raise MetaflowException(
+                        f"Invalid pathspec: {spin_pathspec} for step: {step_name}"
+                    )
+            elif len(parts) == 3:
+                # Partial pathspec: flow/run/step - need to get the task
+                _, run_id, _ = parts
+                task = get_latest_task_pathspec(flow.name, step_name, run_id=run_id)
+                logger(
+                    f"To make spin even faster, provide complete pathspec with task_id: {task.pathspec}",
+                    system_msg=True,
+                )
+            else:
+                raise MetaflowException(
+                    f"Invalid pathspec format: {spin_pathspec}. Expected flow/run/step or flow/run/step/task_id"
+                )
+        else:
+            # No pathspec provided, get latest task for this step
+            task = get_latest_task_pathspec(flow.name, step_name)
+            logger(
+                f"To make spin even faster, provide complete pathspec {task.pathspec}",
+                system_msg=True,
+            )
+        from_start("SpinRuntime: after getting task")
+
+        # Get the original FlowDatastore so we can use it to access artifacts from the
+        # spun task
+        meta_dict = task.metadata_dict
+        ds_type = meta_dict["ds-type"]
+        ds_root = meta_dict["ds-root"]
+        orig_datastore_impl = [d for d in DATASTORES if d.TYPE == ds_type][0]
+        orig_datastore_impl.datastore_root = ds_root
+        spin_pathspec = task.pathspec
+        orig_flow_datastore = FlowDataStore(
+            flow.name,
+            environment=None,
+            storage_impl=orig_datastore_impl,
+            ds_root=ds_root,
+        )
+
+        self._filecache = FileCache()
+        orig_flow_datastore.set_metadata_cache(
+            TaskMetadataCache(self._filecache, ds_type, ds_root, flow.name)
+        )
+        orig_flow_datastore.ca_store.set_blob_cache(
+            FileBlobCache(
+                self._filecache, FileCache.flow_ds_id(ds_type, ds_root, flow.name)
+            )
+        )
+
+        self._orig_flow_datastore = orig_flow_datastore
+        self._spin_pathspec = spin_pathspec
+        self._persist = persist
+        self._spin_task = task
+        self._input_paths = None
+        self._split_index = None
+        self._whitelist_decorators = None
+        self._config_file_name = None
+        self._skip_decorators = skip_decorators
+        self._artifacts_module = artifacts_module
+        self._max_log_size = max_log_size
+        self._encoding = sys.stdout.encoding or "UTF-8"
+
+        # Create a new run_id for the spin task
+        self.run_id = self._metadata.new_run_id()
+        # Raise exception if we have a black listed decorator
+        for deco in self._step_func.decorators:
+            if deco.name in SPIN_DISALLOWED_DECORATORS:
+                raise MetaflowException(
+                    f"Spinning steps with @{deco.name} decorator is not supported."
+                )
+
+        for deco in self.whitelist_decorators:
+            deco.runtime_init(flow, graph, package, self.run_id)
+        from_start("SpinRuntime: after init decorators")
+
+    @property
+    def split_index(self):
+        """
+        Returns the split index, caching the result after the first access.
+        """
+        if self._split_index is None:
+            self._split_index = getattr(self._spin_task, "index", None)
+
+        return self._split_index
+
+    @property
+    def input_paths(self):
+        def _format_input_paths(task_pathspec, attempt):
+            _, run_id, step_name, task_id = task_pathspec.split("/")
+            return f"{run_id}/{step_name}/{task_id}/{attempt}"
+
+        if self._input_paths:
+            return self._input_paths
+
+        if self._step_func.name == "start":
+            from metaflow import Step
+
+            flow_name, run_id, _, _ = self._spin_pathspec.split("/")
+            task = Step(
+                f"{flow_name}/{run_id}/_parameters", _namespace_check=False
+            ).task
+            self._input_paths = [
+                _format_input_paths(task.pathspec, task.current_attempt)
+            ]
+        else:
+            parent_tasks = self._spin_task.parent_tasks
+            self._input_paths = [
+                _format_input_paths(t.pathspec, t.current_attempt) for t in parent_tasks
+            ]
+        return self._input_paths
+
+    @property
+    def whitelist_decorators(self):
+        if self._skip_decorators:
+            self._whitelist_decorators = []
+            return self._whitelist_decorators
+        if self._whitelist_decorators:
+            return self._whitelist_decorators
+        self._whitelist_decorators = [
+            deco
+            for deco in self._step_func.decorators
+            if any(deco.name.startswith(prefix) for prefix in SPIN_ALLOWED_DECORATORS)
+        ]
+        return self._whitelist_decorators
+
+    def _new_task(self, step, input_paths=None, **kwargs):
+        return Task(
+            flow_datastore=self._flow_datastore,
+            flow=self._flow,
+            step=step,
+            run_id=self.run_id,
+            metadata=self._metadata,
+            environment=self._environment,
+            entrypoint=self._entrypoint,
+            event_logger=self._event_logger,
+            monitor=self._monitor,
+            input_paths=input_paths,
+            decos=self.whitelist_decorators,
+            logger=self._logger,
+            split_index=self.split_index,
+            **kwargs,
+        )
+
+    def execute(self):
+        exception = None
+        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
+            config_value = dump_config_values(self._flow)
+            if config_value:
+                json.dump(config_value, config_file)
+                config_file.flush()
+                self._config_file_name = config_file.name
+            else:
+                self._config_file_name = None
+            from_start("SpinRuntime: config values processed")
+            self.task = self._new_task(self._step_func.name, self.input_paths)
+            try:
+                self._launch_and_monitor_task()
+            except Exception as ex:
+                self._logger("Task failed.", system_msg=True, bad=True)
+                exception = ex
+                raise
+            finally:
+                for deco in self.whitelist_decorators:
+                    deco.runtime_finished(exception)
+
+    def _launch_and_monitor_task(self):
+        worker = Worker(
+            self.task,
+            self._max_log_size,
+            self._config_file_name,
+            orig_flow_datastore=self._orig_flow_datastore,
+            spin_pathspec=self._spin_pathspec,
+            artifacts_module=self._artifacts_module,
+            persist=self._persist,
+            skip_decorators=self._skip_decorators,
+        )
+        from_start("SpinRuntime: created worker")
+
+        poll = procpoll.make_poll()
+        fds = worker.fds()
+        for fd in fds:
+            poll.add(fd)
+
+        active_fds = set(fds)
+
+        while active_fds:
+            events = poll.poll(POLL_TIMEOUT)
+            for event in events:
+                if event.can_read:
+                    worker.read_logline(event.fd)
+                if event.is_terminated:
+                    poll.remove(event.fd)
+                    active_fds.remove(event.fd)
+        from_start("SpinRuntime: read loglines")
+        returncode = worker.terminate()
+        from_start("SpinRuntime: worker terminated")
+        if returncode != 0:
+            raise TaskFailed(self.task, f"Task failed with return code {returncode}")
+        else:
+            self._logger("Task finished successfully.", system_msg=True)
+
+
 class NativeRuntime(object):
     def __init__(
         self,
@@ -74,11 +358,12 @@ class NativeRuntime(object):
         clone_run_id=None,
         clone_only=False,
         reentrant=False,
-        clone_steps=None,
+        steps_to_rerun=None,
         max_workers=MAX_WORKERS,
         max_num_splits=MAX_NUM_SPLITS,
         max_log_size=MAX_LOG_SIZE,
         resume_identifier=None,
+        skip_decorator_hooks=False,
     ):
         if run_id is None:
             self._run_id = metadata.new_run_id()
@@ -91,6 +376,7 @@ class NativeRuntime(object):
         self._flow_datastore = flow_datastore
         self._metadata = metadata
         self._environment = environment
+        self._package = package
         self._logger = logger
         self._max_workers = max_workers
         self._active_tasks = dict()  # Key: step name;
@@ -108,9 +394,21 @@ class NativeRuntime(object):
 
         self._clone_run_id = clone_run_id
         self._clone_only = clone_only
-        self._clone_steps = {} if clone_steps is None else clone_steps
+        self._cloned_tasks = []
+        self._ran_or_scheduled_task_index = set()
         self._reentrant = reentrant
         self._run_url = None
+        self._skip_decorator_hooks = skip_decorator_hooks
+
+        # If steps_to_rerun is specified, we will not clone them in resume mode.
+        self._steps_to_rerun = steps_to_rerun or {}
+        # sorted_nodes are in topological order already, so we only need to
+        # iterate through the nodes once to get a stable set of rerun steps.
+        for step_name in self._graph.sorted_nodes:
+            if step_name in self._steps_to_rerun:
+                out_funcs = self._graph[step_name].out_funcs or []
+                for next_step in out_funcs:
+                    self._steps_to_rerun.add(next_step)
 
         self._origin_ds_set = None
         if clone_run_id:
@@ -152,21 +450,21 @@ class NativeRuntime(object):
         # finished.
         self._control_num_splits = {}  # control_task -> num_splits mapping
 
-        for step in flow:
-            for deco in step.decorators:
-                deco.runtime_init(flow, graph, package, self._run_id)
+        if not self._skip_decorator_hooks:
+            for step in flow:
+                for deco in step.decorators:
+                    deco.runtime_init(flow, graph, package, self._run_id)
 
     def _new_task(self, step, input_paths=None, **kwargs):
-
         if input_paths is None:
             may_clone = True
         else:
             may_clone = all(self._is_cloned[path] for path in input_paths)
 
-        if step in self._clone_steps:
+        if step in self._steps_to_rerun:
             may_clone = False
 
-        if step == "_parameters":
+        if step == "_parameters" or self._skip_decorator_hooks:
             decos = []
         else:
             decos = getattr(self._flow, step).decorators
@@ -204,6 +502,22 @@ class NativeRuntime(object):
 
         self._is_cloned[self._params_task.path] = self._params_task.is_cloned
 
+    def should_skip_clone_only_execution(self):
+        (
+            should_skip_clone_only_execution,
+            skip_reason,
+        ) = self._should_skip_clone_only_execution()
+        if should_skip_clone_only_execution:
+            self._logger(skip_reason, system_msg=True)
+            return True
+        return False
+
+    @contextmanager
+    def run_heartbeat(self):
+        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
+        yield
+        self._metadata.stop_heartbeat()
+
     def print_workflow_info(self):
         self._run_url = (
             "%s/%s/%s" % (UI_URL.rstrip("/"), self._flow.name, self._run_id)
@@ -236,157 +550,375 @@ class NativeRuntime(object):
             )
             return False, None
 
-    def clone_task(self, step_name, task_id):
-        self._logger(
-            "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
+    def clone_task(
+        self,
+        step_name,
+        task_id,
+        pathspec_index,
+        cloned_task_pathspec_index,
+        finished_tuple,
+        iteration_tuple,
+        ubf_context,
+        generate_task_obj,
+        verbose=False,
+    ):
+        try:
+            new_task_id = task_id
+            if generate_task_obj:
+                task = self._new_task(step_name, pathspec_index=pathspec_index)
+                if ubf_context:
+                    task.ubf_context = ubf_context
+                new_task_id = task.task_id
+                self._cloned_tasks.append(task)
+                self._ran_or_scheduled_task_index.add(cloned_task_pathspec_index)
+                task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
+            else:
+                task_pathspec = "{}/{}/{}".format(self._run_id, step_name, new_task_id)
+                Task.clone_pathspec_mapping[task_pathspec] = "{}/{}/{}".format(
+                    self._clone_run_id, step_name, task_id
+                )
+            if verbose:
+                self._logger(
+                    "Cloning task from {}/{}/{}/{} to {}/{}/{}/{}".format(
+                        self._flow.name,
+                        self._clone_run_id,
+                        step_name,
+                        task_id,
+                        self._flow.name,
+                        self._run_id,
+                        step_name,
+                        new_task_id,
+                    ),
+                    system_msg=True,
+                )
+            clone_task_helper(
                 self._flow.name,
                 self._clone_run_id,
-                step_name,
-                task_id,
-                self._flow.name,
                 self._run_id,
                 step_name,
-                task_id,
-            ),
-            system_msg=True,
-        )
-        clone_task_helper(
-            self._flow.name,
-            self._clone_run_id,
-            self._run_id,
-            step_name,
-            task_id,  # origin_task_id
-            task_id,
-            self._flow_datastore,
-            self._metadata,
-            origin_ds_set=self._origin_ds_set,
-        )
+                task_id,  # origin_task_id
+                new_task_id,
+                self._flow_datastore,
+                self._metadata,
+                origin_ds_set=self._origin_ds_set,
+            )
+            self._finished[(step_name, finished_tuple, iteration_tuple)] = task_pathspec
+            self._is_cloned[task_pathspec] = True
+        except Exception as e:
+            self._logger(
+                "Cloning {}/{}/{}/{} failed with error: {}".format(
+                    self._flow.name, self._clone_run_id, step_name, task_id, str(e)
+                )
+            )
 
-    def clone_original_run(self):
-        (
-            should_skip_clone_only_execution,
-            skip_reason,
-        ) = self._should_skip_clone_only_execution()
-        if should_skip_clone_only_execution:
-            self._logger(skip_reason, system_msg=True)
-            return
-        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
+    def clone_original_run(self, generate_task_obj=False, verbose=True):
         self._logger(
-            "Start cloning original run: {}/{}".format(
-                self._flow.name, self._clone_run_id
-            ),
+            "Cloning {}/{}".format(self._flow.name, self._clone_run_id),
             system_msg=True,
         )
 
         inputs = []
 
+        ubf_mapper_tasks_to_clone = set()
+        ubf_control_tasks = set()
+        # We only clone ubf mapper tasks if the control task is complete.
+        # Here we need to check which control tasks are complete, and then get the corresponding
+        # mapper tasks.
         for task_ds in self._origin_ds_set:
             _, step_name, task_id = task_ds.pathspec.split("/")
+            pathspec_index = task_ds.pathspec_index
             if task_ds["_task_ok"] and step_name != "_parameters":
-                inputs.append((step_name, task_id))
+                # Control task contains "_control_mapper_tasks" but, in the case of
+                # @parallel decorator, the control task is also a mapper task so we
+                # need to distinguish this using _control_task_is_mapper_zero
+                control_mapper_tasks = (
+                    []
+                    if "_control_mapper_tasks" not in task_ds
+                    else task_ds["_control_mapper_tasks"]
+                )
+                if control_mapper_tasks:
+                    if task_ds.get("_control_task_is_mapper_zero", False):
+                        # Strip out the control task of list of mapper tasks
+                        ubf_control_tasks.add(control_mapper_tasks[0])
+                        ubf_mapper_tasks_to_clone.update(control_mapper_tasks[1:])
+                    else:
+                        ubf_mapper_tasks_to_clone.update(control_mapper_tasks)
+                        # Since we only add mapper tasks here, if we are not in the list
+                        # we are a control task
+                        if task_ds.pathspec not in ubf_mapper_tasks_to_clone:
+                            ubf_control_tasks.add(task_ds.pathspec)
+
+        for task_ds in self._origin_ds_set:
+            _, step_name, task_id = task_ds.pathspec.split("/")
+            pathspec_index = task_ds.pathspec_index
+
+            if (
+                task_ds["_task_ok"]
+                and step_name != "_parameters"
+                and (step_name not in self._steps_to_rerun)
+            ):
+                # "_unbounded_foreach" is a special flag to indicate that the transition
+                # is an unbounded foreach.
+                # Both parent and splitted children tasks will have this flag set.
+                # The splitted control/mapper tasks
+                # are not foreach types because UBF is always followed by a join step.
+                is_ubf_task = (
+                    "_unbounded_foreach" in task_ds and task_ds["_unbounded_foreach"]
+                ) and (self._graph[step_name].type != "foreach")
+
+                is_ubf_control_task = task_ds.pathspec in ubf_control_tasks
+
+                is_ubf_mapper_task = is_ubf_task and (not is_ubf_control_task)
+
+                if is_ubf_mapper_task and (
+                    task_ds.pathspec not in ubf_mapper_tasks_to_clone
+                ):
+                    # Skip copying UBF mapper tasks if control task is incomplete.
+                    continue
+
+                ubf_context = None
+                if is_ubf_task:
+                    ubf_context = "ubf_test" if is_ubf_mapper_task else "ubf_control"
+
+                finished_tuple = tuple(
+                    [s._replace(value=0) for s in task_ds.get("_foreach_stack", ())]
+                )
+                iteration_tuple = tuple(task_ds.get("_iteration_stack", ()))
+                cloned_task_pathspec_index = pathspec_index.split("/")[1]
+                if task_ds.get("_control_task_is_mapper_zero", False):
+                    # Replace None with index 0 for control task as it is part of the
+                    # UBF (as a mapper as well)
+                    finished_tuple = finished_tuple[:-1] + (
+                        finished_tuple[-1]._replace(index=0),
+                    )
+                    # We need this reverse override though because when we check
+                    # if a task has been cloned in _queue_push, the index will be None
+                    # because the _control_task_is_mapper_zero is set in the control
+                    # task *itself* and *not* in the one that is launching the UBF nest.
+                    # This means that _translate_index will use None.
+                    cloned_task_pathspec_index = re.sub(
+                        r"(\[(?:\d+, ?)*)0\]",
+                        lambda m: (m.group(1) or "[") + "None]",
+                        cloned_task_pathspec_index,
+                    )
+
+                inputs.append(
+                    (
+                        step_name,
+                        task_id,
+                        pathspec_index,
+                        cloned_task_pathspec_index,
+                        finished_tuple,
+                        iteration_tuple,
+                        is_ubf_mapper_task,
+                        ubf_context,
+                    )
+                )
 
         with futures.ThreadPoolExecutor(max_workers=self._max_workers) as executor:
             all_tasks = [
-                executor.submit(self.clone_task, step_name, task_id)
-                for (step_name, task_id) in inputs
+                executor.submit(
+                    self.clone_task,
+                    step_name,
+                    task_id,
+                    pathspec_index,
+                    cloned_task_pathspec_index,
+                    finished_tuple,
+                    iteration_tuple,
+                    ubf_context=ubf_context,
+                    generate_task_obj=generate_task_obj and (not is_ubf_mapper_task),
+                    verbose=verbose,
+                )
+                for (
+                    step_name,
+                    task_id,
+                    pathspec_index,
+                    cloned_task_pathspec_index,
+                    finished_tuple,
+                    iteration_tuple,
+                    is_ubf_mapper_task,
+                    ubf_context,
+                ) in inputs
             ]
             _, _ = futures.wait(all_tasks)
-        self._logger("Cloning original run is done", system_msg=True)
+        self._logger(
+            "{}/{} cloned!".format(self._flow.name, self._clone_run_id), system_msg=True
+        )
         self._params_task.mark_resume_done()
-        self._metadata.stop_heartbeat()
 
     def execute(self):
-        (
-            should_skip_clone_only_execution,
-            skip_reason,
-        ) = self._should_skip_clone_only_execution()
-        if should_skip_clone_only_execution:
-            self._logger(skip_reason, system_msg=True)
-            return
-        self._metadata.start_run_heartbeat(self._flow.name, self._run_id)
-
-        if self._params_task:
-            self._queue_push("start", {"input_paths": [self._params_task.path]})
+        if len(self._cloned_tasks) > 0:
+            # mutable list storing the cloned tasks.
+            self._run_queue = []
+            self._active_tasks[0] = 0
         else:
-            self._queue_push("start", {})
+            if self._params_task:
+                self._queue_push("start", {"input_paths": [self._params_task.path]})
+            else:
+                self._queue_push("start", {})
 
         progress_tstamp = time.time()
-        try:
-            # main scheduling loop
-            exception = None
-            while self._run_queue or self._active_tasks[0] > 0:
-
-                # 1. are any of the current workers finished?
-                finished_tasks = list(self._poll_workers())
-                # 2. push new tasks triggered by the finished tasks to the queue
-                self._queue_tasks(finished_tasks)
-                # 3. if there are available worker slots, pop and start tasks
-                #    from the queue.
-                self._launch_workers()
-
-                if time.time() - progress_tstamp > PROGRESS_INTERVAL:
-                    progress_tstamp = time.time()
-                    tasks_print = ", ".join(
-                        [
-                            "%s (%d running; %d done)" % (k, v[0], v[1])
-                            for k, v in self._active_tasks.items()
-                            if k != 0 and v[0] > 0
-                        ]
-                    )
-                    if self._active_tasks[0] == 0:
-                        msg = "No tasks are running."
-                    else:
-                        if self._active_tasks[0] == 1:
-                            msg = "1 task is running: "
-                        else:
-                            msg = "%d tasks are running: " % self._active_tasks[0]
-                        msg += "%s." % tasks_print
-
-                    self._logger(msg, system_msg=True)
+        with tempfile.NamedTemporaryFile(mode="w", encoding="utf-8") as config_file:
+            # Configurations are passed through a file to avoid overloading the
+            # command-line. We only need to create this file once and it can be reused
+            # for any task launch
+            config_value = dump_config_values(self._flow)
+            if config_value:
+                json.dump(config_value, config_file)
+                config_file.flush()
+                self._config_file_name = config_file.name
+            else:
+                self._config_file_name = None
+            try:
+                # main scheduling loop
+                exception = None
+                while (
+                    self._run_queue or self._active_tasks[0] > 0 or self._cloned_tasks
+                ):
+                    # 1. are any of the current workers finished?
+                    if self._cloned_tasks:
+                        finished_tasks = []
+
+                        # For loops (right now just recursive steps), we need to find
+                        # the exact frontier because if we queue all "successors" to all
+                        # the finished iterations, we would incorrectly launch multiple
+                        # successors. We therefore have to strip out all non-last
+                        # iterations *per* foreach branch.
+                        idx_per_finished_id = (
+                            {}
+                        )  # type: Dict[Tuple[str, Tuple[int, ...], Tuple[int, Tuple[int, ...]]]]
+                        for task in self._cloned_tasks:
+                            step_name, foreach_stack, iteration_stack = task.finished_id
+                            existing_task_idx = idx_per_finished_id.get(
+                                (step_name, foreach_stack), None
+                            )
+                            if existing_task_idx is not None:
+                                len_diff = len(iteration_stack) - len(
+                                    existing_task_idx[1]
+                                )
+                                # In this case, we need to keep only the latest iteration
+                                if (
+                                    len_diff == 0
+                                    and iteration_stack > existing_task_idx[1]
+                                ) or len_diff == -1:
+                                    # We remove the one we currently have and replace
+                                    # by this one. The second option means that we are
+                                    # adding the finished iteration marker.
+                                    existing_task = finished_tasks[existing_task_idx[0]]
+                                    # These are the first two lines of _queue_tasks
+                                    # We still consider the tasks finished so we need
+                                    # to update state to be clean.
+                                    self._finished[existing_task.finished_id] = (
+                                        existing_task.path
+                                    )
+                                    self._is_cloned[existing_task.path] = (
+                                        existing_task.is_cloned
+                                    )
+
+                                    finished_tasks[existing_task_idx[0]] = task
+                                    idx_per_finished_id[(step_name, foreach_stack)] = (
+                                        existing_task_idx[0],
+                                        iteration_stack,
+                                    )
+                                elif (
+                                    len_diff == 0
+                                    and iteration_stack < existing_task_idx[1]
+                                ) or len_diff == 1:
+                                    # The second option is when we have already marked
+                                    # the end of the iteration in self._finished and
+                                    # are now seeing a previous iteration.
+                                    # We just mark the task as finished but we don't
+                                    # put it in the finished_tasks list to pass to
+                                    # the _queue_tasks function
+                                    self._finished[task.finished_id] = task.path
+                                    self._is_cloned[task.path] = task.is_cloned
+                                else:
+                                    raise MetaflowInternalError(
+                                        "Unexpected recursive cloned tasks -- "
+                                        "this is a bug, please report it."
+                                    )
+                            else:
+                                # New entry
+                                finished_tasks.append(task)
+                                idx_per_finished_id[(step_name, foreach_stack)] = (
+                                    len(finished_tasks) - 1,
+                                    iteration_stack,
+                                )
 
-                if len(self._run_queue) == 0:
-                    msg = "No tasks are waiting in the queue."
+                        # reset the list of cloned tasks and let poll_workers handle
+                        # the remaining transition
+                        self._cloned_tasks = []
                     else:
-                    if len(self._run_queue) == 1:
-                        msg = "1 task is waiting in the queue: "
+                        finished_tasks = list(self._poll_workers())
+                    # 2. push new tasks triggered by the finished tasks to the queue
+                    self._queue_tasks(finished_tasks)
+                    # 3. if there are available worker slots, pop and start tasks
+                    #    from the queue.
+                    self._launch_workers()
+
+                    if time.time() - progress_tstamp > PROGRESS_INTERVAL:
+                        progress_tstamp = time.time()
+                        tasks_print = ", ".join(
+                            [
+                                "%s (%d running; %d done)" % (k, v[0], v[1])
+                                for k, v in self._active_tasks.items()
+                                if k != 0 and v[0] > 0
+                            ]
+                        )
+                        if self._active_tasks[0] == 0:
+                            msg = "No tasks are running."
                         else:
-                    msg = "%d tasks are waiting in the queue." % len(
-                        self._run_queue
-                    )
+                            if self._active_tasks[0] == 1:
+                                msg = "1 task is running: "
+                            else:
+                                msg = "%d tasks are running: " % self._active_tasks[0]
+                            msg += "%s." % tasks_print
 
-                self._logger(msg, system_msg=True)
-                if len(self._unprocessed_steps) > 0:
-                    if len(self._unprocessed_steps) == 1:
-                        msg = "%s step has not started" % (
-                            next(iter(self._unprocessed_steps)),
-                        )
-                    else:
-                        msg = "%d steps have not started: " % len(
-                            self._unprocessed_steps
-                        )
-                    msg += "%s." % ", ".join(self._unprocessed_steps)
                         self._logger(msg, system_msg=True)
 
-        except KeyboardInterrupt as ex:
-            self._logger("Workflow interrupted.", system_msg=True, bad=True)
-            self._killall()
-            exception = ex
-            raise
-        except Exception as ex:
-            self._logger("Workflow failed.", system_msg=True, bad=True)
-            self._killall()
-            exception = ex
-            raise
-        finally:
-            # on finish clean tasks
-            for step in self._flow:
-                for deco in step.decorators:
-                    deco.runtime_finished(exception)
+                        if len(self._run_queue) == 0:
+                            msg = "No tasks are waiting in the queue."
+                        else:
+                            if len(self._run_queue) == 1:
+                                msg = "1 task is waiting in the queue: "
+                            else:
+                                msg = "%d tasks are waiting in the queue." % len(
+                                    self._run_queue
+                                )
 
-        self._metadata.stop_heartbeat()
+                        self._logger(msg, system_msg=True)
+                        if len(self._unprocessed_steps) > 0:
+                            if len(self._unprocessed_steps) == 1:
+                                msg = "%s step has not started" % (
+                                    next(iter(self._unprocessed_steps)),
+                                )
+                            else:
+                                msg = "%d steps have not started: " % len(
+                                    self._unprocessed_steps
+                                )
+                            msg += "%s." % ", ".join(self._unprocessed_steps)
+                            self._logger(msg, system_msg=True)
+
+            except KeyboardInterrupt as ex:
+                self._logger("Workflow interrupted.", system_msg=True, bad=True)
+                self._killall()
+                exception = ex
+                raise
+            except Exception as ex:
+                self._logger("Workflow failed.", system_msg=True, bad=True)
+                self._killall()
+                exception = ex
+                raise
+            finally:
+                # on finish clean tasks
+                if not self._skip_decorator_hooks:
+                    for step in self._flow:
+                        for deco in step.decorators:
+                            deco.runtime_finished(exception)
+                self._run_exit_hooks()
 
         # assert that end was executed and it was successful
-        if ("end", ()) in self._finished:
+        if ("end", (), ()) in self._finished:
             if self._run_url:
                 self._logger(
                     "Done! See the run in the UI at %s" % self._run_url,
@@ -406,6 +938,51 @@ class NativeRuntime(object):
                 "The *end* step was not successful by the end of flow."
             )
 
+    def _run_exit_hooks(self):
+        try:
+            flow_decos = self._flow._flow_state[FlowStateItems.FLOW_DECORATORS]
+            exit_hook_decos = flow_decos.get("exit_hook", [])
+            if not exit_hook_decos:
+                return
+
+            successful = ("end", (), ()) in self._finished or self._clone_only
+            pathspec = f"{self._graph.name}/{self._run_id}"
+            flow_file = self._environment.get_environment_info()["script"]
+
+            def _call(fn_name):
+                try:
+                    result = (
+                        subprocess.check_output(
+                            args=[
+                                sys.executable,
+                                "-m",
+                                "metaflow.plugins.exit_hook.exit_hook_script",
+                                flow_file,
+                                fn_name,
+                                pathspec,
+                            ],
+                            env=os.environ,
+                        )
+                        .decode()
+                        .strip()
+                    )
+                    print(result)
+                except subprocess.CalledProcessError as e:
+                    print(f"[exit_hook] Hook '{fn_name}' failed with error: {e}")
+                except Exception as e:
+                    print(f"[exit_hook] Unexpected error in hook '{fn_name}': {e}")
+
+            # Call all exit hook functions regardless of individual failures
+            for fn_name in [
+                name
+                for deco in exit_hook_decos
+                for name in (deco.success_hooks if successful else deco.error_hooks)
+            ]:
+                _call(fn_name)
+
+        except Exception as ex:
+            pass  # do not fail due to exit hooks for whatever reason.
+
     def _killall(self):
         # If we are here, all children have received a signal and are shutting down.
         # We want to give them an opportunity to do so and then kill
@@ -434,9 +1011,88 @@ class NativeRuntime(object):
         for _ in range(3):
             list(self._poll_workers())
 
+    # Given the current task information (task_index), the type of transition,
+    # and the split index, return the new task index.
+    def _translate_index(
+        self, task, next_step, type, split_index=None, loop_mode=LoopBehavior.NONE
+    ):
+        match = re.match(r"^(.+)\[(.*)\]\[(.*)\]$", task.task_index)
+        old_match = re.match(r"^(.+)\[(.*)\]$", task.task_index)
+        if match:
+            _, foreach_index, iteration_index = match.groups()
+            # Convert foreach_index to a list of integers
+            if len(foreach_index) > 0:
+                foreach_index = foreach_index.split(",")
+            else:
+                foreach_index = []
+            # Ditto for iteration_index
+            if len(iteration_index) > 0:
+                iteration_index = iteration_index.split(",")
+            else:
+                iteration_index = []
+        elif old_match:
+            _, foreach_index = old_match.groups()
+            # Convert foreach_index to a list of integers
+            if len(foreach_index) > 0:
+                foreach_index = foreach_index.split(",")
+            else:
+                foreach_index = []
+            # Legacy case fallback. No iteration index exists for these runs.
+            iteration_index = []
+        else:
+            raise ValueError(
+                "Index not in the format of {run_id}/{step_name}[{foreach_index}][{iteration_index}]"
+            )
+        if loop_mode == LoopBehavior.NONE:
+            # Check if we are entering a looping construct. Right now, only recursive
+            # steps are looping constructs
+            next_step_node = self._graph[next_step]
+            if (
+                next_step_node.type == "split-switch"
+                and next_step in next_step_node.out_funcs
+            ):
+                loop_mode = LoopBehavior.ENTERING
+
+        # Update iteration_index
+        if loop_mode == LoopBehavior.ENTERING:
+            # We are entering a loop, so we add a new iteration level
+            iteration_index.append("0")
+        elif loop_mode == LoopBehavior.EXITING:
+            iteration_index = iteration_index[:-1]
+        elif loop_mode == LoopBehavior.LOOPING:
+            if len(iteration_index) == 0:
+                raise MetaflowInternalError(
+                    "In looping mode but there is no iteration index"
+                )
+            iteration_index[-1] = str(int(iteration_index[-1]) + 1)
+        iteration_index = ",".join(iteration_index)
+
+        if type == "linear":
+            return "%s[%s][%s]" % (next_step, ",".join(foreach_index), iteration_index)
+        elif type == "join":
+            indices = []
+            if len(foreach_index) > 0:
+                indices = foreach_index[:-1]
+            return "%s[%s][%s]" % (next_step, ",".join(indices), iteration_index)
+        elif type == "split":
+            foreach_index.append(str(split_index))
+            return "%s[%s][%s]" % (next_step, ",".join(foreach_index), iteration_index)
+
     # Store the parameters needed for task creation, so that pushing on items
     # onto the run_queue is an inexpensive operation.
-    def _queue_push(self, step, task_kwargs):
+    def _queue_push(self, step, task_kwargs, index=None):
+        # In the case of cloning, we set all the cloned tasks as the
+        # finished tasks when pushing tasks using _queue_tasks. This means that we
+        # could potentially try to push the same task multiple times (for example
+        # if multiple parents of a join are cloned). We therefore keep track of what
+        # has executed (been cloned) or what has been scheduled and avoid scheduling
+        # it again.
+        if index:
+            if index in self._ran_or_scheduled_task_index:
+                # It has already run or been scheduled
+                return
+            # Note that we are scheduling this to run
+            self._ran_or_scheduled_task_index.add(index)
         self._run_queue.insert(0, (step, task_kwargs))
         # For foreaches, this will happen multiple time but is ok, becomes a no-op
         self._unprocessed_steps.discard(step)
@@ -495,34 +1151,28 @@ class NativeRuntime(object):
                     )
                 num_splits = len(mapper_tasks)
                 self._control_num_splits[task.path] = num_splits
-                if task.is_cloned:
-                    # Add mapper tasks to be cloned.
-                    for i in range(num_splits):
-                        # NOTE: For improved robustness, introduce
-                        # `clone_options` as an enum so that we can force that
-                        # clone must occur for this task.
-                        self._queue_push(
-                            task.step,
-                            {
-                                "input_paths": task.input_paths,
-                                "split_index": str(i),
-                                "ubf_context": UBF_TASK,
-                            },
-                        )
-                else:
-                    # Update _finished since these tasks were successfully
-                    # run elsewhere so that join will be unblocked.
-                    _, foreach_stack = task.finished_id
+
+                # If the control task is cloned, all mapper tasks should have been cloned
+                # as well, so we no longer need to handle cloning of mapper tasks in runtime.
+
+                # Update _finished if we are not cloned. If we were cloned, we already
+                # updated _finished with the new tasks. Note that the *value* of mapper
+                # tasks is incorrect and contains the pathspec of the *cloned* run
+                # but we don't use it for anything. We could look to clean it up though
+                if not task.is_cloned:
+                    _, foreach_stack, iteration_stack = task.finished_id
                     top = foreach_stack[-1]
                     bottom = list(foreach_stack[:-1])
                     for i in range(num_splits):
                         s = tuple(bottom + [top._replace(index=i)])
-                        self._finished[(task.step, s)] = mapper_tasks[i]
+                        self._finished[(task.step, s, iteration_stack)] = mapper_tasks[
+                            i
+                        ]
                         self._is_cloned[mapper_tasks[i]] = False
 
             # Find and check status of control task and retrieve its pathspec
             # for retrieving unbounded foreach cardinality.
-            _, foreach_stack = task.finished_id
+            _, foreach_stack, iteration_stack = task.finished_id
             top = foreach_stack[-1]
             bottom = list(foreach_stack[:-1])
             s = tuple(bottom + [top._replace(index=None)])
@@ -531,7 +1181,7 @@ class NativeRuntime(object):
            # it will have index=0 instead of index=None.
            if task.results.get("_control_task_is_mapper_zero", False):
                s = tuple(bottom + [top._replace(index=0)])
-            control_path = self._finished.get((task.step, s))
+            control_path = self._finished.get((task.step, s, iteration_stack))
            if control_path:
                # Control task was successful.
                # Additionally check the state of (sibling) mapper tasks as well
@@ -540,21 +1190,27 @@ class NativeRuntime(object):
                required_tasks = []
                for i in range(num_splits):
                    s = tuple(bottom + [top._replace(index=i)])
-                    required_tasks.append(self._finished.get((task.step, s)))
+                    required_tasks.append(
+                        self._finished.get((task.step, s, iteration_stack))
+                    )
 
                if all(required_tasks):
+                    index = self._translate_index(task, next_step, "join")
                    # all tasks to be joined are ready. Schedule the next join step.
                    self._queue_push(
                        next_step,
                        {"input_paths": required_tasks, "join_type": "foreach"},
+                        index,
                    )
        else:
            # matching_split is the split-parent of the finished task
            matching_split = self._graph[self._graph[next_step].split_parents[-1]]
-            _, foreach_stack = task.finished_id
+            _, foreach_stack, iteration_stack = task.finished_id
 
+            direct_parents = set(self._graph[next_step].in_funcs)
+
+            # next step is a foreach join
            if matching_split.type == "foreach":
-                # next step is a foreach join
 
                def siblings(foreach_stack):
                    top = foreach_stack[-1]
@@ -563,27 +1219,57 @@ class NativeRuntime(object):
                         yield tuple(bottom + [top._replace(index=index)])
 
                 # required tasks are all split-siblings of the finished task
-                required_tasks = [
-                    self._finished.get((task.step, s)) for s in siblings(foreach_stack)
-                ]
+                required_tasks = list(
+                    filter(
+                        lambda x: x is not None,
+                        [
+                            self._finished.get((p, s, iteration_stack))
+                            for p in direct_parents
+                            for s in siblings(foreach_stack)
+                        ],
+                    )
+                )
+                required_count = task.finished_id[1][-1].num_splits
                 join_type = "foreach"
+                index = self._translate_index(task, next_step, "join")
             else:
                 # next step is a split
-                # required tasks are all branches joined by the next step
-                required_tasks = [
-                    self._finished.get((step, foreach_stack))
-                    for step in self._graph[next_step].in_funcs
-                ]
-                join_type = "linear"
+                required_tasks = list(
+                    filter(
+                        lambda x: x is not None,
+                        [
+                            self._finished.get((p, foreach_stack, iteration_stack))
+                            for p in direct_parents
+                        ],
+                    )
+                )
 
-            if all(required_tasks):
-                # all tasks to be joined are ready. Schedule the next join step.
+                required_count = len(matching_split.out_funcs)
+                join_type = "linear"
+                index = self._translate_index(task, next_step, "linear")
+            if len(required_tasks) == required_count:
+                # We have all the required previous tasks to schedule a join
                 self._queue_push(
-                    next_step, {"input_paths": required_tasks, "join_type": join_type}
+                    next_step,
+                    {"input_paths": required_tasks, "join_type": join_type},
+                    index,
                 )
 
-    def _queue_task_foreach(self, task, next_steps):
+    def _queue_task_switch(self, task, next_steps, is_recursive):
+        chosen_step = next_steps[0]
+
+        loop_mode = LoopBehavior.NONE
+        if is_recursive:
+            if chosen_step != task.step:
+                # We are exiting a loop
+                loop_mode = LoopBehavior.EXITING
+            else:
+                # We are staying in the loop
+                loop_mode = LoopBehavior.LOOPING
+        index = self._translate_index(task, chosen_step, "linear", None, loop_mode)
+        self._queue_push(chosen_step, {"input_paths": [task.path]}, index)
 
+    def _queue_task_foreach(self, task, next_steps):
         # CHECK: this condition should be enforced by the linter but
         # let's assert that the assumption holds
         if len(next_steps) > 1:
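`LoopBehavior` is referenced here but defined elsewhere in the new runtime; only its three states are visible in this hunk. A minimal sketch of the decision logic, assuming `LoopBehavior` is a plain enum (the actual definition is outside this diff):

    from enum import Enum, auto

    class LoopBehavior(Enum):
        # Assumed shape; mirrors the three states _queue_task_switch uses.
        NONE = auto()     # not a recursive switch
        LOOPING = auto()  # the switch chose itself again: stay in the loop
        EXITING = auto()  # the switch chose another step: leave the loop

    def classify(chosen_step, current_step, is_recursive):
        # Same branching as _queue_task_switch above.
        if not is_recursive:
            return LoopBehavior.NONE
        if chosen_step == current_step:
            return LoopBehavior.LOOPING
        return LoopBehavior.EXITING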
@@ -601,6 +1287,12 @@ class NativeRuntime(object):
             # Need to push control process related task.
             ubf_iter_name = task.results.get("_foreach_var")
             ubf_iter = task.results.get(ubf_iter_name)
+            # The UBF control task has no split index, hence None as a placeholder.
+
+            if task.results.get("_control_task_is_mapper_zero", False):
+                index = self._translate_index(task, next_step, "split", 0)
+            else:
+                index = self._translate_index(task, next_step, "split", None)
             self._queue_push(
                 next_step,
                 {
@@ -608,6 +1300,7 @@ class NativeRuntime(object):
                     "ubf_context": UBF_CONTROL,
                     "ubf_iter": ubf_iter,
                 },
+                index,
             )
         else:
             num_splits = task.results["_foreach_num_splits"]
@@ -627,8 +1320,11 @@ class NativeRuntime(object):
 
             # schedule all splits
             for i in range(num_splits):
+                index = self._translate_index(task, next_step, "split", i)
                 self._queue_push(
-                    next_step, {"split_index": str(i), "input_paths": [task.path]}
+                    next_step,
+                    {"split_index": str(i), "input_paths": [task.path]},
+                    index,
                 )
 
     def _queue_tasks(self, finished_tasks):
@@ -649,7 +1345,39 @@ class NativeRuntime(object):
                 next_steps = []
                 foreach = None
             expected = self._graph[task.step].out_funcs
-            if next_steps != expected:
+
+            if self._graph[task.step].type == "split-switch":
+                is_recursive = task.step in self._graph[task.step].out_funcs
+                if len(next_steps) != 1:
+                    msg = (
+                        "Switch step *{step}* should transition to exactly "
+                        "one step at runtime, but got: {actual}"
+                    )
+                    raise MetaflowInternalError(
+                        msg.format(step=task.step, actual=", ".join(next_steps))
+                    )
+                if next_steps[0] not in expected:
+                    msg = (
+                        "Switch step *{step}* transitioned to unexpected "
+                        "step *{actual}*. Expected one of: {expected}"
+                    )
+                    raise MetaflowInternalError(
+                        msg.format(
+                            step=task.step,
+                            actual=next_steps[0],
+                            expected=", ".join(expected),
+                        )
+                    )
+                # When exiting a recursive loop, we mark that the loop itself has
+                # finished by adding a special entry in self._finished which has
+                # an iteration stack that is shorter (i.e., we are out of the loop) so
+                # that we can then find it when looking at successor tasks to launch.
+                if is_recursive and next_steps[0] != task.step:
+                    step_name, finished_tuple, iteration_tuple = task.finished_id
+                    self._finished[
+                        (step_name, finished_tuple, iteration_tuple[:-1])
+                    ] = task.path
+            elif next_steps != expected:
                 msg = (
                     "Based on static analysis of the code, step *{step}* "
                     "was expected to transition to step(s) *{expected}*. "
@@ -673,10 +1401,14 @@ class NativeRuntime(object):
             elif foreach:
                 # Next step is a foreach child
                 self._queue_task_foreach(task, next_steps)
+            elif self._graph[task.step].type == "split-switch":
+                # The current step is a switch: queue the chosen step.
+                self._queue_task_switch(task, next_steps, is_recursive)
             else:
                 # Next steps are normal linear steps
                 for step in next_steps:
-                    self._queue_push(step, {"input_paths": [task.path]})
+                    index = self._translate_index(task, step, "linear")
+                    self._queue_push(step, {"input_paths": [task.path]}, index)
 
     def _poll_workers(self):
         if self._workers:
@@ -728,6 +1460,22 @@ class NativeRuntime(object):
             # Initialize the task (which can be expensive using remote datastores)
             # before launching the worker so that cost is amortized over time, instead
             # of doing it during _queue_push.
+            if (
+                FEAT_ALWAYS_UPLOAD_CODE_PACKAGE
+                and "METAFLOW_CODE_SHA" not in os.environ
+            ):
+                # We check if the code package is uploaded and, if so, we set the
+                # environment variables that will cause the metadata service to
+                # register the code package with the task created in _new_task below
+                code_sha = self._package.package_sha(timeout=0.01)
+                if code_sha:
+                    os.environ["METAFLOW_CODE_SHA"] = code_sha
+                    os.environ["METAFLOW_CODE_URL"] = self._package.package_url()
+                    os.environ["METAFLOW_CODE_DS"] = self._flow_datastore.TYPE
+                    os.environ["METAFLOW_CODE_METADATA"] = (
+                        self._package.package_metadata
+                    )
+
             task = self._new_task(step, **task_kwargs)
             self._launch_worker(task)
 
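Because `Worker._launch` (later in this diff) builds the subprocess environment from a copy of `os.environ`, setting these variables once in the scheduling loop makes them visible to every subsequently launched task. A self-contained sketch of that inheritance, with a placeholder value:

    import os
    import subprocess

    # Placeholder; in the runtime this comes from self._package.package_sha(...).
    os.environ["METAFLOW_CODE_SHA"] = "example-sha"

    # Children spawned with a copy of os.environ see the variable.
    env = dict(os.environ)
    out = subprocess.run(
        ["python", "-c", "import os; print(os.environ['METAFLOW_CODE_SHA'])"],
        env=env,
        capture_output=True,
        text=True,
    )
    assert out.stdout.strip() == "example-sha"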
@@ -755,7 +1503,7 @@ class NativeRuntime(object):
             )
             return
 
-        worker = Worker(task, self._max_log_size)
+        worker = Worker(task, self._max_log_size, self._config_file_name)
         for fd in worker.fds():
             self._workers[fd] = worker
             self._poll.add(fd)
@@ -797,9 +1545,10 @@ class Task(object):
         join_type=None,
         task_id=None,
         resume_identifier=None,
+        pathspec_index=None,
     ):
-
         self.step = step
+        self.flow = flow
         self.flow_name = flow.name
         self.run_id = run_id
         self.task_id = None
@@ -839,10 +1588,9 @@ class Task(object):
         self._is_resume_leader = None
         self._resume_done = None
         self._resume_identifier = resume_identifier
-
         origin = None
         if clone_run_id and may_clone:
-            origin = self._find_origin_task(clone_run_id, join_type)
+            origin = self._find_origin_task(clone_run_id, join_type, pathspec_index)
         if origin and origin["_task_ok"]:
             # At this point, we know we are going to clone
             self._is_cloned = True
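`pathspec_index` is a new way for the runtime to pin the clone origin directly instead of re-deriving it from `join_type`; the `_parameters` branch of `_find_origin_task` (later in this diff) shows the expected format, with an empty bracket for top-level steps. A hedged illustration with hypothetical values:

    clone_run_id = "42"  # hypothetical origin run id

    # Top-level step: empty foreach index, exactly as in the _parameters branch.
    params_index = "%s/_parameters[]" % clone_run_id   # "42/_parameters[]"

    # Assumed shape for a step nested inside foreaches (comma-separated indices):
    nested_index = "%s/train[0,3]" % clone_run_id      # "42/train[0,3]"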
@@ -934,8 +1682,7 @@ class Task(object):
                 # To avoid the edge case where the resume leader is selected but has not
                 # yet written the _resume_leader metadata, we will wait for a few seconds.
                 # We will wait for the resume leader at most 3 times.
-                for resume_leader_wait_retry in range(3):
-
+                for _ in range(3):
                     if ds.has_metadata("_resume_leader", add_attempt=False):
                         resume_leader = ds.load_metadata(
                             ["_resume_leader"], add_attempt=False
@@ -964,10 +1711,11 @@ class Task(object):
                     )
 
                 if self._is_resume_leader:
-                    self.log(
-                        "Selected as the reentrant clone leader.",
-                        system_msg=True,
-                    )
+                    if reentrant:
+                        self.log(
+                            "Selected as the reentrant clone leader.",
+                            system_msg=True,
+                        )
                     # Clone in place without relying on run_queue.
                     self.new_attempt()
                     self._ds.clone(origin)
@@ -1016,13 +1764,13 @@ class Task(object):
                 self._should_skip_cloning = task_completed
                 if self._should_skip_cloning:
                     self.log(
-                        "Skip cloning of previously run task %s" % self.clone_origin,
+                        "Skipping cloning of previously run task %s"
+                        % self.clone_origin,
                         system_msg=True,
                     )
                 else:
                     self.log(
-                        "Cloning results of a previously run task %s"
-                        % self.clone_origin,
+                        "Cloning previously run task %s" % self.clone_origin,
                         system_msg=True,
                     )
             else:
@@ -1035,7 +1783,6 @@ class Task(object):
         # Open the output datastore only if the task is not being cloned.
         if not self._is_cloned:
             self.new_attempt()
-
         for deco in decos:
             deco.runtime_task_created(
                 self._ds,
@@ -1112,63 +1859,34 @@ class Task(object):
 
     def _get_task_id(self, task_id):
         already_existed = True
+        tags = []
         if self.ubf_context == UBF_CONTROL:
-            [input_path] = self.input_paths
-            run, input_step, input_task = input_path.split("/")
-            # We associate the control task-id to be 1:1 with the split node
-            # where the unbounded-foreach was defined.
-            # We prefer encoding the corresponding split into the task_id of
-            # the control node; so it has access to this information quite
-            # easily. There is anyway a corresponding int id stored in the
-            # metadata backend - so this should be fine.
-            task_id = "control-%s-%s-%s" % (run, input_step, input_task)
-        # Register only regular Metaflow (non control) tasks.
+            tags = [CONTROL_TASK_TAG]
+        # Register Metaflow tasks.
         if task_id is None:
-            task_id = str(self.metadata.new_task_id(self.run_id, self.step))
+            task_id = str(
+                self.metadata.new_task_id(self.run_id, self.step, sys_tags=tags)
+            )
             already_existed = False
         else:
-            # task_id is preset only by persist_constants() or control tasks.
-            if self.ubf_context == UBF_CONTROL:
-                tags = [CONTROL_TASK_TAG]
-                attempt_id = 0
-                already_existed = not self.metadata.register_task_id(
-                    self.run_id,
-                    self.step,
-                    task_id,
-                    attempt_id,
-                    sys_tags=tags,
-                )
-                # A Task's tags are now those of its ancestral Run, so we are not able
-                # to rely on a task's tags to indicate the presence of a control task
-                # so, on top of adding the tags above, we also add a task metadata
-                # entry indicating that this is a "control task".
-                #
-                # Here we will also add a task metadata entry to indicate "control task".
-                # Within the metaflow repo, the only dependency of such a "control task"
-                # indicator is in the integration test suite (see Step.control_tasks() in
-                # client API).
-                task_metadata_list = [
-                    MetaDatum(
-                        field="internal_task_type",
-                        value=CONTROL_TASK_TAG,
-                        type="internal_task_type",
-                        tags=["attempt_id:{0}".format(attempt_id)],
-                    )
-                ]
-                self.metadata.register_metadata(
-                    self.run_id, self.step, task_id, task_metadata_list
-                )
-            else:
-                already_existed = not self.metadata.register_task_id(
-                    self.run_id, self.step, task_id, 0
-                )
+            # task_id is preset only by persist_constants().
+            already_existed = not self.metadata.register_task_id(
+                self.run_id,
+                self.step,
+                task_id,
+                0,
+                sys_tags=tags,
+            )
 
         self.task_id = task_id
         self._path = "%s/%s/%s" % (self.run_id, self.step, self.task_id)
         return already_existed
 
-    def _find_origin_task(self, clone_run_id, join_type):
-        if self.step == "_parameters":
+    def _find_origin_task(self, clone_run_id, join_type, pathspec_index=None):
+        if pathspec_index:
+            origin = self.origin_ds_set.get_with_pathspec_index(pathspec_index)
+            return origin
+        elif self.step == "_parameters":
             pathspec = "%s/_parameters[]" % clone_run_id
             origin = self.origin_ds_set.get_with_pathspec_index(pathspec)
 
@@ -1218,16 +1936,23 @@ class Task(object):
         )
         return self._results_ds
 
+    @property
+    def task_index(self):
+        _, task_index = self.results.pathspec_index.split("/")
+        return task_index
+
     @property
     def finished_id(self):
         # note: id is not available before the task has finished.
-        # Index already identifies the task within the foreach,
-        # we will remove foreach value so that it is easier to
+        # Index already identifies the task within the foreach and loop.
+        # We will remove the foreach value so that it is easier to
         # identify siblings within a foreach.
         foreach_stack_tuple = tuple(
             [s._replace(value=0) for s in self.results["_foreach_stack"]]
         )
-        return (self.step, foreach_stack_tuple)
+        # _iteration_stack requires a fallback, as it does not exist for runs before v2.17.4
+        iteration_stack_tuple = tuple(self.results.get("_iteration_stack", []))
+        return (self.step, foreach_stack_tuple, iteration_stack_tuple)
 
     @property
     def is_cloned(self):
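A small example of the shape change in `finished_id` (the foreach frame is simplified to the three fields used elsewhere in this diff; real frames carry more):

    from collections import namedtuple

    # Assumed minimal frame shape; real _foreach_stack entries have more fields.
    Frame = namedtuple("Frame", ["index", "value", "num_splits"])

    # The old finished_id was (step, foreach_stack); the new one appends the
    # iteration stack, which falls back to () for pre-2.17.4 runs that have
    # no _iteration_stack artifact.
    old_id = ("join_step", (Frame(index=2, value=0, num_splits=3),))
    new_id = ("join_step", (Frame(index=2, value=0, num_splits=3),), ())
    assert new_id[:2] == old_id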
@@ -1301,9 +2026,29 @@ class CLIArgs(object):
     for step execution in StepDecorator.runtime_step_cli().
     """
 
-    def __init__(self, task):
+    def __init__(
+        self,
+        task,
+        orig_flow_datastore=None,
+        spin_pathspec=None,
+        artifacts_module=None,
+        persist=True,
+        skip_decorators=False,
+    ):
         self.task = task
+        if orig_flow_datastore is not None:
+            self.orig_flow_datastore = "%s@%s" % (
+                orig_flow_datastore.TYPE,
+                orig_flow_datastore.datastore_root,
+            )
+        else:
+            self.orig_flow_datastore = None
+        self.spin_pathspec = spin_pathspec
+        self.artifacts_module = artifacts_module
+        self.persist = persist
+        self.skip_decorators = skip_decorators
         self.entrypoint = list(task.entrypoint)
+        step_obj = getattr(self.task.flow, self.task.step)
         self.top_level_options = {
             "quiet": True,
             "metadata": self.task.metadata_type,
@@ -1315,38 +2060,77 @@ class CLIArgs(object):
             "datastore-root": self.task.datastore_sysroot,
             "with": [
                 deco.make_decorator_spec()
-                for deco in self.task.decos
-                if not deco.statically_defined
+                for deco in chain(
+                    self.task.decos,
+                    step_obj.wrappers,
+                    step_obj.config_decorators,
+                )
+                if not deco.statically_defined and deco.inserted_by is None
             ],
         }
 
         # FlowDecorators can define their own top-level options. They are
         # responsible for adding their own top-level options and values through
         # the get_top_level_options() hook.
-        for deco in flow_decorators():
+        for deco in flow_decorators(self.task.flow):
             self.top_level_options.update(deco.get_top_level_options())
 
+        # We also pass configuration options using the kv.<name> syntax which will cause
+        # the configuration options to be loaded from the CONFIG file (or local-config-file
+        # in the case of the local runtime)
+        configs = self.task.flow._flow_state[FlowStateItems.CONFIGS]
+        if configs:
+            self.top_level_options["config-value"] = [
+                (k, ConfigInput.make_key_name(k)) for k in configs
+            ]
+
+        if spin_pathspec:
+            self.spin_args()
+        else:
+            self.default_args()
+
+    def default_args(self):
         self.commands = ["step"]
         self.command_args = [self.task.step]
         self.command_options = {
-            "run-id": task.run_id,
-            "task-id": task.task_id,
-            "input-paths": compress_list(task.input_paths),
-            "split-index": task.split_index,
-            "retry-count": task.retries,
-            "max-user-code-retries": task.user_code_retries,
-            "tag": task.tags,
+            "run-id": self.task.run_id,
+            "task-id": self.task.task_id,
+            "input-paths": compress_list(self.task.input_paths),
+            "split-index": self.task.split_index,
+            "retry-count": self.task.retries,
+            "max-user-code-retries": self.task.user_code_retries,
+            "tag": self.task.tags,
             "namespace": get_namespace() or "",
-            "ubf-context": task.ubf_context,
+            "ubf-context": self.task.ubf_context,
         }
         self.env = {}
 
-    def get_args(self):
+    def spin_args(self):
+        self.commands = ["spin-step"]
+        self.command_args = [self.task.step]
 
+        self.command_options = {
+            "run-id": self.task.run_id,
+            "task-id": self.task.task_id,
+            "input-paths": compress_list(self.task.input_paths),
+            "split-index": self.task.split_index,
+            "retry-count": self.task.retries,
+            "max-user-code-retries": self.task.user_code_retries,
+            "namespace": get_namespace() or "",
+            "orig-flow-datastore": self.orig_flow_datastore,
+            "artifacts-module": self.artifacts_module,
+            "skip-decorators": self.skip_decorators,
+        }
+        if self.persist:
+            self.command_options["persist"] = True
+        else:
+            self.command_options["no-persist"] = True
+        self.env = {}
+
+    def get_args(self):
         # TODO: Make one with dict_to_cli_options; see cli_args.py for more detail
         def _options(mapping):
             for k, v in mapping.items():
-
                 # None or False arguments are ignored
                 # v needs to be explicitly False, not falsy, e.g. 0 is an acceptable value
                 if v is None or v is False:
@@ -1361,12 +2145,15 @@ class CLIArgs(object):
                 for value in v:
                     yield "--%s" % k
                     if not isinstance(value, bool):
-                        yield to_unicode(value)
+                        value = value if isinstance(value, tuple) else (value,)
+                        for vv in value:
+                            yield to_unicode(vv)
 
         args = list(self.entrypoint)
         args.extend(_options(self.top_level_options))
         args.extend(self.commands)
         args.extend(self.command_args)
+
         args.extend(_options(self.command_options))
         return args
 
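The tuple handling added to `_options` is what lets list-valued options carry multi-token entries such as the `("name", "kv.name")` config-value pairs built in `CLIArgs.__init__` above. A standalone sketch of just that generator behavior (simplified to `str` instead of `to_unicode`):

    def _options(mapping):
        # Each value of a list-valued option gets its own --flag, and tuple
        # values expand to multiple tokens after the flag.
        for k, v in mapping.items():
            if v is None or v is False:
                continue
            values = v if isinstance(v, list) else [v]
            for value in values:
                yield "--%s" % k
                if not isinstance(value, bool):
                    value = value if isinstance(value, tuple) else (value,)
                    for vv in value:
                        yield str(vv)

    print(list(_options({"config-value": [("cfg", "kv.cfg")], "quiet": True})))
    # ['--config-value', 'cfg', 'kv.cfg', '--quiet']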
@@ -1378,8 +2165,24 @@ class CLIArgs(object):
 
 
 class Worker(object):
-    def __init__(self, task, max_logs_size):
+    def __init__(
+        self,
+        task,
+        max_logs_size,
+        config_file_name,
+        orig_flow_datastore=None,
+        spin_pathspec=None,
+        artifacts_module=None,
+        persist=True,
+        skip_decorators=False,
+    ):
         self.task = task
+        self._config_file_name = config_file_name
+        self._orig_flow_datastore = orig_flow_datastore
+        self._spin_pathspec = spin_pathspec
+        self._artifacts_module = artifacts_module
+        self._skip_decorators = skip_decorators
+        self._persist = persist
         self._proc = self._launch()
 
         if task.retries > task.user_code_retries:
@@ -1411,7 +2214,14 @@ class Worker(object):
         # not it is properly shut down)
 
     def _launch(self):
-        args = CLIArgs(self.task)
+        args = CLIArgs(
+            self.task,
+            orig_flow_datastore=self._orig_flow_datastore,
+            spin_pathspec=self._spin_pathspec,
+            artifacts_module=self._artifacts_module,
+            persist=self._persist,
+            skip_decorators=self._skip_decorators,
+        )
         env = dict(os.environ)
 
         if self.task.clone_run_id:
@@ -1431,6 +2241,12 @@ class Worker(object):
                 self.task.user_code_retries,
                 self.task.ubf_context,
             )
+
+        # Add user configurations using a file to avoid using up too much space on the
+        # command line
+        if self._config_file_name:
+            args.top_level_options["local-config-file"] = self._config_file_name
+        # Pass configuration options
         env.update(args.get_env())
         env["PYTHONUNBUFFERED"] = "x"
         tracing.inject_tracing_vars(env)
@@ -1438,6 +2254,7 @@ class Worker(object):
         # by read_logline() below that relies on readline() not blocking
         # print('running', args)
         cmdline = args.get_args()
+        from_start(f"Command line: {' '.join(cmdline)}")
         debug.subcommand_exec(cmdline)
         return subprocess.Popen(
             cmdline,
@@ -1560,13 +2377,14 @@ class Worker(object):
             else:
                 self.emit_log(b"Task failed.", self._stderr, system_msg=True)
         else:
-            num = self.task.results["_foreach_num_splits"]
-            if num:
-                self.task.log(
-                    "Foreach yields %d child steps." % num,
-                    system_msg=True,
-                    pid=self._proc.pid,
-                )
+            if not self._spin_pathspec:
+                num = self.task.results["_foreach_num_splits"]
+                if num:
+                    self.task.log(
+                        "Foreach yields %d child steps." % num,
+                        system_msg=True,
+                        pid=self._proc.pid,
+                    )
             self.task.log(
                 "Task finished successfully.", system_msg=True, pid=self._proc.pid
             )