ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.
Files changed (289)
  1. metaflow/R.py +10 -7
  2. metaflow/__init__.py +40 -25
  3. metaflow/_vendor/imghdr/__init__.py +186 -0
  4. metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
  5. metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
  6. metaflow/_vendor/importlib_metadata/_collections.py +30 -0
  7. metaflow/_vendor/importlib_metadata/_compat.py +71 -0
  8. metaflow/_vendor/importlib_metadata/_functools.py +104 -0
  9. metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
  10. metaflow/_vendor/importlib_metadata/_meta.py +48 -0
  11. metaflow/_vendor/importlib_metadata/_text.py +99 -0
  12. metaflow/_vendor/importlib_metadata/py.typed +0 -0
  13. metaflow/_vendor/typeguard/__init__.py +48 -0
  14. metaflow/_vendor/typeguard/_checkers.py +1070 -0
  15. metaflow/_vendor/typeguard/_config.py +108 -0
  16. metaflow/_vendor/typeguard/_decorators.py +233 -0
  17. metaflow/_vendor/typeguard/_exceptions.py +42 -0
  18. metaflow/_vendor/typeguard/_functions.py +308 -0
  19. metaflow/_vendor/typeguard/_importhook.py +213 -0
  20. metaflow/_vendor/typeguard/_memo.py +48 -0
  21. metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
  22. metaflow/_vendor/typeguard/_suppression.py +86 -0
  23. metaflow/_vendor/typeguard/_transformer.py +1229 -0
  24. metaflow/_vendor/typeguard/_union_transformer.py +55 -0
  25. metaflow/_vendor/typeguard/_utils.py +173 -0
  26. metaflow/_vendor/typeguard/py.typed +0 -0
  27. metaflow/_vendor/typing_extensions.py +3641 -0
  28. metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
  29. metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
  30. metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
  31. metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
  32. metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
  33. metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
  34. metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
  35. metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
  36. metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
  37. metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
  38. metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
  39. metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
  40. metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
  41. metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
  42. metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
  43. metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
  44. metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
  45. metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
  46. metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
  47. metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
  48. metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
  49. metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
  50. metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
  51. metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
  52. metaflow/_vendor/yaml/__init__.py +427 -0
  53. metaflow/_vendor/yaml/composer.py +139 -0
  54. metaflow/_vendor/yaml/constructor.py +748 -0
  55. metaflow/_vendor/yaml/cyaml.py +101 -0
  56. metaflow/_vendor/yaml/dumper.py +62 -0
  57. metaflow/_vendor/yaml/emitter.py +1137 -0
  58. metaflow/_vendor/yaml/error.py +75 -0
  59. metaflow/_vendor/yaml/events.py +86 -0
  60. metaflow/_vendor/yaml/loader.py +63 -0
  61. metaflow/_vendor/yaml/nodes.py +49 -0
  62. metaflow/_vendor/yaml/parser.py +589 -0
  63. metaflow/_vendor/yaml/reader.py +185 -0
  64. metaflow/_vendor/yaml/representer.py +389 -0
  65. metaflow/_vendor/yaml/resolver.py +227 -0
  66. metaflow/_vendor/yaml/scanner.py +1435 -0
  67. metaflow/_vendor/yaml/serializer.py +111 -0
  68. metaflow/_vendor/yaml/tokens.py +104 -0
  69. metaflow/cards.py +5 -0
  70. metaflow/cli.py +331 -785
  71. metaflow/cli_args.py +17 -0
  72. metaflow/cli_components/__init__.py +0 -0
  73. metaflow/cli_components/dump_cmd.py +96 -0
  74. metaflow/cli_components/init_cmd.py +52 -0
  75. metaflow/cli_components/run_cmds.py +546 -0
  76. metaflow/cli_components/step_cmd.py +334 -0
  77. metaflow/cli_components/utils.py +140 -0
  78. metaflow/client/__init__.py +1 -0
  79. metaflow/client/core.py +467 -73
  80. metaflow/client/filecache.py +75 -35
  81. metaflow/clone_util.py +7 -1
  82. metaflow/cmd/code/__init__.py +231 -0
  83. metaflow/cmd/develop/stub_generator.py +756 -288
  84. metaflow/cmd/develop/stubs.py +12 -28
  85. metaflow/cmd/main_cli.py +6 -4
  86. metaflow/cmd/make_wrapper.py +78 -0
  87. metaflow/datastore/__init__.py +1 -0
  88. metaflow/datastore/content_addressed_store.py +41 -10
  89. metaflow/datastore/datastore_set.py +11 -2
  90. metaflow/datastore/flow_datastore.py +156 -10
  91. metaflow/datastore/spin_datastore.py +91 -0
  92. metaflow/datastore/task_datastore.py +154 -39
  93. metaflow/debug.py +5 -0
  94. metaflow/decorators.py +404 -78
  95. metaflow/exception.py +8 -2
  96. metaflow/extension_support/__init__.py +527 -376
  97. metaflow/extension_support/_empty_file.py +2 -2
  98. metaflow/extension_support/plugins.py +49 -31
  99. metaflow/flowspec.py +482 -33
  100. metaflow/graph.py +210 -42
  101. metaflow/includefile.py +84 -40
  102. metaflow/lint.py +141 -22
  103. metaflow/meta_files.py +13 -0
  104. metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
  105. metaflow/{metadata → metadata_provider}/metadata.py +86 -1
  106. metaflow/metaflow_config.py +175 -28
  107. metaflow/metaflow_config_funcs.py +51 -3
  108. metaflow/metaflow_current.py +4 -10
  109. metaflow/metaflow_environment.py +139 -53
  110. metaflow/metaflow_git.py +115 -0
  111. metaflow/metaflow_profile.py +18 -0
  112. metaflow/metaflow_version.py +150 -66
  113. metaflow/mflog/__init__.py +4 -3
  114. metaflow/mflog/save_logs.py +2 -2
  115. metaflow/multicore_utils.py +31 -14
  116. metaflow/package/__init__.py +673 -0
  117. metaflow/packaging_sys/__init__.py +880 -0
  118. metaflow/packaging_sys/backend.py +128 -0
  119. metaflow/packaging_sys/distribution_support.py +153 -0
  120. metaflow/packaging_sys/tar_backend.py +99 -0
  121. metaflow/packaging_sys/utils.py +54 -0
  122. metaflow/packaging_sys/v1.py +527 -0
  123. metaflow/parameters.py +149 -28
  124. metaflow/plugins/__init__.py +74 -5
  125. metaflow/plugins/airflow/airflow.py +40 -25
  126. metaflow/plugins/airflow/airflow_cli.py +22 -5
  127. metaflow/plugins/airflow/airflow_decorator.py +1 -1
  128. metaflow/plugins/airflow/airflow_utils.py +5 -3
  129. metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
  130. metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
  131. metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
  132. metaflow/plugins/argo/argo_client.py +78 -33
  133. metaflow/plugins/argo/argo_events.py +6 -6
  134. metaflow/plugins/argo/argo_workflows.py +2410 -527
  135. metaflow/plugins/argo/argo_workflows_cli.py +571 -121
  136. metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
  137. metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
  138. metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
  139. metaflow/plugins/argo/capture_error.py +73 -0
  140. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  141. metaflow/plugins/argo/exit_hooks.py +209 -0
  142. metaflow/plugins/argo/jobset_input_paths.py +15 -0
  143. metaflow/plugins/argo/param_val.py +19 -0
  144. metaflow/plugins/aws/aws_client.py +10 -3
  145. metaflow/plugins/aws/aws_utils.py +55 -2
  146. metaflow/plugins/aws/batch/batch.py +72 -5
  147. metaflow/plugins/aws/batch/batch_cli.py +33 -10
  148. metaflow/plugins/aws/batch/batch_client.py +4 -3
  149. metaflow/plugins/aws/batch/batch_decorator.py +102 -35
  150. metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
  151. metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
  152. metaflow/plugins/aws/step_functions/production_token.py +1 -1
  153. metaflow/plugins/aws/step_functions/step_functions.py +65 -8
  154. metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
  155. metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
  156. metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
  157. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
  158. metaflow/plugins/azure/azure_exceptions.py +1 -1
  159. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
  160. metaflow/plugins/azure/azure_tail.py +1 -1
  161. metaflow/plugins/azure/includefile_support.py +2 -0
  162. metaflow/plugins/cards/card_cli.py +66 -30
  163. metaflow/plugins/cards/card_creator.py +25 -1
  164. metaflow/plugins/cards/card_datastore.py +21 -49
  165. metaflow/plugins/cards/card_decorator.py +132 -8
  166. metaflow/plugins/cards/card_modules/basic.py +112 -17
  167. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  168. metaflow/plugins/cards/card_modules/card.py +16 -1
  169. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  170. metaflow/plugins/cards/card_modules/components.py +665 -28
  171. metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
  172. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  173. metaflow/plugins/cards/card_modules/main.css +1 -0
  174. metaflow/plugins/cards/card_modules/main.js +68 -49
  175. metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
  176. metaflow/plugins/cards/card_modules/test_cards.py +26 -12
  177. metaflow/plugins/cards/card_server.py +39 -14
  178. metaflow/plugins/cards/component_serializer.py +2 -9
  179. metaflow/plugins/cards/metadata.py +22 -0
  180. metaflow/plugins/catch_decorator.py +9 -0
  181. metaflow/plugins/datastores/azure_storage.py +10 -1
  182. metaflow/plugins/datastores/gs_storage.py +6 -2
  183. metaflow/plugins/datastores/local_storage.py +12 -6
  184. metaflow/plugins/datastores/spin_storage.py +12 -0
  185. metaflow/plugins/datatools/local.py +2 -0
  186. metaflow/plugins/datatools/s3/s3.py +126 -75
  187. metaflow/plugins/datatools/s3/s3op.py +254 -121
  188. metaflow/plugins/env_escape/__init__.py +3 -3
  189. metaflow/plugins/env_escape/client_modules.py +102 -72
  190. metaflow/plugins/env_escape/server.py +7 -0
  191. metaflow/plugins/env_escape/stub.py +24 -5
  192. metaflow/plugins/events_decorator.py +343 -185
  193. metaflow/plugins/exit_hook/__init__.py +0 -0
  194. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  195. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  196. metaflow/plugins/gcp/__init__.py +1 -1
  197. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
  198. metaflow/plugins/gcp/gs_tail.py +10 -6
  199. metaflow/plugins/gcp/includefile_support.py +3 -0
  200. metaflow/plugins/kubernetes/kube_utils.py +108 -0
  201. metaflow/plugins/kubernetes/kubernetes.py +411 -130
  202. metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
  203. metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
  204. metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
  205. metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
  206. metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
  207. metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
  208. metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
  209. metaflow/plugins/logs_cli.py +359 -0
  210. metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
  211. metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
  212. metaflow/plugins/metadata_providers/spin.py +16 -0
  213. metaflow/plugins/package_cli.py +36 -24
  214. metaflow/plugins/parallel_decorator.py +128 -11
  215. metaflow/plugins/parsers.py +16 -0
  216. metaflow/plugins/project_decorator.py +51 -5
  217. metaflow/plugins/pypi/bootstrap.py +357 -105
  218. metaflow/plugins/pypi/conda_decorator.py +82 -81
  219. metaflow/plugins/pypi/conda_environment.py +187 -52
  220. metaflow/plugins/pypi/micromamba.py +157 -47
  221. metaflow/plugins/pypi/parsers.py +268 -0
  222. metaflow/plugins/pypi/pip.py +88 -13
  223. metaflow/plugins/pypi/pypi_decorator.py +37 -1
  224. metaflow/plugins/pypi/utils.py +48 -2
  225. metaflow/plugins/resources_decorator.py +2 -2
  226. metaflow/plugins/secrets/__init__.py +3 -0
  227. metaflow/plugins/secrets/secrets_decorator.py +26 -181
  228. metaflow/plugins/secrets/secrets_func.py +49 -0
  229. metaflow/plugins/secrets/secrets_spec.py +101 -0
  230. metaflow/plugins/secrets/utils.py +74 -0
  231. metaflow/plugins/tag_cli.py +4 -7
  232. metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
  233. metaflow/plugins/timeout_decorator.py +3 -3
  234. metaflow/plugins/uv/__init__.py +0 -0
  235. metaflow/plugins/uv/bootstrap.py +128 -0
  236. metaflow/plugins/uv/uv_environment.py +72 -0
  237. metaflow/procpoll.py +1 -1
  238. metaflow/pylint_wrapper.py +5 -1
  239. metaflow/runner/__init__.py +0 -0
  240. metaflow/runner/click_api.py +717 -0
  241. metaflow/runner/deployer.py +470 -0
  242. metaflow/runner/deployer_impl.py +201 -0
  243. metaflow/runner/metaflow_runner.py +714 -0
  244. metaflow/runner/nbdeploy.py +132 -0
  245. metaflow/runner/nbrun.py +225 -0
  246. metaflow/runner/subprocess_manager.py +650 -0
  247. metaflow/runner/utils.py +335 -0
  248. metaflow/runtime.py +1078 -260
  249. metaflow/sidecar/sidecar_worker.py +1 -1
  250. metaflow/system/__init__.py +5 -0
  251. metaflow/system/system_logger.py +85 -0
  252. metaflow/system/system_monitor.py +108 -0
  253. metaflow/system/system_utils.py +19 -0
  254. metaflow/task.py +521 -225
  255. metaflow/tracing/__init__.py +7 -7
  256. metaflow/tracing/span_exporter.py +31 -38
  257. metaflow/tracing/tracing_modules.py +38 -43
  258. metaflow/tuple_util.py +27 -0
  259. metaflow/user_configs/__init__.py +0 -0
  260. metaflow/user_configs/config_options.py +563 -0
  261. metaflow/user_configs/config_parameters.py +598 -0
  262. metaflow/user_decorators/__init__.py +0 -0
  263. metaflow/user_decorators/common.py +144 -0
  264. metaflow/user_decorators/mutable_flow.py +512 -0
  265. metaflow/user_decorators/mutable_step.py +424 -0
  266. metaflow/user_decorators/user_flow_decorator.py +264 -0
  267. metaflow/user_decorators/user_step_decorator.py +749 -0
  268. metaflow/util.py +243 -27
  269. metaflow/vendor.py +23 -7
  270. metaflow/version.py +1 -1
  271. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
  272. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
  273. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
  274. ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
  275. ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
  276. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  277. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
  278. metaflow/_vendor/v3_5/__init__.py +0 -1
  279. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  280. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  281. metaflow/package.py +0 -188
  282. ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
  283. ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
  284. /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
  285. /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
  286. /metaflow/{metadata → metadata_provider}/util.py +0 -0
  287. /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
  288. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
  289. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
metaflow/plugins/argo/argo_workflows.py
@@ -6,22 +6,28 @@ import shlex
  import sys
  from collections import defaultdict
  from hashlib import sha1
+ from math import inf
+ from typing import List

  from metaflow import JSONType, current
  from metaflow.decorators import flow_decorators
  from metaflow.exception import MetaflowException
+ from metaflow.graph import FlowGraph
  from metaflow.includefile import FilePathClass
  from metaflow.metaflow_config import (
  ARGO_EVENTS_EVENT,
  ARGO_EVENTS_EVENT_BUS,
  ARGO_EVENTS_EVENT_SOURCE,
  ARGO_EVENTS_INTERNAL_WEBHOOK_URL,
+ ARGO_EVENTS_SENSOR_NAMESPACE,
  ARGO_EVENTS_SERVICE_ACCOUNT,
  ARGO_EVENTS_WEBHOOK_AUTH,
+ ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT,
  ARGO_WORKFLOWS_ENV_VARS_TO_SKIP,
  ARGO_WORKFLOWS_KUBERNETES_SECRETS,
  ARGO_WORKFLOWS_UI_URL,
  AWS_SECRETS_MANAGER_DEFAULT_REGION,
+ AZURE_KEY_VAULT_PREFIX,
  AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
  CARD_AZUREROOT,
  CARD_GSROOT,
@@ -34,9 +40,7 @@ from metaflow.metaflow_config import (
  DEFAULT_SECRETS_BACKEND_TYPE,
  GCP_SECRET_MANAGER_PREFIX,
  KUBERNETES_FETCH_EC2_METADATA,
- KUBERNETES_LABELS,
  KUBERNETES_NAMESPACE,
- KUBERNETES_NODE_SELECTOR,
  KUBERNETES_SANDBOX_INIT_SCRIPT,
  KUBERNETES_SECRETS,
  S3_ENDPOINT_URL,
@@ -44,14 +48,16 @@ from metaflow.metaflow_config import (
  SERVICE_HEADERS,
  SERVICE_INTERNAL_URL,
  UI_URL,
+ PAGERDUTY_TEMPLATE_URL,
  )
- from metaflow.metaflow_config_funcs import config_values
+ from metaflow.metaflow_config_funcs import config_values, init_config
  from metaflow.mflog import BASH_SAVE_LOGS, bash_capture_logs, export_mflog_env_vars
  from metaflow.parameters import deploy_time_eval
- from metaflow.plugins.kubernetes.kubernetes import (
- parse_kube_keyvalue_list,
- validate_kube_labels,
- )
+ from metaflow.plugins.kubernetes.kube_utils import qos_requests_and_limits
+
+ from metaflow.plugins.kubernetes.kubernetes_jobsets import KubernetesArgoJobSet
+ from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
+ from metaflow.user_configs.config_options import ConfigInput
  from metaflow.util import (
  compress_list,
  dict_to_cli_options,
@@ -61,12 +67,18 @@ from metaflow.util import (
  )

  from .argo_client import ArgoClient
+ from .exit_hooks import ExitHookHack, HttpExitHook, ContainerHook
+ from metaflow.util import resolve_identity


  class ArgoWorkflowsException(MetaflowException):
  headline = "Argo Workflows error"


+ class ArgoWorkflowsSensorCleanupException(MetaflowException):
+ headline = "Argo Workflows sensor clean up error"
+
+
  class ArgoWorkflowsSchedulingException(MetaflowException):
  headline = "Argo Workflows scheduling error"

@@ -74,21 +86,18 @@ class ArgoWorkflowsSchedulingException(MetaflowException):
  # List of future enhancements -
  # 1. Configure Argo metrics.
  # 2. Support resuming failed workflows within Argo Workflows.
- # 3. Support gang-scheduled clusters for distributed PyTorch/TF - One option is to
- # use volcano - https://github.com/volcano-sh/volcano/tree/master/example/integrations/argo
- # 4. Support GitOps workflows.
- # 5. Add Metaflow tags to labels/annotations.
- # 6. Support Multi-cluster scheduling - https://github.com/argoproj/argo-workflows/issues/3523#issuecomment-792307297
- # 7. Support R lang.
- # 8. Ping @savin at slack.outerbounds.co for any feature request.
+ # 3. Add Metaflow tags to labels/annotations.
+ # 4. Support R lang.
+ # 5. Ping @savin at slack.outerbounds.co for any feature request


  class ArgoWorkflows(object):
  def __init__(
  self,
  name,
- graph,
+ graph: FlowGraph,
  flow,
+ code_package_metadata,
  code_package_sha,
  code_package_url,
  production_token,
@@ -108,6 +117,13 @@ class ArgoWorkflows(object):
  notify_on_success=False,
  notify_slack_webhook_url=None,
  notify_pager_duty_integration_key=None,
+ notify_incident_io_api_key=None,
+ incident_io_alert_source_config_id=None,
+ incident_io_metadata: List[str] = None,
+ enable_heartbeat_daemon=True,
+ enable_error_msg_capture=False,
+ workflow_title=None,
+ workflow_description=None,
  ):
  # Some high-level notes -
  #
@@ -133,9 +149,19 @@ class ArgoWorkflows(object):
  # ensure that your Argo Workflows controller doesn't restrict
  # templateReferencing.

+ # get initial configs
+ self.initial_configs = init_config()
+ for entry in ["OBP_PERIMETER", "OBP_INTEGRATIONS_URL"]:
+ if entry not in self.initial_configs:
+ raise ArgoWorkflowsException(
+ f"{entry} was not found in metaflow config. Please make sure to run `outerbounds configure <...>` command which can be found on the Outerbounds UI or reach out to your Outerbounds support team."
+ )
+
  self.name = name
  self.graph = graph
+ self._parse_conditional_branches()
  self.flow = flow
+ self.code_package_metadata = code_package_metadata
  self.code_package_sha = code_package_sha
  self.code_package_url = code_package_url
  self.production_token = production_token
@@ -155,12 +181,22 @@ class ArgoWorkflows(object):
  self.notify_on_success = notify_on_success
  self.notify_slack_webhook_url = notify_slack_webhook_url
  self.notify_pager_duty_integration_key = notify_pager_duty_integration_key
-
+ self.notify_incident_io_api_key = notify_incident_io_api_key
+ self.incident_io_alert_source_config_id = incident_io_alert_source_config_id
+ self.incident_io_metadata = self.parse_incident_io_metadata(
+ incident_io_metadata
+ )
+ self.enable_heartbeat_daemon = enable_heartbeat_daemon
+ self.enable_error_msg_capture = enable_error_msg_capture
+ self.workflow_title = workflow_title
+ self.workflow_description = workflow_description
  self.parameters = self._process_parameters()
+ self.config_parameters = self._process_config_parameters()
  self.triggers, self.trigger_options = self._process_triggers()
  self._schedule, self._timezone = self._get_schedule()

- self.kubernetes_labels = self._get_kubernetes_labels()
+ self._base_labels = self._base_kubernetes_labels()
+ self._base_annotations = self._base_kubernetes_annotations()
  self._workflow_template = self._compile_workflow_template()
  self._sensor = self._compile_sensor()

@@ -168,6 +204,7 @@ class ArgoWorkflows(object):
  return str(self._workflow_template)

  def deploy(self):
+ self.cleanup_previous_sensors()
  try:
  # Register workflow template.
  ArgoClient(namespace=KUBERNETES_NAMESPACE).register_workflow_template(
@@ -176,6 +213,37 @@ class ArgoWorkflows(object):
  except Exception as e:
  raise ArgoWorkflowsException(str(e))

+ def cleanup_previous_sensors(self):
+ try:
+ client = ArgoClient(namespace=KUBERNETES_NAMESPACE)
+ # Check for existing deployment and do cleanup
+ old_template = client.get_workflow_template(self.name)
+ if not old_template:
+ return None
+ # Clean up old sensors
+ old_sensor_namespace = old_template["metadata"]["annotations"].get(
+ "metaflow/sensor_namespace"
+ )
+
+ if old_sensor_namespace is None:
+ # This workflow was created before sensor annotations
+ # and may have a sensor in the default namespace
+ # we will delete it and it'll get recreated if need be
+ old_sensor_name = ArgoWorkflows._sensor_name(self.name)
+ client.delete_sensor(old_sensor_name, client._namespace)
+ else:
+ # delete old sensor only if it was somewhere else, otherwise it'll get replaced
+ old_sensor_name = old_template["metadata"]["annotations"][
+ "metaflow/sensor_name"
+ ]
+ if (
+ not self._sensor
+ or old_sensor_namespace != ARGO_EVENTS_SENSOR_NAMESPACE
+ ):
+ client.delete_sensor(old_sensor_name, old_sensor_namespace)
+ except Exception as e:
+ raise ArgoWorkflowsSensorCleanupException(str(e))
+
  @staticmethod
  def _sanitize(name):
  # Metaflow allows underscores in node names, which are disallowed in Argo
@@ -184,28 +252,39 @@ class ArgoWorkflows(object):
  return name.replace("_", "-")

  @staticmethod
- def list_templates(flow_name, all=False):
+ def _sensor_name(name):
+ # Unfortunately, Argo Events Sensor names don't allow for
+ # dots (sensors run into an error) which rules out self.name :(
+ return name.replace(".", "-")
+
+ @staticmethod
+ def list_templates(flow_name, all=False, page_size=100):
  client = ArgoClient(namespace=KUBERNETES_NAMESPACE)

- templates = client.get_workflow_templates()
- if templates is None:
- return []
-
- template_names = [
- template["metadata"]["name"]
- for template in templates
- if all
- or flow_name
- == template["metadata"]
- .get("annotations", {})
- .get("metaflow/flow_name", None)
- ]
- return template_names
+ for template in client.get_workflow_templates(page_size=page_size):
+ if all or flow_name == template["metadata"].get("annotations", {}).get(
+ "metaflow/flow_name", None
+ ):
+ yield template["metadata"]["name"]

  @staticmethod
  def delete(name):
  client = ArgoClient(namespace=KUBERNETES_NAMESPACE)

+ # the workflow template might not exist, but we still want to try clean up associated sensors and schedules.
+ workflow_template = client.get_workflow_template(name) or {}
+ workflow_annotations = workflow_template.get("metadata", {}).get(
+ "annotations", {}
+ )
+
+ sensor_name = ArgoWorkflows._sensor_name(
+ workflow_annotations.get("metaflow/sensor_name", name)
+ )
+ # if below is missing then it was deployed before custom sensor namespaces
+ sensor_namespace = workflow_annotations.get(
+ "metaflow/sensor_namespace", KUBERNETES_NAMESPACE
+ )
+
  # Always try to delete the schedule. Failure in deleting the schedule should not
  # be treated as an error, due to any of the following reasons
  # - there might not have been a schedule, or it was deleted by some other means
@@ -215,7 +294,7 @@ class ArgoWorkflows(object):

  # The workflow might have sensors attached to it, which consume actual resources.
  # Try to delete these as well.
- sensor_deleted = client.delete_sensor(name)
+ sensor_deleted = client.delete_sensor(sensor_name, sensor_namespace)

  # After cleaning up related resources, delete the workflow in question.
  # Failure in deleting is treated as critical and will be made visible to the user
@@ -239,6 +318,7 @@ class ArgoWorkflows(object):
  flow_name=flow_name, run_id=name
  )
  )
+ return True

  @staticmethod
  def get_workflow_status(flow_name, name):
@@ -272,6 +352,21 @@ class ArgoWorkflows(object):

  return True

+ @staticmethod
+ def parse_incident_io_metadata(metadata: List[str] = None):
+ "parse key value pairs into a dict for incident.io metadata if given"
+ parsed_metadata = None
+ if metadata is not None:
+ parsed_metadata = {}
+ for kv in metadata:
+ key, value = kv.split("=", 1)
+ if key in parsed_metadata:
+ raise MetaflowException(
+ "Incident.io Metadata *%s* provided multiple times" % key
+ )
+ parsed_metadata[key] = value
+ return parsed_metadata
+
  @classmethod
  def trigger(cls, name, parameters=None):
  if parameters is None:
@@ -291,7 +386,7 @@ class ArgoWorkflows(object):
  try:
  # Check that the workflow was deployed through Metaflow
  workflow_template["metadata"]["annotations"]["metaflow/owner"]
- except KeyError as e:
+ except KeyError:
  raise ArgoWorkflowsException(
  "An existing non-metaflow workflow with the same name as "
  "*%s* already exists in Argo Workflows. \nPlease modify the "
@@ -299,24 +394,75 @@ class ArgoWorkflows(object):
  "Workflows before proceeding." % name
  )
  try:
+ id_parts = resolve_identity().split(":")
+ parts_size = len(id_parts)
+ usertype = id_parts[0] if parts_size > 0 else "unknown"
+ username = id_parts[1] if parts_size > 1 else "unknown"
+
  return ArgoClient(namespace=KUBERNETES_NAMESPACE).trigger_workflow_template(
- name, parameters
+ name,
+ usertype,
+ username,
+ parameters,
  )
  except Exception as e:
  raise ArgoWorkflowsException(str(e))

- @staticmethod
- def _get_kubernetes_labels():
+ def _base_kubernetes_labels(self):
  """
- Get Kubernetes labels from environment variable.
- Parses the string into a dict and validates that values adhere to Kubernetes restrictions.
+ Get shared Kubernetes labels for Argo resources.
  """
- if not KUBERNETES_LABELS:
- return {}
- env_labels = KUBERNETES_LABELS.split(",")
- env_labels = parse_kube_keyvalue_list(env_labels, False)
- validate_kube_labels(env_labels)
- return env_labels
+ # TODO: Add configuration through an environment variable or Metaflow config in the future if required.
+ labels = {"app.kubernetes.io/part-of": "metaflow"}
+
+ return labels
+
+ def _base_kubernetes_annotations(self):
+ """
+ Get shared Kubernetes annotations for Argo resources.
+ """
+ from datetime import datetime, timezone
+
+ # TODO: Add configuration through an environment variable or Metaflow config in the future if required.
+ # base annotations
+ annotations = {
+ "metaflow/production_token": self.production_token,
+ "metaflow/owner": self.username,
+ "metaflow/user": "argo-workflows",
+ "metaflow/flow_name": self.flow.name,
+ "metaflow/deployment_timestamp": str(
+ datetime.now(timezone.utc).isoformat()
+ ),
+ }
+
+ if current.get("project_name"):
+ annotations.update(
+ {
+ "metaflow/project_name": current.project_name,
+ "metaflow/branch_name": current.branch_name,
+ "metaflow/project_flow_name": current.project_flow_name,
+ }
+ )
+
+ # Add Argo Workflows title and description annotations
+ # https://argo-workflows.readthedocs.io/en/latest/title-and-description/
+ # Use CLI-provided values or auto-populate from metadata
+ title = (
+ (self.workflow_title.strip() if self.workflow_title else None)
+ or current.get("project_flow_name")
+ or self.flow.name
+ )
+
+ description = (
+ self.workflow_description.strip() if self.workflow_description else None
+ ) or (self.flow.__doc__.strip() if self.flow.__doc__ else None)
+
+ if title:
+ annotations["workflows.argoproj.io/title"] = title
+ if description:
+ annotations["workflows.argoproj.io/description"] = description
+
+ return annotations

  def _get_schedule(self):
  schedule = self.flow._flow_decorators.get("schedule")
@@ -332,16 +478,14 @@ class ArgoWorkflows(object):
  argo_client.schedule_workflow_template(
  self.name, self._schedule, self._timezone
  )
- # Register sensor. Unfortunately, Argo Events Sensor names don't allow for
- # dots (sensors run into an error) which rules out self.name :(
+ # Register sensor.
  # Metaflow will overwrite any existing sensor.
- sensor_name = self.name.replace(".", "-")
+ sensor_name = ArgoWorkflows._sensor_name(self.name)
  if self._sensor:
- argo_client.register_sensor(sensor_name, self._sensor.to_json())
- else:
- # Since sensors occupy real resources, delete existing sensor if needed
- # Deregister sensors that might have existed before this deployment
- argo_client.delete_sensor(sensor_name)
+ # The new sensor will go into the sensor namespace specified
+ ArgoClient(namespace=ARGO_EVENTS_SENSOR_NAMESPACE).register_sensor(
+ sensor_name, self._sensor.to_json(), ARGO_EVENTS_SENSOR_NAMESPACE
+ )
  except Exception as e:
  raise ArgoWorkflowsSchedulingException(str(e))

@@ -393,7 +537,7 @@ class ArgoWorkflows(object):
  "metaflow/production_token"
  ],
  )
- except KeyError as e:
+ except KeyError:
  raise ArgoWorkflowsException(
  "An existing non-metaflow workflow with the same name as "
  "*%s* already exists in Argo Workflows. \nPlease modify the "
@@ -439,12 +583,22 @@ class ArgoWorkflows(object):
  "case-insensitive." % param.name
  )
  seen.add(norm)
+ # NOTE: We skip config parameters as these do not have dynamic values,
+ # and need to be treated differently.
+ if param.IS_CONFIG_PARAMETER:
+ continue

- if param.kwargs.get("type") == JSONType or isinstance(
- param.kwargs.get("type"), FilePathClass
- ):
- # Special-case this to avoid touching core
+ extra_attrs = {}
+ if param.kwargs.get("type") == JSONType:
+ param_type = str(param.kwargs.get("type").name)
+ elif isinstance(param.kwargs.get("type"), FilePathClass):
  param_type = str(param.kwargs.get("type").name)
+ extra_attrs["is_text"] = getattr(
+ param.kwargs.get("type"), "_is_text", True
+ )
+ extra_attrs["encoding"] = getattr(
+ param.kwargs.get("type"), "_encoding", "utf-8"
+ )
  else:
  param_type = str(param.kwargs.get("type").__name__)

@@ -464,14 +618,47 @@ class ArgoWorkflows(object):
  # the JSON equivalent of None to please argo-workflows. Unfortunately it
  # has the side effect of casting the parameter value to string null during
  # execution - which needs to be fixed imminently.
- if not is_required or default_value is not None:
+ if default_value is None:
+ default_value = json.dumps(None)
+ elif param_type == "JSON":
+ if not isinstance(default_value, str):
+ # once to serialize the default value if needed.
+ default_value = json.dumps(default_value)
+ # adds outer quotes to param
  default_value = json.dumps(default_value)
+ else:
+ # Make argo sensors happy
+ default_value = json.dumps(default_value)
+
  parameters[param.name] = dict(
+ python_var_name=var,
  name=param.name,
  value=default_value,
  type=param_type,
  description=param.kwargs.get("help"),
  is_required=is_required,
+ **extra_attrs,
+ )
+ return parameters
+
+ def _process_config_parameters(self):
+ parameters = []
+ seen = set()
+ for var, param in self.flow._get_parameters():
+ if not param.IS_CONFIG_PARAMETER:
+ continue
+ # Throw an exception if the parameter is specified twice.
+ norm = param.name.lower()
+ if norm in seen:
+ raise MetaflowException(
+ "Parameter *%s* is specified twice. "
+ "Note that parameter names are "
+ "case-insensitive." % param.name
+ )
+ seen.add(norm)
+
+ parameters.append(
+ dict(name=param.name, kv_name=ConfigInput.make_key_name(param.name))
  )
  return parameters

@@ -497,10 +684,17 @@ class ArgoWorkflows(object):
  # convert them to lower case since Metaflow parameters are case
  # insensitive.
  seen = set()
+ # NOTE: We skip config parameters as their values can not be set through event payloads
  params = set(
- [param.name.lower() for var, param in self.flow._get_parameters()]
+ [
+ param.name.lower()
+ for var, param in self.flow._get_parameters()
+ if not param.IS_CONFIG_PARAMETER
+ ]
  )
- for event in self.flow._flow_decorators.get("trigger")[0].triggers:
+ trigger_deco = self.flow._flow_decorators.get("trigger")[0]
+ trigger_deco.format_deploytime_value()
+ for event in trigger_deco.triggers:
  parameters = {}
  # TODO: Add a check to guard against names starting with numerals(?)
  if not re.match(r"^[A-Za-z0-9_.-]+$", event["name"]):
@@ -540,11 +734,23 @@ class ArgoWorkflows(object):

  # @trigger_on_finish decorator
  if self.flow._flow_decorators.get("trigger_on_finish"):
- for event in self.flow._flow_decorators.get("trigger_on_finish")[
- 0
- ].triggers:
+ trigger_on_finish_deco = self.flow._flow_decorators.get(
+ "trigger_on_finish"
+ )[0]
+ trigger_on_finish_deco.format_deploytime_value()
+ for event in trigger_on_finish_deco.triggers:
  # Actual filters are deduced here since we don't have access to
  # the current object in the @trigger_on_finish decorator.
+ project_name = event.get("project") or current.get("project_name")
+ branch_name = event.get("branch") or current.get("branch_name")
+ # validate that we have complete project info for an event name
+ if project_name or branch_name:
+ if not (project_name and branch_name):
+ # if one of the two is missing, we would end up listening to an event that will never be broadcast.
+ raise ArgoWorkflowsException(
+ "Incomplete project info. Please specify both 'project' and 'project_branch' or use the @project decorator"
+ )
+
  triggers.append(
  {
  # Make sure this remains consistent with the event name format
@@ -553,18 +759,16 @@ class ArgoWorkflows(object):
  % ".".join(
  v
  for v in [
- event.get("project") or current.get("project_name"),
- event.get("branch") or current.get("branch_name"),
+ project_name,
+ branch_name,
  event["flow"],
  ]
  if v
  ),
  "filters": {
  "auto-generated-by-metaflow": True,
- "project_name": event.get("project")
- or current.get("project_name"),
- "branch_name": event.get("branch")
- or current.get("branch_name"),
+ "project_name": project_name,
+ "branch_name": branch_name,
  # TODO: Add a time filters to guard against cached events
  },
  "type": "run",
@@ -616,30 +820,19 @@ class ArgoWorkflows(object):
  # generate container templates at the top level (in WorkflowSpec) and maintain
  # references to them within the DAGTask.

- from datetime import datetime, timezone
+ annotations = {}

- annotations = {
- "metaflow/production_token": self.production_token,
- "metaflow/owner": self.username,
- "metaflow/user": "argo-workflows",
- "metaflow/flow_name": self.flow.name,
- "metaflow/deployment_timestamp": str(
- datetime.now(timezone.utc).isoformat()
- ),
- }
+ if self._schedule is not None:
+ # timezone is an optional field and json dumps on None will result in null
+ # hence configuring it to an empty string
+ if self._timezone is None:
+ self._timezone = ""
+ cron_info = {"schedule": self._schedule, "tz": self._timezone}
+ annotations.update({"metaflow/cron": json.dumps(cron_info)})

  if self.parameters:
  annotations.update({"metaflow/parameters": json.dumps(self.parameters)})

- if current.get("project_name"):
- annotations.update(
- {
- "metaflow/project_name": current.project_name,
- "metaflow/branch_name": current.branch_name,
- "metaflow/project_flow_name": current.project_flow_name,
- }
- )
-
  # Some more annotations to populate the Argo UI nicely
  if self.tags:
  annotations.update({"metaflow/tags": json.dumps(self.tags)})
@@ -651,7 +844,9 @@ class ArgoWorkflows(object):
  {key: trigger.get(key) for key in ["name", "type"]}
  for trigger in self.triggers
- )
+ ),
+ "metaflow/sensor_name": ArgoWorkflows._sensor_name(self.name),
+ "metaflow/sensor_namespace": ARGO_EVENTS_SENSOR_NAMESPACE,
  }
  )
  if self.notify_on_error:
@@ -661,6 +856,7 @@ class ArgoWorkflows(object):
  {
  "slack": bool(self.notify_slack_webhook_url),
  "pager_duty": bool(self.notify_pager_duty_integration_key),
+ "incident_io": bool(self.notify_incident_io_api_key),
  }
  )
  }
@@ -672,11 +868,24 @@ class ArgoWorkflows(object):
  {
  "slack": bool(self.notify_slack_webhook_url),
  "pager_duty": bool(self.notify_pager_duty_integration_key),
+ "incident_io": bool(self.notify_incident_io_api_key),
  }
  )
  }
  )
+ try:
+ # Build the DAG based on the DAGNodes given by the FlowGraph for the found FlowSpec class.
+ _steps_info, graph_structure = self.graph.output_steps()
+ graph_info = {
+ # for the time being, we only need the graph_structure. Being mindful of annotation size limits we do not include anything extra.
+ "graph_structure": graph_structure
+ }
+ except Exception:
+ graph_info = None
+
+ dag_annotation = {"metaflow/dag": json.dumps(graph_info)}

+ lifecycle_hooks = self._lifecycle_hooks()
  return (
  WorkflowTemplate()
  .metadata(
@@ -687,9 +896,11 @@ class ArgoWorkflows(object):
  # is released, we should be able to support multi-namespace /
  # multi-cluster scheduling.
  .namespace(KUBERNETES_NAMESPACE)
- .label("app.kubernetes.io/name", "metaflow-flow")
- .label("app.kubernetes.io/part-of", "metaflow")
  .annotations(annotations)
+ .annotations(self._base_annotations)
+ .labels(self._base_labels)
+ .label("app.kubernetes.io/name", "metaflow-flow")
+ .annotations(dag_annotation)
  )
  .spec(
  WorkflowSpec()
@@ -719,10 +930,23 @@ class ArgoWorkflows(object):
  # Set workflow metadata
  .workflow_metadata(
  Metadata()
+ .labels(self._base_labels)
  .label("app.kubernetes.io/name", "metaflow-run")
- .label("app.kubernetes.io/part-of", "metaflow")
  .annotations(
- {**annotations, **{"metaflow/run_id": "argo-{{workflow.name}}"}}
+ {
+ **annotations,
+ **{
+ k: v
+ for k, v in self._base_annotations.items()
+ if k
+ # Skip custom title/description for workflows as this makes it harder to find specific runs.
+ not in [
+ "workflows.argoproj.io/title",
+ "workflows.argoproj.io/description",
+ ]
+ },
+ **{"metaflow/run_id": "argo-{{workflow.name}}"},
+ }
  )
  # TODO: Set dynamic labels using labels_from. Ideally, we would
  # want to expose run_id as a label. It's easy to add labels,
@@ -755,95 +979,251 @@ class ArgoWorkflows(object):
  # Set common pod metadata.
  .pod_metadata(
  Metadata()
+ .labels(self._base_labels)
  .label("app.kubernetes.io/name", "metaflow-task")
- .label("app.kubernetes.io/part-of", "metaflow")
- .annotations(annotations)
- .labels(self.kubernetes_labels)
+ .annotations(
+ {
+ **annotations,
+ **self._base_annotations,
+ **{
+ "metaflow/run_id": "argo-{{workflow.name}}"
+ }, # we want pods of the workflow to have the run_id as an annotation as well
+ }
+ )
  )
  # Set the entrypoint to flow name
  .entrypoint(self.flow.name)
- # Set exit hook handlers if notifications are enabled
+ # OnExit hooks
+ .onExit(
+ "capture-error-hook-fn-preflight"
+ if self.enable_error_msg_capture
+ else None
+ )
+ # Set lifecycle hooks if notifications are enabled
  .hooks(
  {
- **(
- {
- # workflow status maps to Completed
- "notify-slack-on-success": LifecycleHook()
- .expression("workflow.status == 'Succeeded'")
- .template("notify-slack-on-success"),
- }
- if self.notify_on_success and self.notify_slack_webhook_url
- else {}
- ),
- **(
- {
- # workflow status maps to Completed
- "notify-pager-duty-on-success": LifecycleHook()
- .expression("workflow.status == 'Succeeded'")
- .template("notify-pager-duty-on-success"),
- }
- if self.notify_on_success
- and self.notify_pager_duty_integration_key
- else {}
- ),
- **(
- {
- # workflow status maps to Failed or Error
- "notify-slack-on-failure": LifecycleHook()
- .expression("workflow.status == 'Failed'")
- .template("notify-slack-on-error"),
- "notify-slack-on-error": LifecycleHook()
- .expression("workflow.status == 'Error'")
- .template("notify-slack-on-error"),
- }
- if self.notify_on_error and self.notify_slack_webhook_url
- else {}
- ),
- **(
- {
- # workflow status maps to Failed or Error
- "notify-pager-duty-on-failure": LifecycleHook()
- .expression("workflow.status == 'Failed'")
- .template("notify-pager-duty-on-error"),
- "notify-pager-duty-on-error": LifecycleHook()
- .expression("workflow.status == 'Error'")
- .template("notify-pager-duty-on-error"),
- }
- if self.notify_on_error
- and self.notify_pager_duty_integration_key
- else {}
- ),
- # Warning: terrible hack to workaround a bug in Argo Workflow
- # where the hooks listed above do not execute unless
- # there is an explicit exit hook. as and when this
- # bug is patched, we should remove this effectively
- # no-op hook.
- **(
- {"exit": LifecycleHook().template("exit-hook-hack")}
- if self.notify_on_error or self.notify_on_success
- else {}
- ),
+ lifecycle.name: lifecycle
+ for hook in lifecycle_hooks
+ for lifecycle in hook.lifecycle_hooks
  }
  )
  # Top-level DAG template(s)
  .templates(self._dag_templates())
  # Container templates
  .templates(self._container_templates())
+ # Lifecycle hook template(s)
+ .templates([hook.template for hook in lifecycle_hooks])
  # Exit hook template(s)
  .templates(self._exit_hook_templates())
+ # Sidecar templates (Daemon Containers)
+ .templates(self._daemon_templates())
+ )
+ )
+
+ # Visit every node and record information on conditional step structure
+ def _parse_conditional_branches(self):
+ self.conditional_nodes = set()
+ self.conditional_join_nodes = set()
+ self.matching_conditional_join_dict = {}
+ self.recursive_nodes = set()
+
+ node_conditional_parents = {}
+ node_conditional_branches = {}
+
+ def _visit(node, conditional_branch, conditional_parents=None):
+ if not node.type == "split-switch" and not (
+ conditional_branch and conditional_parents
+ ):
+ # skip regular non-conditional nodes entirely
+ return
+
+ if node.type == "split-switch":
+ conditional_branch = conditional_branch + [node.name]
+ c_br = node_conditional_branches.get(node.name, [])
+ node_conditional_branches[node.name] = c_br + [
+ b for b in conditional_branch if b not in c_br
+ ]
+
+ conditional_parents = (
+ [node.name]
+ if not conditional_parents
+ else conditional_parents + [node.name]
+ )
+ node_conditional_parents[node.name] = conditional_parents
+
+ # check for recursion. this split is recursive if any of its out functions are itself.
+ if any(
+ out_func for out_func in node.out_funcs if out_func == node.name
+ ):
+ self.recursive_nodes.add(node.name)
+
+ if conditional_parents and not node.type == "split-switch":
+ node_conditional_parents[node.name] = conditional_parents
+ conditional_branch = conditional_branch + [node.name]
+ c_br = node_conditional_branches.get(node.name, [])
+ node_conditional_branches[node.name] = c_br + [
+ b for b in conditional_branch if b not in c_br
+ ]
+
+ self.conditional_nodes.add(node.name)
+
+ if conditional_branch and conditional_parents:
+ for n in node.out_funcs:
+ child = self.graph[n]
+ if child.name == node.name:
+ continue
+ _visit(child, conditional_branch, conditional_parents)
+
+ # First we visit all nodes to determine conditional parents and branches
+ for n in self.graph:
+ _visit(n, [])
+
+ # helper to clean up conditional info for all children of a node, until a new split-switch is encountered.
+ def _cleanup_conditional_status(node_name, seen):
+ if self.graph[node_name].type == "split-switch":
+ # stop recursive cleanup if we hit a new split-switch
+ return
+ if node_name in self.conditional_nodes:
+ self.conditional_nodes.remove(node_name)
+ node_conditional_parents[node_name] = []
+ node_conditional_branches[node_name] = []
+ for p in self.graph[node_name].out_funcs:
+ if p not in seen:
+ _cleanup_conditional_status(p, seen + [p])
+
+ # Then we traverse again in order to determine conditional join nodes, and matching conditional join info
+ for node in self.graph:
+ if node_conditional_parents.get(node.name, False):
+ # do the required postprocessing for anything requiring node.in_funcs
+
+ # check that in previous parsing we have not closed all conditional in_funcs.
+ # If so, this step can not be conditional either
+ is_conditional = any(
+ in_func in self.conditional_nodes
+ or self.graph[in_func].type == "split-switch"
+ for in_func in node.in_funcs
+ )
+ if is_conditional:
+ self.conditional_nodes.add(node.name)
+ else:
+ if node.name in self.conditional_nodes:
+ self.conditional_nodes.remove(node.name)
+
+ # does this node close the latest conditional parent branches?
+ conditional_in_funcs = [
+ in_func
+ for in_func in node.in_funcs
+ if node_conditional_branches.get(in_func, False)
+ ]
+ closed_conditional_parents = []
+ for last_split_switch in node_conditional_parents.get(node.name, [])[
+ ::-1
+ ]:
+ last_conditional_split_nodes = self.graph[
+ last_split_switch
+ ].out_funcs
+ # NOTE: How do we define a conditional join step?
+ # The idea here is that we check if the conditional branches(e.g. chains of conditional steps leading to) of all the in_funcs
+ # manage to tick off every step name that follows a split-switch
+ # For example, consider the following structure
+ # switch_step -> A, B, C
+ # A -> A2 -> A3 -> A4 -> B2
+ # B -> B2 -> B3 -> C3
+ # C -> C2 -> C3 -> end
+ #
+ # if we look at the in_funcs for C3, they are (C2, B3)
+ # B3 closes off branches started by A and B
+ # C3 closes off branches started by C
+ # therefore C3 is a conditional join step for the 'switch_step'
+ # NOTE: Then what about a skip step?
+ # some switch cases might not introduce any distinct steps of their own, opting to instead skip ahead to a later common step.
+ # Example:
+ # switch_step -> A, B, C
+ # A -> A1 -> B2 -> C
+ # B -> B1 -> B2 -> C
+ #
+ # In this case, C is a skip step as it does not add any conditional branching of its own.
+ # C is also a conditional join, as it closes all branches started by 'switch_step'
+
+ closes_branches = all(
+ (
+ # branch_root_node_name needs to be in at least one conditional_branch for it to be closed.
+ any(
+ branch_root_node_name
+ in node_conditional_branches.get(in_func, [])
+ for in_func in conditional_in_funcs
+ )
+ # need to account for a switch case skipping completely, not having a conditional-branch of its own.
+ if branch_root_node_name != node.name
+ else True
+ )
+ for branch_root_node_name in last_conditional_split_nodes
+ )
+ if closes_branches:
+ closed_conditional_parents.append(last_split_switch)
+
+ self.conditional_join_nodes.add(node.name)
+ self.matching_conditional_join_dict[last_split_switch] = (
+ node.name
+ )
+
+ # Did we close all conditionals? Then this branch and all its children are not conditional anymore (unless a new conditional branch is encountered).
+ if not [
+ p
+ for p in node_conditional_parents.get(node.name, [])
+ if p not in closed_conditional_parents
+ ]:
+ _cleanup_conditional_status(node.name, [])
+
+ def _is_conditional_node(self, node):
+ return node.name in self.conditional_nodes
+
+ def _is_conditional_skip_node(self, node):
+ return (
+ self._is_conditional_node(node)
+ and any(
+ self.graph[in_func].type == "split-switch" for in_func in node.in_funcs
+ )
+ and len(
+ [
+ in_func
+ for in_func in node.in_funcs
+ if self._is_conditional_node(self.graph[in_func])
+ or self.graph[in_func].type == "split-switch"
+ ]
  )
+ > 1
  )

+ def _is_conditional_join_node(self, node):
+ return node.name in self.conditional_join_nodes
+
+ def _many_in_funcs_all_conditional(self, node):
+ cond_in_funcs = [
+ in_func
+ for in_func in node.in_funcs
+ if self._is_conditional_node(self.graph[in_func])
+ ]
+ return len(cond_in_funcs) > 1 and len(cond_in_funcs) == len(node.in_funcs)
+
+ def _is_recursive_node(self, node):
+ return node.name in self.recursive_nodes
+
+ def _matching_conditional_join(self, node):
+ # If no earlier conditional join step is found during parsing, then 'end' is always one.
+ return self.matching_conditional_join_dict.get(node.name, "end")
+
  # Visit every node and yield the uber DAGTemplate(s).
  def _dag_templates(self):
  def _visit(
- node, exit_node=None, templates=None, dag_tasks=None, parent_foreach=None
- ):
- if node.parallel_foreach:
- raise ArgoWorkflowsException(
- "Deploying flows with @parallel decorator(s) "
- "as Argo Workflows is not supported currently."
- )
+ node,
+ exit_node=None,
+ templates=None,
+ dag_tasks=None,
+ parent_foreach=None,
+ seen=None,
+ ): # Returns Tuple[List[Template], List[DAGTask]]
+ """ """
  # Every for-each node results in a separate subDAG and an equivalent
  # DAGTemplate rooted at the child of the for-each node. Each DAGTemplate
  # has a unique name - the top-level DAGTemplate is named as the name of
@@ -851,28 +1231,111 @@ class ArgoWorkflows(object):
851
1231
  # of the for-each node.
852
1232
 
853
1233
  # Emit if we have reached the end of the sub workflow
1234
+ if seen is None:
1235
+ seen = []
854
1236
  if dag_tasks is None:
855
1237
  dag_tasks = []
856
1238
  if templates is None:
857
1239
  templates = []
1240
+
858
1241
  if exit_node is not None and exit_node is node.name:
859
1242
  return templates, dag_tasks
1243
+ if node.name in seen:
1244
+ return templates, dag_tasks
1245
+
1246
+ seen.append(node.name)
860
1247
 
1248
+ # helper variable for recursive conditional inputs
1249
+ has_foreach_inputs = False
861
1250
  if node.name == "start":
862
1251
  # Start node has no dependencies.
863
1252
  dag_task = DAGTask(self._sanitize(node.name)).template(
864
1253
  self._sanitize(node.name)
865
1254
  )
866
- elif (
1255
+ if (
867
1256
  node.is_inside_foreach
868
1257
  and self.graph[node.in_funcs[0]].type == "foreach"
1258
+ and not self.graph[node.in_funcs[0]].parallel_foreach
1259
+ # We need to distinguish what is a "regular" foreach (i.e something that doesn't care about to gang semantics)
1260
+ # vs what is a "num_parallel" based foreach (i.e. something that follows gang semantics.)
1261
+ # A `regular` foreach is basically any arbitrary kind of foreach.
869
1262
  ):
1263
+ # helper variable for recursive conditional inputs
1264
+ has_foreach_inputs = True
870
1265
  # Child of a foreach node needs input-paths as well as split-index
871
1266
  # This child is the first node of the sub workflow and has no dependency
872
1267
  parameters = [
873
1268
  Parameter("input-paths").value("{{inputs.parameters.input-paths}}"),
874
1269
  Parameter("split-index").value("{{inputs.parameters.split-index}}"),
875
1270
  ]
1271
+ dag_task = (
1272
+ DAGTask(self._sanitize(node.name))
1273
+ .template(self._sanitize(node.name))
1274
+ .arguments(Arguments().parameters(parameters))
1275
+ )
1276
+ elif node.parallel_step:
1277
+ # This is the step where the @parallel decorator is defined.
1278
+ # Since this DAGTask will call the `resource` [based templates]
1279
+ # (https://argo-workflows.readthedocs.io/en/stable/walk-through/kubernetes-resources/)
1280
+ # we have certain constraints on the way we can pass information inside the Jobset manifest
1281
+ # [All templates will have access](https://argo-workflows.readthedocs.io/en/stable/variables/#all-templates)
1282
+ # to the `inputs.parameters` so we will pass down ANY/ALL information using the
1283
+ # input parameters.
1284
+ # We define the usual parameters like input-paths/split-index etc. but we will also
1285
+ # define the following:
1286
+ # - `workerCount`: parameter which will be used to determine the number of
1287
+ # parallel worker jobs
1288
+ # - `jobset-name`: parameter which will be used to determine the name of the jobset.
1289
+ # This parameter needs to be dynamic so that when we have retries we don't
1290
+ # end up using the name of the jobset again (if we do, it will crash since k8s won't allow duplicate job names)
1291
+ # - `retryCount`: parameter which will be used to determine the number of retries
1292
+ # This parameter will *only* be available within the container templates like we
1293
+ # have it for all other DAGTasks and NOT for custom kubernetes resource templates.
1294
+ # So as a work-around, we will set it as the `retryCount` parameter instead of
1295
+ # setting it as a {{ retries }} in the CLI code. Once set as a input parameter,
1296
+ # we can use it in the Jobset Manifest templates as `{{inputs.parameters.retryCount}}`
1297
+ # - `task-id-entropy`: This is a parameter which will help derive task-ids and jobset names. This parameter
1298
+ # contains the relevant amount of entropy to ensure that task-ids and jobset names
1299
+ # are sufficiently unique. We will also use this in the join task to construct the task-ids of
1300
+ # all parallel tasks since the task-ids for parallel tasks are minted formulaically.
1301
+ parameters = [
1302
+ Parameter("input-paths").value("{{inputs.parameters.input-paths}}"),
1303
+ Parameter("num-parallel").value(
1304
+ "{{inputs.parameters.num-parallel}}"
1305
+ ),
1306
+ Parameter("split-index").value("{{inputs.parameters.split-index}}"),
1307
+ Parameter("task-id-entropy").value(
1308
+ "{{inputs.parameters.task-id-entropy}}"
1309
+ ),
1310
+ # We can't just use hyphens with sprig.
1311
+ # https://github.com/argoproj/argo-workflows/issues/10567#issuecomment-1452410948
1312
+ Parameter("workerCount").value(
1313
+ "{{=sprig.int(sprig.sub(sprig.int(inputs.parameters['num-parallel']),1))}}"
1314
+ ),
1315
+ ]
1316
+ if any(d.name == "retry" for d in node.decorators):
1317
+ parameters.extend(
1318
+ [
1319
+ Parameter("retryCount").value("{{retries}}"),
1320
+ # The jobset name needs to be unique for each retry
1321
+ # and we cannot use the `generateName` field in the
1322
+ # Jobset Manifest since we need to construct the subdomain
1323
+ # and control pod domain name beforehand. So we will use
1324
+ # the retry count to ensure that the jobset name is unique
1325
+ Parameter("jobset-name").value(
1326
+ "js-{{inputs.parameters.task-id-entropy}}{{retries}}",
1327
+ ),
1328
+ ]
1329
+ )
1330
+ else:
1331
+ parameters.extend(
1332
+ [
1333
+ Parameter("jobset-name").value(
1334
+ "js-{{inputs.parameters.task-id-entropy}}",
1335
+ )
1336
+ ]
1337
+ )
1338
+
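
As a sanity check on the sprig expression above, here is a small sketch (hypothetical values only) of how the @parallel parameters resolve; `workerCount` is `num-parallel - 1` because the control task occupies one replica, with the remaining replicas assigned to workers (see the `jobset.control.replicas(1)` / `jobset.worker.replicas(...)` wiring further below).

```python
# Illustrative only: what the @parallel input parameters resolve to for a
# hypothetical num_parallel=4 step on its first attempt.
num_parallel = 4
params = {
    "num-parallel": num_parallel,
    "task-id-entropy": "a1b2c3",        # hypothetical entropy value
    "workerCount": num_parallel - 1,    # sprig.sub(num-parallel, 1) -> 3 workers + 1 control
    "jobset-name": "js-a1b2c3",         # "js-{{task-id-entropy}}"; a retry suffix is appended when @retry is set
}
print(params)
```
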
876
1339
  dag_task = (
877
1340
  DAGTask(self._sanitize(node.name))
878
1341
  .template(self._sanitize(node.name))
@@ -887,7 +1350,9 @@ class ArgoWorkflows(object):
887
1350
  "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
888
1351
  % (n, self._sanitize(n))
889
1352
  for n in node.in_funcs
890
- ]
1353
+ ],
1354
+ # NOTE: We set zlibmin to infinite because zlib compression for the Argo input-paths breaks template value substitution.
1355
+ zlibmin=inf,
891
1356
  )
892
1357
  )
893
1358
  ]
@@ -922,23 +1387,89 @@ class ArgoWorkflows(object):
922
1387
  ]
923
1388
  )
924
1389
 
1390
+ conditional_deps = [
1391
+ "%s.Succeeded" % self._sanitize(in_func)
1392
+ for in_func in node.in_funcs
1393
+ if self._is_conditional_node(self.graph[in_func])
1394
+ or self.graph[in_func].type == "split-switch"
1395
+ ]
1396
+ required_deps = [
1397
+ "%s.Succeeded" % self._sanitize(in_func)
1398
+ for in_func in node.in_funcs
1399
+ if not self._is_conditional_node(self.graph[in_func])
1400
+ and self.graph[in_func].type != "split-switch"
1401
+ ]
1402
+ if self._is_conditional_skip_node(
1403
+ node
1404
+ ) or self._many_in_funcs_all_conditional(node):
1405
+ # skip nodes need unique condition handling
1406
+ conditional_deps = [
1407
+ "%s.Succeeded" % self._sanitize(in_func)
1408
+ for in_func in node.in_funcs
1409
+ ]
1410
+ required_deps = []
1411
+
1412
+ both_conditions = required_deps and conditional_deps
1413
+
1414
+ depends_str = "{required}{_and}{conditional}".format(
1415
+ required=("(%s)" if both_conditions else "%s")
1416
+ % " && ".join(required_deps),
1417
+ _and=" && " if both_conditions else "",
1418
+ conditional=("(%s)" if both_conditions else "%s")
1419
+ % " || ".join(conditional_deps),
1420
+ )
925
1421
  dag_task = (
926
1422
  DAGTask(self._sanitize(node.name))
927
- .dependencies(
928
- [self._sanitize(in_func) for in_func in node.in_funcs]
929
- )
1423
+ .depends(depends_str)
930
1424
  .template(self._sanitize(node.name))
931
1425
  .arguments(Arguments().parameters(parameters))
932
1426
  )
933
- dag_tasks.append(dag_task)
934
1427
 
1428
+ # Add conditional if this is the first step in a conditional branch
1429
+ switch_in_funcs = [
1430
+ in_func
1431
+ for in_func in node.in_funcs
1432
+ if self.graph[in_func].type == "split-switch"
1433
+ ]
1434
+ if (
1435
+ self._is_conditional_node(node)
1436
+ or self._is_conditional_skip_node(node)
1437
+ or self._is_conditional_join_node(node)
1438
+ ) and switch_in_funcs:
1439
+ conditional_when = "||".join(
1440
+ [
1441
+ "{{tasks.%s.outputs.parameters.switch-step}}==%s"
1442
+ % (self._sanitize(switch_in_func), node.name)
1443
+ for switch_in_func in switch_in_funcs
1444
+ ]
1445
+ )
1446
+
1447
+ non_switch_in_funcs = [
1448
+ in_func
1449
+ for in_func in node.in_funcs
1450
+ if in_func not in switch_in_funcs
1451
+ ]
1452
+ status_when = ""
1453
+ if non_switch_in_funcs:
1454
+ status_when = "||".join(
1455
+ [
1456
+ "{{tasks.%s.status}}==Succeeded"
1457
+ % self._sanitize(in_func)
1458
+ for in_func in non_switch_in_funcs
1459
+ ]
1460
+ )
1461
+
1462
+ total_when = (
1463
+ f"({status_when}) || ({conditional_when})"
1464
+ if status_when
1465
+ else conditional_when
1466
+ )
1467
+ dag_task.when(total_when)
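
For orientation, a hedged sketch of the `when` guard built above, assuming a hypothetical conditional step `b` whose parents are a split-switch step `a` and a regular step `c`:

```python
# Illustrative only: the guard assembled above for a hypothetical node "b" with
# a split-switch parent "a" and a non-switch parent "c".
conditional_when = "{{tasks.a.outputs.parameters.switch-step}}==b"
status_when = "{{tasks.c.status}}==Succeeded"
total_when = (
    f"({status_when}) || ({conditional_when})" if status_when else conditional_when
)
print(total_when)
# ({{tasks.c.status}}==Succeeded) || ({{tasks.a.outputs.parameters.switch-step}}==b)
```
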
1468
+
1469
+ dag_tasks.append(dag_task)
935
1470
  # End the workflow if we have reached the end of the flow
936
1471
  if node.type == "end":
937
- return [
938
- Template(self.flow.name).dag(
939
- DAGTemplate().fail_fast().tasks(dag_tasks)
940
- )
941
- ] + templates, dag_tasks
1472
+ return templates, dag_tasks
942
1473
  # For split nodes traverse all the children
943
1474
  if node.type == "split":
944
1475
  for n in node.out_funcs:
@@ -948,6 +1479,7 @@ class ArgoWorkflows(object):
948
1479
  templates,
949
1480
  dag_tasks,
950
1481
  parent_foreach,
1482
+ seen,
951
1483
  )
952
1484
  return _visit(
953
1485
  self.graph[node.matching_join],
@@ -955,46 +1487,201 @@ class ArgoWorkflows(object):
955
1487
  templates,
956
1488
  dag_tasks,
957
1489
  parent_foreach,
1490
+ seen,
958
1491
  )
959
- # For foreach nodes generate a new sub DAGTemplate
960
- elif node.type == "foreach":
961
- foreach_template_name = self._sanitize(
962
- "%s-foreach-%s"
963
- % (
964
- node.name,
965
- node.foreach_param,
966
- )
967
- )
968
- foreach_task = (
969
- DAGTask(foreach_template_name)
970
- .dependencies([self._sanitize(node.name)])
971
- .template(foreach_template_name)
972
- .arguments(
973
- Arguments().parameters(
1492
+ elif node.type == "split-switch":
1493
+ if self._is_recursive_node(node):
1494
+ # we need an additional recursive template if the step is recursive
1495
+ # NOTE: in the recursive case, the original step is renamed in the container templates to 'recursive-<step_name>'
1496
+ # so that we do not have to touch the step references in the DAG.
1497
+ #
1498
+ # NOTE: The way that recursion in Argo Workflows is achieved is with the following structure:
1499
+ # - the usual 'example-step' template which would match example_step in flow code is renamed to 'recursive-example-step'
1500
+ # - templates has another template with the original task name: 'example-step'
1501
+ # - the template 'example-step' in turn has steps
1502
+ # - 'example-step-internal' which uses the metaflow step executing template 'recursive-example-step'
1503
+ # - 'example-step-recursion' which calls the parent template 'example-step' if switch-step output from 'example-step-internal' matches the condition.
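
To make that layout concrete, here is a hedged sketch of the structure generated for a hypothetical recursive switch step `example_step` (sanitized to `example-step`). It shows only the shape of the resulting template, not the builder calls used below:

```python
# Illustrative only: rough shape of the recursive template described above for a
# hypothetical step written as `example_step` in the flow code.
recursive_template = {
    "name": "example-step",
    "steps": [
        # 1) run the actual Metaflow step via the renamed container template
        [{"name": "example-step-internal", "template": "recursive-example-step"}],
        # 2) re-enter this template while the step keeps switching back to itself;
        #    the comparison uses the unsanitized step name from the flow code
        [{
            "name": "example-step-recursion",
            "template": "example-step",
            "when": "{{steps.example-step-internal.outputs.parameters.switch-step}}==example_step",
        }],
    ],
}
```
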
1504
+ sanitized_name = self._sanitize(node.name)
1505
+ templates.append(
1506
+ Template(sanitized_name)
1507
+ .steps(
974
1508
  [
975
- Parameter("input-paths").value(
976
- "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
977
- % (node.name, self._sanitize(node.name))
978
- ),
979
- Parameter("split-index").value("{{item}}"),
1509
+ WorkflowStep()
1510
+ .name("%s-internal" % sanitized_name)
1511
+ .template("recursive-%s" % sanitized_name)
1512
+ .arguments(
1513
+ Arguments().parameters(
1514
+ [
1515
+ Parameter("input-paths").value(
1516
+ "{{inputs.parameters.input-paths}}"
1517
+ )
1518
+ ]
1519
+ # Add the additional inputs required by specific node types.
1520
+ # We do not need to cover joins or @parallel, as a split-switch step can not be either one of these.
1521
+ + (
1522
+ [
1523
+ Parameter("split-index").value(
1524
+ "{{inputs.parameters.split-index}}"
1525
+ )
1526
+ ]
1527
+ if has_foreach_inputs
1528
+ else []
1529
+ )
1530
+ )
1531
+ )
980
1532
  ]
981
- + (
982
- [
983
- Parameter("root-input-path").value(
984
- "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
1533
+ )
1534
+ .steps(
1535
+ [
1536
+ WorkflowStep()
1537
+ .name("%s-recursion" % sanitized_name)
1538
+ .template(sanitized_name)
1539
+ .when(
1540
+ "{{steps.%s-internal.outputs.parameters.switch-step}}==%s"
1541
+ % (sanitized_name, node.name)
1542
+ )
1543
+ .arguments(
1544
+ Arguments().parameters(
1545
+ [
1546
+ Parameter("input-paths").value(
1547
+ "argo-{{workflow.name}}/%s/{{steps.%s-internal.outputs.parameters.task-id}}"
1548
+ % (node.name, sanitized_name)
1549
+ )
1550
+ ]
1551
+ + (
1552
+ [
1553
+ Parameter("split-index").value(
1554
+ "{{inputs.parameters.split-index}}"
1555
+ )
1556
+ ]
1557
+ if has_foreach_inputs
1558
+ else []
1559
+ )
1560
+ )
1561
+ ),
1562
+ ]
1563
+ )
1564
+ .inputs(Inputs().parameters(parameters))
1565
+ .outputs(
1566
+ # NOTE: We try to read the output parameters from the recursive template call first (<step>-recursion), and the internal step second (<step>-internal).
1567
+ # This guarantees that we always get the output parameters of the last recursive step that executed.
1568
+ Outputs().parameters(
1569
+ [
1570
+ Parameter("task-id").valueFrom(
1571
+ {
1572
+ "expression": "(steps['%s-recursion']?.outputs ?? steps['%s-internal']?.outputs).parameters['task-id']"
1573
+ % (sanitized_name, sanitized_name)
1574
+ }
1575
+ ),
1576
+ Parameter("switch-step").valueFrom(
1577
+ {
1578
+ "expression": "(steps['%s-recursion']?.outputs ?? steps['%s-internal']?.outputs).parameters['switch-step']"
1579
+ % (sanitized_name, sanitized_name)
1580
+ }
1581
+ ),
1582
+ ]
1583
+ )
1584
+ )
1585
+ )
1586
+ for n in node.out_funcs:
1587
+ _visit(
1588
+ self.graph[n],
1589
+ self._matching_conditional_join(node),
1590
+ templates,
1591
+ dag_tasks,
1592
+ parent_foreach,
1593
+ seen,
1594
+ )
1595
+ return _visit(
1596
+ self.graph[self._matching_conditional_join(node)],
1597
+ exit_node,
1598
+ templates,
1599
+ dag_tasks,
1600
+ parent_foreach,
1601
+ seen,
1602
+ )
1603
+ # For foreach nodes generate a new sub DAGTemplate
1604
+ # We do this for "regular" foreaches (ie. `self.next(self.a, foreach=)`)
1605
+ elif node.type == "foreach":
1606
+ foreach_template_name = self._sanitize(
1607
+ "%s-foreach-%s"
1608
+ % (
1609
+ node.name,
1610
+ "parallel" if node.parallel_foreach else node.foreach_param,
1611
+ # Since foreach's are derived based on `self.next(self.a, foreach="<varname>")`
1612
+ # vs @parallel foreach are done based on `self.next(self.a, num_parallel="<some-number>")`,
1613
+ # we need to ensure that `foreach_template_name` suffix is appropriately set based on the kind
1614
+ # of foreach.
1615
+ )
1616
+ )
1617
+
1618
+ # There are two separate "DAGTask"s created for the foreach node.
1619
+ # - The first one is a "jump-off" DAGTask where we propagate the
1620
+ # input-paths and split-index. This thing doesn't create
1621
+ # any actual containers and is responsible only for propagating
1622
+ # the parameters.
1623
+ # - The DAGTask that follows the first DAGTask is the one
1624
+ # that uses the ContainerTemplate. This DAGTask is named the same
1625
+ # thing as the foreach node. We will leverage a similar pattern for the
1626
+ # @parallel tasks.
1627
+ #
1628
+ foreach_task = (
1629
+ DAGTask(foreach_template_name)
1630
+ .depends(f"{self._sanitize(node.name)}.Succeeded")
1631
+ .template(foreach_template_name)
1632
+ .arguments(
1633
+ Arguments().parameters(
1634
+ [
1635
+ Parameter("input-paths").value(
1636
+ "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
1637
+ % (node.name, self._sanitize(node.name))
1638
+ ),
1639
+ Parameter("split-index").value("{{item}}"),
1640
+ ]
1641
+ + (
1642
+ [
1643
+ Parameter("root-input-path").value(
1644
+ "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
985
1645
  % (node.name, self._sanitize(node.name))
986
1646
  ),
987
1647
  ]
988
1648
  if parent_foreach
989
1649
  else []
990
1650
  )
1651
+ + (
1652
+ # Disambiguate parameters for a regular `foreach` vs a `@parallel` foreach
1653
+ [
1654
+ Parameter("num-parallel").value(
1655
+ "{{tasks.%s.outputs.parameters.num-parallel}}"
1656
+ % self._sanitize(node.name)
1657
+ ),
1658
+ Parameter("task-id-entropy").value(
1659
+ "{{tasks.%s.outputs.parameters.task-id-entropy}}"
1660
+ % self._sanitize(node.name)
1661
+ ),
1662
+ ]
1663
+ if node.parallel_foreach
1664
+ else []
1665
+ )
991
1666
  )
992
1667
  )
993
1668
  .with_param(
1669
+ # For @parallel workloads `num-splits` will be explicitly set to one so that
1670
+ # we can piggyback on the current mechanism with which we leverage argo.
994
1671
  "{{tasks.%s.outputs.parameters.num-splits}}"
995
1672
  % self._sanitize(node.name)
996
1673
  )
997
1674
  )
1675
+ # Add conditional if this is the first step in a conditional branch
1676
+ if self._is_conditional_node(node) and not any(
1677
+ self._is_conditional_node(self.graph[in_func])
1678
+ for in_func in node.in_funcs
1679
+ ):
1680
+ in_func = node.in_funcs[0]
1681
+ foreach_task.when(
1682
+ "{{tasks.%s.outputs.parameters.switch-step}}==%s"
1683
+ % (self._sanitize(in_func), node.name)
1684
+ )
998
1685
  dag_tasks.append(foreach_task)
999
1686
  templates, dag_tasks_1 = _visit(
1000
1687
  self.graph[node.out_funcs[0]],
@@ -1002,18 +1689,36 @@ class ArgoWorkflows(object):
1002
1689
  templates,
1003
1690
  [],
1004
1691
  node.name,
1692
+ seen,
1005
1693
  )
1694
+
1695
+ # How do foreaches work on Argo:
1696
+ # Let's say you have the following DAG: (start[sets `foreach="x"`]) --> (task-a [actual foreach]) --> (join) --> (end)
1697
+ # With Argo we will have:
1698
+ # (start [sets num-splits]) --> (task-a-foreach-(0,0) [dummy task]) --> (task-a) --> (join) --> (end)
1699
+ # The (task-a-foreach-(0,0) [dummy task]) propagates the values of the `split-index` and the input paths
1700
+ # to the actual foreach task.
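
A hedged sketch of the jump-off DAGTask emitted above for the comment's example, where `start` is the foreach node splitting on `x`; the values mirror the construction above and the names are illustrative:

```python
# Illustrative only: rough shape of the "jump-off" DAGTask for a hypothetical
# non-@parallel, non-nested foreach where "start" splits on foreach="x".
jump_off_task = {
    "name": "start-foreach-x",
    "template": "start-foreach-x",  # sub-DAG template rooted at the foreach child
    "depends": "start.Succeeded",
    "withParam": "{{tasks.start.outputs.parameters.num-splits}}",
    "arguments": {
        "parameters": [
            {"name": "input-paths",
             "value": "argo-{{workflow.name}}/start/{{tasks.start.outputs.parameters.task-id}}"},
            {"name": "split-index", "value": "{{item}}"},
        ]
    },
}
```
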
1006
1701
  templates.append(
1007
1702
  Template(foreach_template_name)
1008
1703
  .inputs(
1009
1704
  Inputs().parameters(
1010
1705
  [Parameter("input-paths"), Parameter("split-index")]
1011
1706
  + ([Parameter("root-input-path")] if parent_foreach else [])
1707
+ + (
1708
+ [
1709
+ Parameter("num-parallel"),
1710
+ Parameter("task-id-entropy"),
1711
+ # Parameter("workerCount")
1712
+ ]
1713
+ if node.parallel_foreach
1714
+ else []
1715
+ )
1012
1716
  )
1013
1717
  )
1014
1718
  .outputs(
1015
1719
  Outputs().parameters(
1016
1720
  [
1721
+ # non @parallel tasks set task-ids as outputs
1017
1722
  Parameter("task-id").valueFrom(
1018
1723
  {
1019
1724
  "parameter": "{{tasks.%s.outputs.parameters.task-id}}"
@@ -1021,31 +1726,84 @@ class ArgoWorkflows(object):
1021
1726
  self.graph[node.matching_join].in_funcs[0]
1022
1727
  )
1023
1728
  }
1024
- )
1729
+ if not self._is_conditional_join_node(
1730
+ self.graph[node.matching_join]
1731
+ )
1732
+ else
1733
+ # Note: If the nodes leading to the join are conditional, then we need to use an expression to pick the outputs from the task that executed.
1734
+ # ref for operators: https://github.com/expr-lang/expr/blob/master/docs/language-definition.md
1735
+ {
1736
+ "expression": "get((%s)?.parameters, 'task-id')"
1737
+ % " ?? ".join(
1738
+ f"tasks['{self._sanitize(func)}']?.outputs"
1739
+ for func in self.graph[
1740
+ node.matching_join
1741
+ ].in_funcs
1742
+ )
1743
+ }
1744
+ ),
1745
+ ]
1746
+ if not node.parallel_foreach
1747
+ else [
1748
+ # @parallel tasks set `task-id-entropy` and `num-parallel`
1749
+ # as outputs so task-ids can be derived in the join step.
1750
+ # Both of these values should be propagated from the
1751
+ # jobset labels.
1752
+ Parameter("num-parallel").valueFrom(
1753
+ {
1754
+ "parameter": "{{tasks.%s.outputs.parameters.num-parallel}}"
1755
+ % self._sanitize(
1756
+ self.graph[node.matching_join].in_funcs[0]
1757
+ )
1758
+ }
1759
+ ),
1760
+ Parameter("task-id-entropy").valueFrom(
1761
+ {
1762
+ "parameter": "{{tasks.%s.outputs.parameters.task-id-entropy}}"
1763
+ % self._sanitize(
1764
+ self.graph[node.matching_join].in_funcs[0]
1765
+ )
1766
+ }
1767
+ ),
1025
1768
  ]
1026
1769
  )
1027
1770
  )
1028
1771
  .dag(DAGTemplate().fail_fast().tasks(dag_tasks_1))
1029
1772
  )
1773
+
1030
1774
  join_foreach_task = (
1031
1775
  DAGTask(self._sanitize(self.graph[node.matching_join].name))
1032
1776
  .template(self._sanitize(self.graph[node.matching_join].name))
1033
- .dependencies([foreach_template_name])
1777
+ .depends(f"{foreach_template_name}.Succeeded")
1034
1778
  .arguments(
1035
1779
  Arguments().parameters(
1036
- [
1037
- Parameter("input-paths").value(
1038
- "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
1039
- % (node.name, self._sanitize(node.name))
1040
- ),
1041
- Parameter("split-cardinality").value(
1042
- "{{tasks.%s.outputs.parameters.split-cardinality}}"
1043
- % self._sanitize(node.name)
1044
- ),
1045
- ]
1780
+ (
1781
+ [
1782
+ Parameter("input-paths").value(
1783
+ "argo-{{workflow.name}}/%s/{{tasks.%s.outputs.parameters.task-id}}"
1784
+ % (node.name, self._sanitize(node.name))
1785
+ ),
1786
+ Parameter("split-cardinality").value(
1787
+ "{{tasks.%s.outputs.parameters.split-cardinality}}"
1788
+ % self._sanitize(node.name)
1789
+ ),
1790
+ ]
1791
+ if not node.parallel_foreach
1792
+ else [
1793
+ Parameter("num-parallel").value(
1794
+ "{{tasks.%s.outputs.parameters.num-parallel}}"
1795
+ % self._sanitize(node.name)
1796
+ ),
1797
+ Parameter("task-id-entropy").value(
1798
+ "{{tasks.%s.outputs.parameters.task-id-entropy}}"
1799
+ % self._sanitize(node.name)
1800
+ ),
1801
+ ]
1802
+ )
1046
1803
  + (
1047
1804
  [
1048
1805
  Parameter("split-index").value(
1806
+ # TODO : Pass down these parameters to the jobset stuff.
1049
1807
  "{{inputs.parameters.split-index}}"
1050
1808
  ),
1051
1809
  Parameter("root-input-path").value(
@@ -1065,6 +1823,7 @@ class ArgoWorkflows(object):
1065
1823
  templates,
1066
1824
  dag_tasks,
1067
1825
  parent_foreach,
1826
+ seen,
1068
1827
  )
1069
1828
  # For linear nodes continue traversing to the next node
1070
1829
  if node.type in ("linear", "join", "start"):
@@ -1074,6 +1833,7 @@ class ArgoWorkflows(object):
1074
1833
  templates,
1075
1834
  dag_tasks,
1076
1835
  parent_foreach,
1836
+ seen,
1077
1837
  )
1078
1838
  else:
1079
1839
  raise ArgoWorkflowsException(
@@ -1081,7 +1841,17 @@ class ArgoWorkflows(object):
1081
1841
  "Argo Workflows." % (node.type, node.name)
1082
1842
  )
1083
1843
 
1084
- templates, _ = _visit(node=self.graph["start"])
1844
+ # Generate daemon tasks
1845
+ daemon_tasks = [
1846
+ DAGTask("%s-task" % daemon_template.name).template(daemon_template.name)
1847
+ for daemon_template in self._daemon_templates()
1848
+ ]
1849
+
1850
+ templates, dag_tasks = _visit(node=self.graph["start"], dag_tasks=daemon_tasks)
1851
+ # Add the DAG template only after fully traversing the graph so we are guaranteed to have all the dag_tasks collected.
1852
+ templates.append(
1853
+ Template(self.flow.name).dag(DAGTemplate().fail_fast().tasks(dag_tasks))
1854
+ )
1085
1855
  return templates
1086
1856
 
1087
1857
  # Visit every node and yield ContainerTemplates.
@@ -1123,10 +1893,32 @@ class ArgoWorkflows(object):
1123
1893
  # export input_paths as it is used multiple times in the container script
1124
1894
  # and we do not want to repeat the values.
1125
1895
  input_paths_expr = "export INPUT_PATHS=''"
1126
- if node.name != "start":
1896
+ # If node is not a start step or a @parallel join then we will set the input paths.
1897
+ # To set the input-paths as a parameter, we need to ensure that the node
1898
+ # is not (a start node or a parallel join node). Start nodes will have no
1899
+ # input paths and parallel join will derive input paths based on a
1900
+ # formulaic approach using `num-parallel` and `task-id-entropy`.
1901
+ if not (
1902
+ node.name == "start"
1903
+ or (node.type == "join" and self.graph[node.in_funcs[0]].parallel_step)
1904
+ ):
1905
+ # For parallel joins we don't pass INPUT_PATHS; the paths are constructed dynamically,
1906
+ # so we don't need to set them here.
1127
1907
  input_paths_expr = (
1128
1908
  "export INPUT_PATHS={{inputs.parameters.input-paths}}"
1129
1909
  )
1910
+ if (
1911
+ self._is_conditional_join_node(node)
1912
+ or self._many_in_funcs_all_conditional(node)
1913
+ or self._is_conditional_skip_node(node)
1914
+ ):
1915
+ # NOTE: Argo template expressions that fail to resolve, output the expression itself as a value.
1916
+ # With conditional steps, some of the input-paths are therefore 'broken' due to containing a nil expression
1917
+ # e.g. "{{ tasks['A'].outputs.parameters.task-id }}" when task A never executed.
1918
+ # We base64 encode the input-paths in order to not pollute the execution environment with templating expressions.
1919
+ # NOTE: Adding conditionals that check if a key exists or not does not work either, due to an issue with how Argo
1920
+ # handles tasks in a nested foreach (withParam template) leading to all such expressions getting evaluated as false.
1921
+ input_paths_expr = "export INPUT_PATHS={{=toBase64(inputs.parameters['input-paths'])}}"
1130
1922
  input_paths = "$(echo $INPUT_PATHS)"
1131
1923
  if any(self.graph[n].type == "foreach" for n in node.in_funcs):
1132
1924
  task_idx = "{{inputs.parameters.split-index}}"
@@ -1142,7 +1934,6 @@ class ArgoWorkflows(object):
1142
1934
  # foreaches
1143
1935
  task_idx = "{{inputs.parameters.split-index}}"
1144
1936
  root_input = "{{inputs.parameters.root-input-path}}"
1145
-
1146
1937
  # Task string to be hashed into an ID
1147
1938
  task_str = "-".join(
1148
1939
  [
@@ -1152,13 +1943,23 @@ class ArgoWorkflows(object):
1152
1943
  task_idx,
1153
1944
  ]
1154
1945
  )
1946
+ if node.parallel_step:
1947
+ task_str = "-".join(
1948
+ [
1949
+ "$TASK_ID_PREFIX",
1950
+ "{{inputs.parameters.task-id-entropy}}",
1951
+ "$TASK_ID_SUFFIX",
1952
+ ]
1953
+ )
1954
+ else:
1955
+ # Generated task_ids need to be non-numeric - see register_task_id in
1956
+ # service.py. We do so by prefixing `t-`
1957
+ _task_id_base = (
1958
+ "$(echo %s | md5sum | cut -d ' ' -f 1 | tail -c 9)" % task_str
1959
+ )
1960
+ task_str = "(t-%s)" % _task_id_base
1155
1961
 
1156
- # Generated task_ids need to be non-numeric - see register_task_id in
1157
- # service.py. We do so by prefixing `t-`
1158
- task_id_expr = (
1159
- "export METAFLOW_TASK_ID="
1160
- "(t-$(echo %s | md5sum | cut -d ' ' -f 1 | tail -c 9))" % task_str
1161
- )
1962
+ task_id_expr = "export METAFLOW_TASK_ID=" "%s" % task_str
1162
1963
  task_id = "$METAFLOW_TASK_ID"
1163
1964
 
1164
1965
  # Resolve retry strategy.
@@ -1177,9 +1978,18 @@ class ArgoWorkflows(object):
1177
1978
  user_code_retries = max_user_code_retries
1178
1979
  total_retries = max_user_code_retries + max_error_retries
1179
1980
  # {{retries}} is only available if retryStrategy is specified
1981
+ # For custom kubernetes manifests, we will pass the retryCount as a parameter
1982
+ # and use that in the manifest.
1180
1983
  retry_count = (
1181
- "{{retries}}" if max_user_code_retries + max_error_retries else 0
1984
+ (
1985
+ "{{retries}}"
1986
+ if not node.parallel_step
1987
+ else "{{inputs.parameters.retryCount}}"
1988
+ )
1989
+ if total_retries
1990
+ else 0
1182
1991
  )
1992
+
1183
1993
  minutes_between_retries = int(minutes_between_retries)
1184
1994
 
1185
1995
  # Configure log capture.
@@ -1206,7 +2016,9 @@ class ArgoWorkflows(object):
1206
2016
  mflog_expr,
1207
2017
  ]
1208
2018
  + self.environment.get_package_commands(
1209
- self.code_package_url, self.flow_datastore.TYPE
2019
+ self.code_package_url,
2020
+ self.flow_datastore.TYPE,
2021
+ self.code_package_metadata,
1210
2022
  )
1211
2023
  )
1212
2024
  step_cmds = self.environment.bootstrap_commands(
@@ -1218,12 +2030,13 @@ class ArgoWorkflows(object):
1218
2030
  decorator.make_decorator_spec()
1219
2031
  for decorator in node.decorators
1220
2032
  if not decorator.statically_defined
2033
+ and decorator.inserted_by is None
1221
2034
  ]
1222
2035
  }
1223
2036
  # FlowDecorators can define their own top-level options. They are
1224
2037
  # responsible for adding their own top-level options and values through
1225
2038
  # the get_top_level_options() hook. See similar logic in runtime.py.
1226
- for deco in flow_decorators():
2039
+ for deco in flow_decorators(self.flow):
1227
2040
  top_opts_dict.update(deco.get_top_level_options())
1228
2041
 
1229
2042
  top_level = list(dict_to_cli_options(top_opts_dict)) + [
@@ -1255,7 +2068,7 @@ class ArgoWorkflows(object):
1255
2068
  # {{foo.bar['param_name']}}.
1256
2069
  # https://argoproj.github.io/argo-events/tutorials/02-parameterization/
1257
2070
  # http://masterminds.github.io/sprig/strings.html
1258
- "--%s={{workflow.parameters.%s}}"
2071
+ "--%s=\\\"$(python -m metaflow.plugins.argo.param_val {{=toBase64(workflow.parameters['%s'])}})\\\""
1259
2072
  % (parameter["name"], parameter["name"])
1260
2073
  for parameter in self.parameters.values()
1261
2074
  ]
@@ -1277,21 +2090,63 @@ class ArgoWorkflows(object):
1277
2090
  ]
1278
2091
  )
1279
2092
  input_paths = "%s/_parameters/%s" % (run_id, task_id_params)
2093
+ # Only for static joins and conditional_joins
2094
+ elif (
2095
+ self._is_conditional_join_node(node)
2096
+ or self._many_in_funcs_all_conditional(node)
2097
+ or self._is_conditional_skip_node(node)
2098
+ ) and not (
2099
+ node.type == "join"
2100
+ and self.graph[node.split_parents[-1]].type == "foreach"
2101
+ ):
2102
+ # We need to pass the set of conditional in_funcs to the pathspec-generating script because, in split-switch skipping cases,
2103
+ # non-conditional input-paths need to be ignored in favour of conditional ones whenever those have executed.
2104
+ skippable_input_steps = ",".join(
2105
+ [
2106
+ in_func
2107
+ for in_func in node.in_funcs
2108
+ if self.graph[in_func].type == "split-switch"
2109
+ ]
2110
+ )
2111
+ input_paths = (
2112
+ "$(python -m metaflow.plugins.argo.conditional_input_paths %s %s)"
2113
+ % (input_paths, skippable_input_steps)
2114
+ )
1280
2115
  elif (
1281
2116
  node.type == "join"
1282
2117
  and self.graph[node.split_parents[-1]].type == "foreach"
1283
2118
  ):
2119
+ # foreach-joins straight out of conditional branches are not yet supported
2120
+ if self._is_conditional_join_node(node) and len(node.in_funcs) > 1:
2121
+ raise ArgoWorkflowsException(
2122
+ "Conditional steps inside a foreach that transition directly into a join step are not currently supported.\n"
2123
+ "As a workaround, add a common step after the conditional steps %s "
2124
+ "that will transition to a join."
2125
+ % ", ".join("*%s*" % f for f in node.in_funcs)
2126
+ )
1284
2127
  # Set aggregated input-paths for a for-each join
1285
2128
  foreach_step = next(
1286
2129
  n for n in node.in_funcs if self.graph[n].is_inside_foreach
1287
2130
  )
1288
- input_paths = (
1289
- "$(python -m metaflow.plugins.argo.generate_input_paths %s {{workflow.creationTimestamp}} %s {{inputs.parameters.split-cardinality}})"
1290
- % (
1291
- foreach_step,
1292
- input_paths,
2131
+ if not self.graph[node.split_parents[-1]].parallel_foreach:
2132
+ input_paths = (
2133
+ "$(python -m metaflow.plugins.argo.generate_input_paths %s {{workflow.creationTimestamp}} %s {{inputs.parameters.split-cardinality}})"
2134
+ % (
2135
+ foreach_step,
2136
+ input_paths,
2137
+ )
1293
2138
  )
1294
- )
2139
+ else:
2140
+ # Handle @parallel where output from volume mount isn't accessible
2141
+ input_paths = (
2142
+ "$(python -m metaflow.plugins.argo.jobset_input_paths %s %s {{inputs.parameters.task-id-entropy}} {{inputs.parameters.num-parallel}})"
2143
+ % (
2144
+ run_id,
2145
+ foreach_step,
2146
+ )
2147
+ )
2148
+ # NOTE: input-paths might be extremely lengthy so we dump these to disk instead of passing them directly to the cmd
2149
+ step_cmds.append("echo %s >> /tmp/mf-input-paths" % input_paths)
1295
2150
  step = [
1296
2151
  "step",
1297
2152
  node.name,
@@ -1299,9 +2154,16 @@ class ArgoWorkflows(object):
1299
2154
  "--task-id %s" % task_id,
1300
2155
  "--retry-count %s" % retry_count,
1301
2156
  "--max-user-code-retries %d" % user_code_retries,
1302
- "--input-paths %s" % input_paths,
2157
+ "--input-paths-filename /tmp/mf-input-paths",
1303
2158
  ]
1304
- if any(self.graph[n].type == "foreach" for n in node.in_funcs):
2159
+ if node.parallel_step:
2160
+ step.append(
2161
+ "--split-index ${MF_CONTROL_INDEX:-$((MF_WORKER_REPLICA_INDEX + 1))}"
2162
+ )
2163
+ # This is needed for setting the value of the UBF context in the CLI.
2164
+ step.append("--ubf-context $UBF_CONTEXT")
2165
+
2166
+ elif any(self.graph[n].type == "foreach" for n in node.in_funcs):
1305
2167
  # Pass split-index to a foreach task
1306
2168
  step.append("--split-index {{inputs.parameters.split-index}}")
1307
2169
  if self.tags:
@@ -1367,6 +2229,7 @@ class ArgoWorkflows(object):
1367
2229
  **{
1368
2230
  # These values are needed by Metaflow to set its internal
1369
2231
  # state appropriately.
2232
+ "METAFLOW_CODE_METADATA": self.code_package_metadata,
1370
2233
  "METAFLOW_CODE_URL": self.code_package_url,
1371
2234
  "METAFLOW_CODE_SHA": self.code_package_sha,
1372
2235
  "METAFLOW_CODE_DS": self.flow_datastore.TYPE,
@@ -1395,6 +2258,7 @@ class ArgoWorkflows(object):
1395
2258
  },
1396
2259
  **{
1397
2260
  # Some optional values for bookkeeping
2261
+ "METAFLOW_FLOW_FILENAME": os.path.basename(sys.argv[0]),
1398
2262
  "METAFLOW_FLOW_NAME": self.flow.name,
1399
2263
  "METAFLOW_STEP_NAME": node.name,
1400
2264
  "METAFLOW_RUN_ID": run_id,
@@ -1413,20 +2277,30 @@ class ArgoWorkflows(object):
1413
2277
 
1414
2278
  # support Metaflow sandboxes
1415
2279
  env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
2280
+ env["METAFLOW_KUBERNETES_SANDBOX_INIT_SCRIPT"] = (
2281
+ KUBERNETES_SANDBOX_INIT_SCRIPT
2282
+ )
1416
2283
 
1417
2284
  # support for @secret
1418
2285
  env["METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE"] = DEFAULT_SECRETS_BACKEND_TYPE
1419
- env[
1420
- "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION"
1421
- ] = AWS_SECRETS_MANAGER_DEFAULT_REGION
2286
+ env["METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION"] = (
2287
+ AWS_SECRETS_MANAGER_DEFAULT_REGION
2288
+ )
1422
2289
  env["METAFLOW_GCP_SECRET_MANAGER_PREFIX"] = GCP_SECRET_MANAGER_PREFIX
2290
+ env["METAFLOW_AZURE_KEY_VAULT_PREFIX"] = AZURE_KEY_VAULT_PREFIX
1423
2291
 
1424
2292
  # support for Azure
1425
- env[
1426
- "METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT"
1427
- ] = AZURE_STORAGE_BLOB_SERVICE_ENDPOINT
2293
+ env["METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT"] = (
2294
+ AZURE_STORAGE_BLOB_SERVICE_ENDPOINT
2295
+ )
1428
2296
  env["METAFLOW_DATASTORE_SYSROOT_AZURE"] = DATASTORE_SYSROOT_AZURE
1429
2297
  env["METAFLOW_CARD_AZUREROOT"] = CARD_AZUREROOT
2298
+ env["METAFLOW_ARGO_WORKFLOWS_KUBERNETES_SECRETS"] = (
2299
+ ARGO_WORKFLOWS_KUBERNETES_SECRETS
2300
+ )
2301
+ env["METAFLOW_ARGO_WORKFLOWS_ENV_VARS_TO_SKIP"] = (
2302
+ ARGO_WORKFLOWS_ENV_VARS_TO_SKIP
2303
+ )
1430
2304
 
1431
2305
  # support for GCP
1432
2306
  env["METAFLOW_DATASTORE_SYSROOT_GS"] = DATASTORE_SYSROOT_GS
@@ -1449,6 +2323,13 @@ class ArgoWorkflows(object):
1449
2323
  metaflow_version["production_token"] = self.production_token
1450
2324
  env["METAFLOW_VERSION"] = json.dumps(metaflow_version)
1451
2325
 
2326
+ # map config values
2327
+ cfg_env = {
2328
+ param["name"]: param["kv_name"] for param in self.config_parameters
2329
+ }
2330
+ if cfg_env:
2331
+ env["METAFLOW_FLOW_CONFIG_VALUE"] = json.dumps(cfg_env)
2332
+
1452
2333
  # Set the template inputs and outputs for passing state. Very simply,
1453
2334
  # the container template takes in input-paths as input and outputs
1454
2335
  # the task-id (which feeds in as input-paths to the subsequent task).
@@ -1463,17 +2344,45 @@ class ArgoWorkflows(object):
1463
2344
  # join task deterministically inside the join task without resorting to
1464
2345
  # passing a rather long list of (albeit compressed)
1465
2346
  inputs = []
1466
- if node.name != "start":
2347
+ # To set the input-paths as a parameter, we need to ensure that the node
2348
+ # is not (a start node or a parallel join node). Start nodes will have no
2349
+ # input paths and parallel join will derive input paths based on a
2350
+ # formulaic approach.
2351
+ if not (
2352
+ node.name == "start"
2353
+ or (node.type == "join" and self.graph[node.in_funcs[0]].parallel_step)
2354
+ ):
1467
2355
  inputs.append(Parameter("input-paths"))
1468
2356
  if any(self.graph[n].type == "foreach" for n in node.in_funcs):
1469
2357
  # Fetch split-index from parent
1470
2358
  inputs.append(Parameter("split-index"))
2359
+
1471
2360
  if (
1472
2361
  node.type == "join"
1473
2362
  and self.graph[node.split_parents[-1]].type == "foreach"
1474
2363
  ):
1475
- # append this only for joins of foreaches, not static splits
1476
- inputs.append(Parameter("split-cardinality"))
2364
+ # @parallel join tasks require `num-parallel` and `task-id-entropy`
2365
+ # to construct the input paths, so we pass them down as input parameters.
2366
+ if self.graph[node.split_parents[-1]].parallel_foreach:
2367
+ inputs.extend(
2368
+ [Parameter("num-parallel"), Parameter("task-id-entropy")]
2369
+ )
2370
+ else:
2371
+ # append these only for joins of foreaches, not static splits
2372
+ inputs.append(Parameter("split-cardinality"))
2373
+ # check if the node is a @parallel node.
2374
+ elif node.parallel_step:
2375
+ inputs.extend(
2376
+ [
2377
+ Parameter("num-parallel"),
2378
+ Parameter("task-id-entropy"),
2379
+ Parameter("jobset-name"),
2380
+ Parameter("workerCount"),
2381
+ ]
2382
+ )
2383
+ if any(d.name == "retry" for d in node.decorators):
2384
+ inputs.append(Parameter("retryCount"))
2385
+
1477
2386
  if node.is_inside_foreach and self.graph[node.out_funcs[0]].type == "join":
1478
2387
  if any(
1479
2388
  self.graph[parent].matching_join
@@ -1490,8 +2399,17 @@ class ArgoWorkflows(object):
1490
2399
  inputs.append(Parameter("root-input-path"))
1491
2400
 
1492
2401
  outputs = []
1493
- if node.name != "end":
2402
+ # @parallel steps will not have a task-id as an output parameter since task-ids
2403
+ # are derived at runtime.
2404
+ if not (node.name == "end" or node.parallel_step):
1494
2405
  outputs = [Parameter("task-id").valueFrom({"path": "/mnt/out/task_id"})]
2406
+
2407
+ # If this step is a split-switch one, we need to output the switch step name
2408
+ if node.type == "split-switch":
2409
+ outputs.append(
2410
+ Parameter("switch-step").valueFrom({"path": "/mnt/out/switch_step"})
2411
+ )
2412
+
1495
2413
  if node.type == "foreach":
1496
2414
  # Emit split cardinality from foreach task
1497
2415
  outputs.append(
@@ -1503,6 +2421,19 @@ class ArgoWorkflows(object):
1503
2421
  )
1504
2422
  )
1505
2423
 
2424
+ if node.parallel_foreach:
2425
+ outputs.extend(
2426
+ [
2427
+ Parameter("num-parallel").valueFrom(
2428
+ {"path": "/mnt/out/num_parallel"}
2429
+ ),
2430
+ Parameter("task-id-entropy").valueFrom(
2431
+ {"path": "/mnt/out/task_id_entropy"}
2432
+ ),
2433
+ ]
2434
+ )
2435
+ # Outputs for @parallel should be defined here and not in _dag_templates.
2436
+
1506
2437
  # It makes no sense to set env vars to None (shows up as "None" string)
1507
2438
  # Also we skip some env vars (e.g. in case we want to pull them from KUBERNETES_SECRETS)
1508
2439
  env = {
@@ -1512,6 +2443,12 @@ class ArgoWorkflows(object):
1512
2443
  and k not in set(ARGO_WORKFLOWS_ENV_VARS_TO_SKIP.split(","))
1513
2444
  }
1514
2445
 
2446
+ # OBP configs
2447
+ additional_obp_configs = {
2448
+ "OBP_PERIMETER": self.initial_configs["OBP_PERIMETER"],
2449
+ "OBP_INTEGRATIONS_URL": self.initial_configs["OBP_INTEGRATIONS_URL"],
2450
+ }
2451
+
1515
2452
  # Tmpfs variables
1516
2453
  use_tmpfs = resources["use_tmpfs"]
1517
2454
  tmpfs_size = resources["tmpfs_size"]
@@ -1528,262 +2465,938 @@ class ArgoWorkflows(object):
1528
2465
 
1529
2466
  if tmpfs_enabled and tmpfs_tempdir:
1530
2467
  env["METAFLOW_TEMPDIR"] = tmpfs_path
2468
+
2469
+ qos_requests, qos_limits = qos_requests_and_limits(
2470
+ resources["qos"],
2471
+ resources["cpu"],
2472
+ resources["memory"],
2473
+ resources["disk"],
2474
+ )
2475
+
2476
+ security_context = resources.get("security_context", None)
2477
+ _security_context = {}
2478
+ if security_context is not None and len(security_context) > 0:
2479
+ _security_context = {
2480
+ "security_context": kubernetes_sdk.V1SecurityContext(
2481
+ **security_context
2482
+ )
2483
+ }
2484
+
1531
2485
  # Create a ContainerTemplate for this node. Ideally, we would have
1532
2486
  # liked to inline this ContainerTemplate and avoid scanning the workflow
1533
2487
  # twice, but due to issues with variable substitution, we will have to
1534
2488
  # live with this routine.
1535
- yield (
1536
- Template(self._sanitize(node.name))
1537
- # Set @timeout values
1538
- .active_deadline_seconds(run_time_limit)
1539
- # Set service account
1540
- .service_account_name(resources["service_account"])
1541
- # Configure template input
1542
- .inputs(Inputs().parameters(inputs))
1543
- # Configure template output
1544
- .outputs(Outputs().parameters(outputs))
1545
- # Fail fast!
1546
- .fail_fast()
1547
- # Set @retry/@catch values
1548
- .retry_strategy(
1549
- times=total_retries,
1550
- minutes_between_retries=minutes_between_retries,
1551
- )
1552
- .metadata(
1553
- ObjectMeta().annotation("metaflow/step_name", node.name)
1554
- # Unfortunately, we can't set the task_id since it is generated
1555
- # inside the pod. However, it can be inferred from the annotation
1556
- # set by argo-workflows - `workflows.argoproj.io/outputs` - refer
1557
- # the field 'task-id' in 'parameters'
1558
- # .annotation("metaflow/task_id", ...)
1559
- .annotation("metaflow/attempt", retry_count)
1560
- )
1561
- # Set emptyDir volume for state management
1562
- .empty_dir_volume("out")
1563
- # Set tmpfs emptyDir volume if enabled
1564
- .empty_dir_volume(
1565
- "tmpfs-ephemeral-volume",
1566
- medium="Memory",
1567
- size_limit=tmpfs_size if tmpfs_enabled else 0,
1568
- )
1569
- .empty_dir_volume("dhsm", medium="Memory", size_limit=shared_memory)
1570
- .pvc_volumes(resources.get("persistent_volume_claims"))
1571
- # Set node selectors
1572
- .node_selectors(resources.get("node_selector"))
1573
- # Set tolerations
1574
- .tolerations(resources.get("tolerations"))
1575
- # Set container
1576
- .container(
1577
- # TODO: Unify the logic with kubernetes.py
1578
- # Important note - Unfortunately, V1Container uses snakecase while
1579
- # Argo Workflows uses camel. For most of the attributes, both cases
1580
- # are indistinguishable, but unfortunately, not for all - (
1581
- # env_from, value_from, etc.) - so we need to handle the conversion
1582
- # ourselves using to_camelcase. We need to be vigilant about
1583
- # resources attributes in particular where the keys maybe user
1584
- # defined.
1585
- to_camelcase(
1586
- kubernetes_sdk.V1Container(
1587
- name=self._sanitize(node.name),
1588
- command=cmds,
1589
- ports=[kubernetes_sdk.V1ContainerPort(container_port=port)]
1590
- if port
1591
- else None,
1592
- env=[
1593
- kubernetes_sdk.V1EnvVar(name=k, value=str(v))
1594
- for k, v in env.items()
1595
- ]
1596
- # Add environment variables for book-keeping.
1597
- # https://argoproj.github.io/argo-workflows/fields/#fields_155
1598
- + [
1599
- kubernetes_sdk.V1EnvVar(
1600
- name=k,
1601
- value_from=kubernetes_sdk.V1EnvVarSource(
1602
- field_ref=kubernetes_sdk.V1ObjectFieldSelector(
1603
- field_path=str(v)
1604
- )
1605
- ),
1606
- )
1607
- for k, v in {
1608
- "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
1609
- "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
1610
- "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
1611
- "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
1612
- "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
1613
- }.items()
1614
- ],
1615
- image=resources["image"],
1616
- image_pull_policy=resources["image_pull_policy"],
1617
- resources=kubernetes_sdk.V1ResourceRequirements(
1618
- requests={
1619
- "cpu": str(resources["cpu"]),
1620
- "memory": "%sM" % str(resources["memory"]),
1621
- "ephemeral-storage": "%sM" % str(resources["disk"]),
1622
- },
1623
- limits={
1624
- "%s.com/gpu".lower()
1625
- % resources["gpu_vendor"]: str(resources["gpu"])
1626
- for k in [0]
1627
- if resources["gpu"] is not None
1628
- },
1629
- ),
1630
- # Configure secrets
1631
- env_from=[
1632
- kubernetes_sdk.V1EnvFromSource(
1633
- secret_ref=kubernetes_sdk.V1SecretEnvSource(
1634
- name=str(k),
1635
- # optional=True
1636
- )
1637
- )
1638
- for k in list(
2489
+ if node.parallel_step:
2490
+ jobset_name = "{{inputs.parameters.jobset-name}}"
2491
+ jobset = KubernetesArgoJobSet(
2492
+ kubernetes_sdk=kubernetes_sdk,
2493
+ name=jobset_name,
2494
+ flow_name=self.flow.name,
2495
+ run_id=run_id,
2496
+ step_name=self._sanitize(node.name),
2497
+ task_id=task_id,
2498
+ attempt=retry_count,
2499
+ user=self.username,
2500
+ subdomain=jobset_name,
2501
+ command=cmds,
2502
+ namespace=resources["namespace"],
2503
+ image=resources["image"],
2504
+ image_pull_policy=resources["image_pull_policy"],
2505
+ image_pull_secrets=resources["image_pull_secrets"],
2506
+ service_account=resources["service_account"],
2507
+ secrets=(
2508
+ [
2509
+ k
2510
+ for k in (
2511
+ list(
1639
2512
  []
1640
2513
  if not resources.get("secrets")
1641
- else [resources.get("secrets")]
1642
- if isinstance(resources.get("secrets"), str)
1643
- else resources.get("secrets")
2514
+ else (
2515
+ [resources.get("secrets")]
2516
+ if isinstance(resources.get("secrets"), str)
2517
+ else resources.get("secrets")
2518
+ )
1644
2519
  )
1645
2520
  + KUBERNETES_SECRETS.split(",")
1646
2521
  + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
1647
- if k
1648
- ],
1649
- volume_mounts=[
1650
- # Assign a volume mount to pass state to the next task.
1651
- kubernetes_sdk.V1VolumeMount(
1652
- name="out", mount_path="/mnt/out"
1653
- )
2522
+ )
2523
+ if k
2524
+ ]
2525
+ ),
2526
+ node_selector=resources.get("node_selector"),
2527
+ cpu=str(resources["cpu"]),
2528
+ memory=str(resources["memory"]),
2529
+ disk=str(resources["disk"]),
2530
+ gpu=resources["gpu"],
2531
+ gpu_vendor=str(resources["gpu_vendor"]),
2532
+ tolerations=resources["tolerations"],
2533
+ use_tmpfs=use_tmpfs,
2534
+ tmpfs_tempdir=tmpfs_tempdir,
2535
+ tmpfs_size=tmpfs_size,
2536
+ tmpfs_path=tmpfs_path,
2537
+ timeout_in_seconds=run_time_limit,
2538
+ persistent_volume_claims=resources["persistent_volume_claims"],
2539
+ shared_memory=shared_memory,
2540
+ port=port,
2541
+ qos=resources["qos"],
2542
+ security_context=security_context,
2543
+ )
2544
+
2545
+ for k, v in env.items():
2546
+ jobset.environment_variable(k, v)
2547
+
2548
+ for k, v in additional_obp_configs.items():
2549
+ jobset.environment_variable(k, v)
2550
+ # Set labels. Do not allow user-specified task labels to override internal ones.
2551
+ #
2552
+ # Explicitly add the task-id-hint label. This is important because this label
2553
+ # is returned as an Output parameter of this step and is used subsequently as an
2554
+ # input in the join step.
2555
+ kubernetes_labels = {
2556
+ "task_id_entropy": "{{inputs.parameters.task-id-entropy}}",
2557
+ "num_parallel": "{{inputs.parameters.num-parallel}}",
2558
+ "metaflow/argo-workflows-name": "{{workflow.name}}",
2559
+ "workflows.argoproj.io/workflow": "{{workflow.name}}",
2560
+ }
2561
+ jobset.labels(
2562
+ {
2563
+ **resources["labels"],
2564
+ **self._base_labels,
2565
+ **kubernetes_labels,
2566
+ }
2567
+ )
2568
+
2569
+ jobset.environment_variable(
2570
+ "MF_MASTER_ADDR", jobset.jobset_control_addr
2571
+ )
2572
+ jobset.environment_variable("MF_MASTER_PORT", str(port))
2573
+ jobset.environment_variable(
2574
+ "MF_WORLD_SIZE", "{{inputs.parameters.num-parallel}}"
2575
+ )
2576
+ # We need this task-id set so that all the nodes are aware of the control
2577
+ # task's task-id. These "MF_" variables populate the `current.parallel` namedtuple
2578
+ jobset.environment_variable(
2579
+ "MF_PARALLEL_CONTROL_TASK_ID",
2580
+ "control-{{inputs.parameters.task-id-entropy}}-0",
2581
+ )
2582
2583
+ jobset.environment_variables_from_selectors(
2584
+ {
2585
+ "MF_WORKER_REPLICA_INDEX": "metadata.annotations['jobset.sigs.k8s.io/job-index']",
2586
+ "JOBSET_RESTART_ATTEMPT": "metadata.annotations['jobset.sigs.k8s.io/restart-attempt']",
2587
+ "METAFLOW_KUBERNETES_JOBSET_NAME": "metadata.annotations['jobset.sigs.k8s.io/jobset-name']",
2588
+ "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
2589
+ "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
2590
+ "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
2591
+ "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
2592
+ "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
2593
+ "TASK_ID_SUFFIX": "metadata.annotations['jobset.sigs.k8s.io/job-index']",
2594
+ }
2595
+ )
2596
+
2597
+ # Set annotations. Do not allow user-specified task-specific annotations to override internal ones.
2598
+ annotations = {
2599
+ # Setting annotations explicitly as they won't be
2600
+ # passed down from WorkflowTemplate level
2601
+ "metaflow/step_name": node.name,
2602
+ "metaflow/attempt": str(retry_count),
2603
+ "metaflow/run_id": run_id,
2604
+ }
2605
+
2606
+ jobset.annotations(
2607
+ {
2608
+ **resources["annotations"],
2609
+ **self._base_annotations,
2610
+ **annotations,
2611
+ }
2612
+ )
2613
+
2614
+ jobset.control.replicas(1)
2615
+ jobset.worker.replicas("{{=asInt(inputs.parameters.workerCount)}}")
2616
+ jobset.control.environment_variable("UBF_CONTEXT", UBF_CONTROL)
2617
+ jobset.worker.environment_variable("UBF_CONTEXT", UBF_TASK)
2618
+ jobset.control.environment_variable("MF_CONTROL_INDEX", "0")
2619
+ # `TASK_ID_PREFIX` needs to explicitly be `control` or `worker`
2620
+ # because the join task uses a formulaic approach to infer the task-ids
2621
+ jobset.control.environment_variable("TASK_ID_PREFIX", "control")
2622
+ jobset.worker.environment_variable("TASK_ID_PREFIX", "worker")
2623
+
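
Given the `TASK_ID_PREFIX`/`TASK_ID_SUFFIX` wiring above, the join step can reconstruct the @parallel task-ids formulaically from `task-id-entropy` and `num-parallel`. A hedged sketch, assuming a hypothetical entropy value and worker job indices starting at 0:

```python
# Illustrative only: how @parallel task-ids line up with the env vars set above,
# for a hypothetical task-id-entropy of "a1b2c3" and num_parallel=3.
entropy, num_parallel = "a1b2c3", 3
worker_count = num_parallel - 1                       # one replica is the control task
control_task_id = "control-%s-0" % entropy            # matches MF_PARALLEL_CONTROL_TASK_ID above
worker_task_ids = [
    "worker-%s-%d" % (entropy, job_index)             # TASK_ID_PREFIX-{entropy}-TASK_ID_SUFFIX
    for job_index in range(worker_count)
]
print(control_task_id, worker_task_ids)
```
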
2624
+ yield (
2625
+ Template(ArgoWorkflows._sanitize(node.name))
2626
+ .resource(
2627
+ "create",
2628
+ jobset.dump(),
2629
+ "status.terminalState == Completed",
2630
+ "status.terminalState == Failed",
2631
+ )
2632
+ .inputs(Inputs().parameters(inputs))
2633
+ .outputs(
2634
+ Outputs().parameters(
2635
+ [
2636
+ Parameter("task-id-entropy").valueFrom(
2637
+ {"jsonPath": "{.metadata.labels.task_id_entropy}"}
2638
+ ),
2639
+ Parameter("num-parallel").valueFrom(
2640
+ {"jsonPath": "{.metadata.labels.num_parallel}"}
2641
+ ),
1654
2642
  ]
1655
- # Support tmpfs.
1656
- + (
1657
- [
1658
- kubernetes_sdk.V1VolumeMount(
1659
- name="tmpfs-ephemeral-volume",
1660
- mount_path=tmpfs_path,
1661
- )
2643
+ )
2644
+ )
2645
+ .retry_strategy(
2646
+ times=total_retries,
2647
+ minutes_between_retries=minutes_between_retries,
2648
+ )
2649
+ )
2650
+ else:
2651
+ template_name = self._sanitize(node.name)
2652
+ if self._is_recursive_node(node):
2653
+ # The recursive template has the original step name,
2654
+ # this becomes a template within the recursive ones 'steps'
2655
+ template_name = self._sanitize("recursive-%s" % node.name)
2656
+ yield (
2657
+ Template(template_name)
2658
+ # Set @timeout values
2659
+ .active_deadline_seconds(run_time_limit)
2660
+ # Set service account
2661
+ .service_account_name(resources["service_account"])
2662
+ # Configure template input
2663
+ .inputs(Inputs().parameters(inputs))
2664
+ # Configure template output
2665
+ .outputs(Outputs().parameters(outputs))
2666
+ # Fail fast!
2667
+ .fail_fast()
2668
+ # Set @retry/@catch values
2669
+ .retry_strategy(
2670
+ times=total_retries,
2671
+ minutes_between_retries=minutes_between_retries,
2672
+ )
2673
+ .metadata(
2674
+ ObjectMeta()
2675
+ .annotation("metaflow/step_name", node.name)
2676
+ # Unfortunately, we can't set the task_id since it is generated
2677
+ # inside the pod. However, it can be inferred from the annotation
2678
+ # set by argo-workflows - `workflows.argoproj.io/outputs` - refer
2679
+ # the field 'task-id' in 'parameters'
2680
+ # .annotation("metaflow/task_id", ...)
2681
+ .annotation("metaflow/attempt", retry_count)
2682
+ .annotations(resources["annotations"])
2683
+ .labels(resources["labels"])
2684
+ )
2685
+ # Set emptyDir volume for state management
2686
+ .empty_dir_volume("out")
2687
+ # Set tmpfs emptyDir volume if enabled
2688
+ .empty_dir_volume(
2689
+ "tmpfs-ephemeral-volume",
2690
+ medium="Memory",
2691
+ size_limit=tmpfs_size if tmpfs_enabled else 0,
2692
+ )
2693
+ .empty_dir_volume("dhsm", medium="Memory", size_limit=shared_memory)
2694
+ .pvc_volumes(resources.get("persistent_volume_claims"))
2695
+ # Set node selectors
2696
+ .node_selectors(resources.get("node_selector"))
2697
+ # Set tolerations
2698
+ .tolerations(resources.get("tolerations"))
2699
+ # Set image pull secrets if present. We need to use pod_spec_patch due to Argo not supporting this on a template level.
2700
+ .pod_spec_patch(
2701
+ {
2702
+ "imagePullSecrets": [
2703
+ {"name": secret}
2704
+ for secret in resources["image_pull_secrets"]
2705
+ ]
2706
+ }
2707
+ if resources["image_pull_secrets"]
2708
+ else None
2709
+ )
2710
+ # Set container
2711
+ .container(
2712
+ # TODO: Unify the logic with kubernetes.py
2713
+ # Important note - Unfortunately, V1Container uses snakecase while
2714
+ # Argo Workflows uses camel. For most of the attributes, both cases
2715
+ # are indistinguishable, but unfortunately, not for all - (
2716
+ # env_from, value_from, etc.) - so we need to handle the conversion
2717
+ # ourselves using to_camelcase. We need to be vigilant about
2718
+ # resources attributes in particular where the keys maybe user
2719
+ # defined.
2720
+ to_camelcase(
2721
+ kubernetes_sdk.V1Container(
2722
+ name=self._sanitize(node.name),
2723
+ command=cmds,
2724
+ termination_message_policy="FallbackToLogsOnError",
2725
+ ports=(
2726
+ [
2727
+ kubernetes_sdk.V1ContainerPort(
2728
+ container_port=port
2729
+ )
2730
+ ]
2731
+ if port
2732
+ else None
2733
+ ),
2734
+ env=[
2735
+ kubernetes_sdk.V1EnvVar(name=k, value=str(v))
2736
+ for k, v in env.items()
1662
2737
  ]
1663
- if tmpfs_enabled
1664
- else []
1665
- )
1666
- # Support shared_memory
1667
- + (
1668
- [
1669
- kubernetes_sdk.V1VolumeMount(
1670
- name="dhsm",
1671
- mount_path="/dev/shm",
2738
+ # Add environment variables for book-keeping.
2739
+ # https://argoproj.github.io/argo-workflows/fields/#fields_155
2740
+ + [
2741
+ kubernetes_sdk.V1EnvVar(
2742
+ name=k,
2743
+ value_from=kubernetes_sdk.V1EnvVarSource(
2744
+ field_ref=kubernetes_sdk.V1ObjectFieldSelector(
2745
+ field_path=str(v)
2746
+ )
2747
+ ),
1672
2748
  )
2749
+ for k, v in {
2750
+ "METAFLOW_KUBERNETES_NAMESPACE": "metadata.namespace",
2751
+ "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
2752
+ "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
2753
+ "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
2754
+ "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
2755
+ "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
2756
+ }.items()
1673
2757
  ]
1674
- if shared_memory
1675
- else []
1676
- )
1677
- # Support persistent volume claims.
1678
- + (
1679
- [
2758
+ + [
2759
+ kubernetes_sdk.V1EnvVar(
2760
+ name=k,
2761
+ value=v,
2762
+ )
2763
+ for k, v in additional_obp_configs.items()
2764
+ ],
2765
+ image=resources["image"],
2766
+ image_pull_policy=resources["image_pull_policy"],
2767
+ resources=kubernetes_sdk.V1ResourceRequirements(
2768
+ requests=qos_requests,
2769
+ limits={
2770
+ **qos_limits,
2771
+ **{
2772
+ "%s.com/gpu".lower()
2773
+ % resources["gpu_vendor"]: str(
2774
+ resources["gpu"]
2775
+ )
2776
+ for k in [0]
2777
+ if resources["gpu"] is not None
2778
+ },
2779
+ },
2780
+ ),
2781
+ # Configure secrets
2782
+ env_from=[
2783
+ kubernetes_sdk.V1EnvFromSource(
2784
+ secret_ref=kubernetes_sdk.V1SecretEnvSource(
2785
+ name=str(k),
2786
+ # optional=True
2787
+ )
2788
+ )
2789
+ for k in list(
2790
+ []
2791
+ if not resources.get("secrets")
2792
+ else (
2793
+ [resources.get("secrets")]
2794
+ if isinstance(resources.get("secrets"), str)
2795
+ else resources.get("secrets")
2796
+ )
2797
+ )
2798
+ + KUBERNETES_SECRETS.split(",")
2799
+ + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
2800
+ if k
2801
+ ],
2802
+ volume_mounts=[
2803
+ # Assign a volume mount to pass state to the next task.
1680
2804
  kubernetes_sdk.V1VolumeMount(
1681
- name=claim, mount_path=path
2805
+ name="out", mount_path="/mnt/out"
1682
2806
  )
1683
- for claim, path in resources.get(
1684
- "persistent_volume_claims"
1685
- ).items()
1686
2807
  ]
1687
- if resources.get("persistent_volume_claims") is not None
1688
- else []
1689
- ),
1690
- ).to_dict()
2808
+ # Support tmpfs.
2809
+ + (
2810
+ [
2811
+ kubernetes_sdk.V1VolumeMount(
2812
+ name="tmpfs-ephemeral-volume",
2813
+ mount_path=tmpfs_path,
2814
+ )
2815
+ ]
2816
+ if tmpfs_enabled
2817
+ else []
2818
+ )
2819
+ # Support shared_memory
2820
+ + (
2821
+ [
2822
+ kubernetes_sdk.V1VolumeMount(
2823
+ name="dhsm",
2824
+ mount_path="/dev/shm",
2825
+ )
2826
+ ]
2827
+ if shared_memory
2828
+ else []
2829
+ )
2830
+ # Support persistent volume claims.
2831
+ + (
2832
+ [
2833
+ kubernetes_sdk.V1VolumeMount(
2834
+ name=claim, mount_path=path
2835
+ )
2836
+ for claim, path in resources.get(
2837
+ "persistent_volume_claims"
2838
+ ).items()
2839
+ ]
2840
+ if resources.get("persistent_volume_claims")
2841
+ is not None
2842
+ else []
2843
+ ),
2844
+ **_security_context,
2845
+ ).to_dict()
2846
+ )
1691
2847
  )
1692
2848
  )
2849
+
2850
+ # Return daemon container templates that run for the duration of the workflow (currently just the heartbeat daemon).
2851
+ def _daemon_templates(self):
2852
+ templates = []
2853
+ if self.enable_heartbeat_daemon:
2854
+ templates.append(self._heartbeat_daemon_template())
2855
+ return templates
2856
+
2857
+ # Return lifecycle hooks for workflow execution notifications.
2858
+ def _lifecycle_hooks(self):
2859
+ hooks = []
2860
+ if self.notify_on_error:
2861
+ hooks.append(self._slack_error_template())
2862
+ hooks.append(self._pager_duty_alert_template())
2863
+ hooks.append(self._incident_io_alert_template())
2864
+ if self.notify_on_success:
2865
+ hooks.append(self._slack_success_template())
2866
+ hooks.append(self._pager_duty_change_template())
2867
+ hooks.append(self._incident_io_change_template())
2868
+
2869
+ exit_hook_decos = self.flow._flow_decorators.get("exit_hook", [])
2870
+
2871
+ for deco in exit_hook_decos:
2872
+ hooks.extend(self._lifecycle_hook_from_deco(deco))
2873
+
2874
+ # Clean up None values from templates.
2875
+ hooks = list(filter(None, hooks))
2876
+
2877
+ if hooks:
2878
+ hooks.append(
2879
+ ExitHookHack(
2880
+ url=(
2881
+ self.notify_slack_webhook_url
2882
+ or "https://events.pagerduty.com/v2/enqueue"
2883
+ )
2884
+ )
2885
+ )
2886
+ return hooks
2887
+
2888
+ def _lifecycle_hook_from_deco(self, deco):
2889
+ from kubernetes import client as kubernetes_sdk
2890
+
2891
+ start_step = [step for step in self.graph if step.name == "start"][0]
2892
+ # We want to grab the base image used by the start step, as this is known to be pullable from within the cluster,
2893
+ # and it might contain the required libraries, allowing us to start up faster.
2894
+ start_kube_deco = [
2895
+ deco for deco in start_step.decorators if deco.name == "kubernetes"
2896
+ ][0]
2897
+ resources = dict(start_kube_deco.attributes)
2898
+ kube_defaults = dict(start_kube_deco.defaults)
2899
+
2900
+ # OBP Configs
2901
+ additional_obp_configs = {
2902
+ "OBP_PERIMETER": self.initial_configs["OBP_PERIMETER"],
2903
+ "OBP_INTEGRATIONS_URL": self.initial_configs["OBP_INTEGRATIONS_URL"],
2904
+ }
2905
+
2906
+ run_id_template = "argo-{{workflow.name}}"
2907
+ metaflow_version = self.environment.get_environment_info()
2908
+ metaflow_version["flow_name"] = self.graph.name
2909
+ metaflow_version["production_token"] = self.production_token
2910
+ env = {
2911
+ # These values are needed by Metaflow to set it's internal
2912
+ # state appropriately.
2913
+ "METAFLOW_CODE_URL": self.code_package_url,
2914
+ "METAFLOW_CODE_SHA": self.code_package_sha,
2915
+ "METAFLOW_CODE_DS": self.flow_datastore.TYPE,
2916
+ "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
2917
+ "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
2918
+ "METAFLOW_USER": "argo-workflows",
2919
+ "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
2920
+ "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
2921
+ "METAFLOW_OWNER": self.username,
2922
+ }
2923
+ # pass on the Run pathspec for script
2924
+ env["RUN_PATHSPEC"] = f"{self.graph.name}/{run_id_template}"
2925
+
2926
+ # support Metaflow sandboxes
2927
+ env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
2928
+
2929
+ # support fetching secrets
2930
+ env.update(additional_obp_configs)
2931
+
2932
+ env["METAFLOW_WORKFLOW_NAME"] = "{{workflow.name}}"
2933
+ env["METAFLOW_WORKFLOW_NAMESPACE"] = "{{workflow.namespace}}"
2934
+ env = {
2935
+ k: v
2936
+ for k, v in env.items()
2937
+ if v is not None
2938
+ and k not in set(ARGO_WORKFLOWS_ENV_VARS_TO_SKIP.split(","))
2939
+ }
2940
+
2941
+ def _cmd(fn_name):
2942
+ mflog_expr = export_mflog_env_vars(
2943
+ datastore_type=self.flow_datastore.TYPE,
2944
+ stdout_path="$PWD/.logs/mflog_stdout",
2945
+ stderr_path="$PWD/.logs/mflog_stderr",
2946
+ flow_name=self.flow.name,
2947
+ run_id=run_id_template,
2948
+ step_name=f"_hook_{fn_name}",
2949
+ task_id="1",
2950
+ retry_count="0",
2951
+ )
2952
+ cmds = " && ".join(
2953
+ [
2954
+ # For supporting sandboxes, ensure that a custom script is executed
2955
+ # before anything else is executed. The script is passed in as an
2956
+ # env var.
2957
+ '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"}',
2958
+ "mkdir -p $PWD/.logs",
2959
+ mflog_expr,
2960
+ ]
2961
+ + self.environment.get_package_commands(
2962
+ self.code_package_url, self.flow_datastore.TYPE
2963
+ )[:-1]
2964
+ # Replace the line 'Task in starting'
2965
+ + [f"mflog 'Lifecycle hook {fn_name} is starting.'"]
2966
+ + [
2967
+ f"python -m metaflow.plugins.exit_hook.exit_hook_script {metaflow_version['script']} {fn_name} $RUN_PATHSPEC"
2968
+ ]
2969
+ )
2970
+
2971
+ cmds = shlex.split('bash -c "%s"' % cmds)
2972
+ return cmds
2973
+
2974
+ def _container(cmds):
2975
+ return to_camelcase(
2976
+ kubernetes_sdk.V1Container(
2977
+ name="main",
2978
+ command=cmds,
2979
+ image=deco.attributes["options"].get("image", None)
2980
+ or resources["image"],
2981
+ env=[
2982
+ kubernetes_sdk.V1EnvVar(name=k, value=str(v))
2983
+ for k, v in env.items()
2984
+ ],
2985
+ env_from=[
2986
+ kubernetes_sdk.V1EnvFromSource(
2987
+ secret_ref=kubernetes_sdk.V1SecretEnvSource(
2988
+ name=str(k),
2989
+ # optional=True
2990
+ )
2991
+ )
2992
+ for k in list(
2993
+ []
2994
+ if not resources.get("secrets")
2995
+ else (
2996
+ [resources.get("secrets")]
2997
+ if isinstance(resources.get("secrets"), str)
2998
+ else resources.get("secrets")
2999
+ )
3000
+ )
3001
+ + KUBERNETES_SECRETS.split(",")
3002
+ + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
3003
+ if k
3004
+ ],
3005
+ resources=kubernetes_sdk.V1ResourceRequirements(
3006
+ requests={
3007
+ "cpu": str(kube_defaults["cpu"]),
3008
+ "memory": "%sM" % str(kube_defaults["memory"]),
3009
+ }
3010
+ ),
3011
+ ).to_dict()
3012
+ )
3013
+
3014
+ # create lifecycle hooks from deco
3015
+ hooks = []
3016
+ for success_fn_name in deco.success_hooks:
3017
+ hook = ContainerHook(
3018
+ name=f"success-{success_fn_name.replace('_', '-')}",
3019
+ container=_container(cmds=_cmd(success_fn_name)),
3020
+ service_account_name=resources["service_account"],
3021
+ on_success=True,
3022
+ )
3023
+ hooks.append(hook)
3024
+
3025
+ for error_fn_name in deco.error_hooks:
3026
+ hook = ContainerHook(
3027
+ name=f"error-{error_fn_name.replace('_', '-')}",
3028
+ service_account_name=resources["service_account"],
3029
+ container=_container(cmds=_cmd(error_fn_name)),
3030
+ on_error=True,
1693
3031
  )
3032
+ hooks.append(hook)
3033
+
3034
+ return hooks
1694
3035
 
1695
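As a rough illustration of the mapping implemented in _lifecycle_hook_from_deco above: each function listed in the decorator's success_hooks/error_hooks becomes one ContainerHook whose name is derived from the function name. The hook function names below are hypothetical.

# Hypothetical exit_hook attributes, mirroring deco.success_hooks / deco.error_hooks above.
success_hooks = ["notify_owner"]
error_hooks = ["page_oncall"]

hook_names = [f"success-{fn.replace('_', '-')}" for fn in success_hooks] + [
    f"error-{fn.replace('_', '-')}" for fn in error_hooks
]
print(hook_names)  # ['success-notify-owner', 'error-page-oncall']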
- # Return exit hook templates for workflow execution notifications.
  def _exit_hook_templates(self):
  templates = []
- if self.notify_on_error:
- templates.append(self._slack_error_template())
- templates.append(self._pager_duty_alert_template())
- if self.notify_on_success:
- templates.append(self._slack_success_template())
- templates.append(self._pager_duty_change_template())
- if self.notify_on_error or self.notify_on_success:
- # Warning: terrible hack to workaround a bug in Argo Workflow where the
- # templates listed above do not execute unless there is an
- # explicit exit hook. as and when this bug is patched, we should
- # remove this effectively no-op template.
- # Note: We use the Http template because changing this to an actual no-op container had the side-effect of
- # leaving LifecycleHooks in a pending state even when they have finished execution.
- templates.append(
- Template("exit-hook-hack").http(
- Http("GET")
- .url(
- self.notify_slack_webhook_url
- or "https://events.pagerduty.com/v2/enqueue"
- )
- .success_condition("true == true")
- )
- )
+ if self.enable_error_msg_capture:
+ templates.extend(self._error_msg_capture_hook_templates())
+
  return templates

+ def _error_msg_capture_hook_templates(self):
+ from kubernetes import client as kubernetes_sdk
+
+ start_step = [step for step in self.graph if step.name == "start"][0]
+ # We want to grab the base image used by the start step, as this is known to be pullable from within the cluster,
+ # and it might contain the required libraries, allowing us to start up faster.
+ resources = dict(
+ [deco for deco in start_step.decorators if deco.name == "kubernetes"][
+ 0
+ ].attributes
+ )
+
+ run_id_template = "argo-{{workflow.name}}"
+ metaflow_version = self.environment.get_environment_info()
+ metaflow_version["flow_name"] = self.graph.name
+ metaflow_version["production_token"] = self.production_token
+
+ mflog_expr = export_mflog_env_vars(
+ datastore_type=self.flow_datastore.TYPE,
+ stdout_path="$PWD/.logs/mflog_stdout",
+ stderr_path="$PWD/.logs/mflog_stderr",
+ flow_name=self.flow.name,
+ run_id=run_id_template,
+ step_name="_run_capture_error",
+ task_id="1",
+ retry_count="0",
+ )
+
+ cmds = " && ".join(
+ [
+ # For supporting sandboxes, ensure that a custom script is executed
+ # before anything else is executed. The script is passed in as an
+ # env var.
+ '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"}',
+ "mkdir -p $PWD/.logs",
+ mflog_expr,
+ ]
+ + self.environment.get_package_commands(
+ self.code_package_url,
+ self.flow_datastore.TYPE,
+ self.code_package_metadata,
+ )[:-1]
+ # Replace the line 'Task is starting'
+ # FIXME: this can be brittle.
+ + ["mflog 'Error capture hook is starting.'"]
+ + ["argo_error=$(python -m 'metaflow.plugins.argo.capture_error')"]
+ + ["export METAFLOW_ARGO_ERROR=$argo_error"]
+ + [
+ """python -c 'import json, os; error_obj=os.getenv(\\"METAFLOW_ARGO_ERROR\\");data=json.loads(error_obj); print(data[\\"message\\"])'"""
+ ]
+ + [
+ 'if [ -n \\"${METAFLOW_ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT}\\" ]; then eval \\"${METAFLOW_ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT}\\"; fi'
+ ]
+ )
+
+ # TODO: Also capture the first failed task id
+ cmds = shlex.split('bash -c "%s"' % cmds)
+ env = {
+ # These values are needed by Metaflow to set its internal
+ # state appropriately.
+ "METAFLOW_CODE_METADATA": self.code_package_metadata,
+ "METAFLOW_CODE_URL": self.code_package_url,
+ "METAFLOW_CODE_SHA": self.code_package_sha,
+ "METAFLOW_CODE_DS": self.flow_datastore.TYPE,
+ "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
+ "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
+ "METAFLOW_USER": "argo-workflows",
+ "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
+ "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+ "METAFLOW_OWNER": self.username,
+ }
+ # support Metaflow sandboxes
+ env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
+ env["METAFLOW_ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT"] = (
+ ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT
+ )
+
+ env["METAFLOW_WORKFLOW_NAME"] = "{{workflow.name}}"
+ env["METAFLOW_WORKFLOW_NAMESPACE"] = "{{workflow.namespace}}"
+ env["METAFLOW_ARGO_WORKFLOW_FAILURES"] = "{{workflow.failures}}"
+ env = {
+ k: v
+ for k, v in env.items()
+ if v is not None
+ and k not in set(ARGO_WORKFLOWS_ENV_VARS_TO_SKIP.split(","))
+ }
+ return [
+ Template("error-msg-capture-hook")
+ .service_account_name(resources["service_account"])
+ .container(
+ to_camelcase(
+ kubernetes_sdk.V1Container(
+ name="main",
+ command=cmds,
+ image=resources["image"],
+ env=[
+ kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+ for k, v in env.items()
+ ],
+ env_from=[
+ kubernetes_sdk.V1EnvFromSource(
+ secret_ref=kubernetes_sdk.V1SecretEnvSource(
+ name=str(k),
+ # optional=True
+ )
+ )
+ for k in list(
+ []
+ if not resources.get("secrets")
+ else (
+ [resources.get("secrets")]
+ if isinstance(resources.get("secrets"), str)
+ else resources.get("secrets")
+ )
+ )
+ + KUBERNETES_SECRETS.split(",")
+ + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+ if k
+ ],
+ resources=kubernetes_sdk.V1ResourceRequirements(
+ # NOTE: base resources for this are kept to a minimum to save on running costs.
+ # This has an adverse effect on startup time for the daemon, which can be completely
+ # alleviated by using a base image that has the required dependencies pre-installed
+ requests={
+ "cpu": "200m",
+ "memory": "100Mi",
+ },
+ limits={
+ "cpu": "200m",
+ "memory": "500Mi",
+ },
+ ),
+ ).to_dict()
+ )
+ ),
+ Template("capture-error-hook-fn-preflight").steps(
+ [
+ WorkflowStep()
+ .name("capture-error-hook-fn-preflight")
+ .template("error-msg-capture-hook")
+ .when("{{workflow.status}} != Succeeded")
+ ]
+ ),
+ ]
+
  def _pager_duty_alert_template(self):
  # https://developer.pagerduty.com/docs/ZG9jOjExMDI5NTgx-send-an-alert-event
  if self.notify_pager_duty_integration_key is None:
  return None
- return Template("notify-pager-duty-on-error").http(
- Http("POST")
- .url("https://events.pagerduty.com/v2/enqueue")
- .header("Content-Type", "application/json")
- .body(
- json.dumps(
- {
- "event_action": "trigger",
- "routing_key": self.notify_pager_duty_integration_key,
- # "dedup_key": self.flow.name, # TODO: Do we need deduplication?
- "payload": {
- "source": "{{workflow.name}}",
- "severity": "info",
- "summary": "Metaflow run %s/argo-{{workflow.name}} failed!"
- % self.flow.name,
- "custom_details": {
- "Flow": self.flow.name,
- "Run ID": "argo-{{workflow.name}}",
- },
+ return HttpExitHook(
+ name="notify-pager-duty-on-error",
+ method="POST",
+ url="https://events.pagerduty.com/v2/enqueue",
+ headers={"Content-Type": "application/json"},
+ body=json.dumps(
+ {
+ "event_action": "trigger",
+ "routing_key": self.notify_pager_duty_integration_key,
+ # "dedup_key": self.flow.name, # TODO: Do we need deduplication?
+ "payload": {
+ "source": "{{workflow.name}}",
+ "severity": "info",
+ "summary": "Metaflow run %s/argo-{{workflow.name}} failed!"
+ % self.flow.name,
+ "custom_details": {
+ "Flow": self.flow.name,
+ "Run ID": "argo-{{workflow.name}}",
  },
- "links": self._pager_duty_notification_links(),
- }
- )
+ },
+ "links": self._pager_duty_notification_links(),
+ }
+ ),
+ on_error=True,
+ )
+
+ def _incident_io_alert_template(self):
+ if self.notify_incident_io_api_key is None:
+ return None
+ if self.incident_io_alert_source_config_id is None:
+ raise MetaflowException(
+ "Creating alerts for errors requires an alert source config ID."
+ )
+ ui_links = self._incident_io_ui_urls_for_run()
+ return HttpExitHook(
+ name="notify-incident-io-on-error",
+ method="POST",
+ url=(
+ "https://api.incident.io/v2/alert_events/http/%s"
+ % self.incident_io_alert_source_config_id
+ ),
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": "Bearer %s" % self.notify_incident_io_api_key,
+ },
+ body=json.dumps(
+ {
+ "idempotency_key": "argo-{{workflow.name}}", # use run id to deduplicate alerts.
+ "status": "firing",
+ "title": "Flow %s has failed." % self.flow.name,
+ "description": "Metaflow run {run_pathspec} failed!{urls}".format(
+ run_pathspec="%s/argo-{{workflow.name}}" % self.flow.name,
+ urls=(
+ "\n\nSee details for the run at:\n\n"
+ + "\n\n".join(ui_links)
+ if ui_links
+ else ""
+ ),
+ ),
+ "source_url": (
+ "%s/%s/%s"
+ % (
+ UI_URL.rstrip("/"),
+ self.flow.name,
+ "argo-{{workflow.name}}",
+ )
+ if UI_URL
+ else None
+ ),
+ "metadata": {
+ **(self.incident_io_metadata or {}),
+ **{
+ "run_status": "failed",
+ "flow_name": self.flow.name,
+ "run_id": "argo-{{workflow.name}}",
+ },
+ },
+ }
+ ),
+ on_error=True,
+ )
+
+ def _incident_io_change_template(self):
+ if self.notify_incident_io_api_key is None:
+ return None
+ if self.incident_io_alert_source_config_id is None:
+ raise MetaflowException(
+ "Creating alerts for successes requires an alert source config ID."
  )
+ ui_links = self._incident_io_ui_urls_for_run()
+ return HttpExitHook(
+ name="notify-incident-io-on-success",
+ method="POST",
+ url=(
+ "https://api.incident.io/v2/alert_events/http/%s"
+ % self.incident_io_alert_source_config_id
+ ),
+ headers={
+ "Content-Type": "application/json",
+ "Authorization": "Bearer %s" % self.notify_incident_io_api_key,
+ },
+ body=json.dumps(
+ {
+ "idempotency_key": "argo-{{workflow.name}}", # use run id to deduplicate alerts.
+ "status": "firing",
+ "title": "Flow %s has succeeded." % self.flow.name,
+ "description": "Metaflow run {run_pathspec} succeeded!{urls}".format(
+ run_pathspec="%s/argo-{{workflow.name}}" % self.flow.name,
+ urls=(
+ "\n\nSee details for the run at:\n\n"
+ + "\n\n".join(ui_links)
+ if ui_links
+ else ""
+ ),
+ ),
+ "source_url": (
+ "%s/%s/%s"
+ % (
+ UI_URL.rstrip("/"),
+ self.flow.name,
+ "argo-{{workflow.name}}",
+ )
+ if UI_URL
+ else None
+ ),
+ "metadata": {
+ **(self.incident_io_metadata or {}),
+ **{
+ "run_status": "succeeded",
+ "flow_name": self.flow.name,
+ "run_id": "argo-{{workflow.name}}",
+ },
+ },
+ }
+ ),
+ on_success=True,
  )

+ def _incident_io_ui_urls_for_run(self):
+ links = []
+ if UI_URL:
+ url = "[Metaflow UI](%s/%s/%s)" % (
+ UI_URL.rstrip("/"),
+ self.flow.name,
+ "argo-{{workflow.name}}",
+ )
+ links.append(url)
+ if ARGO_WORKFLOWS_UI_URL:
+ url = "[Argo UI](%s/workflows/%s/%s)" % (
+ ARGO_WORKFLOWS_UI_URL.rstrip("/"),
+ "{{workflow.namespace}}",
+ "{{workflow.name}}",
+ )
+ links.append(url)
+ return links
+
  def _pager_duty_change_template(self):
  # https://developer.pagerduty.com/docs/ZG9jOjExMDI5NTgy-send-a-change-event
  if self.notify_pager_duty_integration_key is None:
  return None
- return Template("notify-pager-duty-on-success").http(
- Http("POST")
- .url("https://events.pagerduty.com/v2/change/enqueue")
- .header("Content-Type", "application/json")
- .body(
- json.dumps(
- {
- "routing_key": self.notify_pager_duty_integration_key,
- "payload": {
- "summary": "Metaflow run %s/argo-{{workflow.name}} Succeeded"
- % self.flow.name,
- "source": "{{workflow.name}}",
- "custom_details": {
- "Flow": self.flow.name,
- "Run ID": "argo-{{workflow.name}}",
- },
+ return HttpExitHook(
+ name="notify-pager-duty-on-success",
+ method="POST",
+ url="https://events.pagerduty.com/v2/change/enqueue",
+ headers={"Content-Type": "application/json"},
+ body=json.dumps(
+ {
+ "routing_key": self.notify_pager_duty_integration_key,
+ "payload": {
+ "summary": "Metaflow run %s/argo-{{workflow.name}} Succeeded"
+ % self.flow.name,
+ "source": "{{workflow.name}}",
+ "custom_details": {
+ "Flow": self.flow.name,
+ "Run ID": "argo-{{workflow.name}}",
  },
- "links": self._pager_duty_notification_links(),
- }
- )
- )
+ },
+ "links": self._pager_duty_notification_links(),
+ }
+ ),
+ on_success=True,
  )

  def _pager_duty_notification_links(self):
  links = []
  if UI_URL:
+ if PAGERDUTY_TEMPLATE_URL:
+ pdproject = ""
+ pdbranch = ""
+ if getattr(current, "project_name", None):
+ pdproject = current.project_name
+ pdbranch = current.branch_name
+ href_val = PAGERDUTY_TEMPLATE_URL.format(
+ pd_flow=self.flow.name,
+ pd_namespace=KUBERNETES_NAMESPACE,
+ pd_template=self.name,
+ pd_project=pdproject,
+ pd_branch=pdbranch,
+ )
+ else:
+ href_val = "%s/%s/%s" % (
+ UI_URL.rstrip("/"),
+ self.flow.name,
+ "argo-{{workflow.name}}",
+ )
  links.append(
  {
- "href": "%s/%s/%s"
- % (UI_URL.rstrip("/"), self.flow.name, "argo-{{workflow.name}}"),
+ "href": href_val,
  "text": "Metaflow UI",
  }
  )
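The PAGERDUTY_TEMPLATE_URL branch above fills named placeholders with str.format; a small sketch with an assumed (hypothetical) template value:

# Hypothetical template URL; the pd_* placeholder names are the ones used in the diff above.
PAGERDUTY_TEMPLATE_URL = (
    "https://ui.example.com/flows/{pd_flow}?ns={pd_namespace}"
    "&template={pd_template}&project={pd_project}&branch={pd_branch}"
)
href_val = PAGERDUTY_TEMPLATE_URL.format(
    pd_flow="HelloFlow",
    pd_namespace="default",
    pd_template="helloflow",
    pd_project="",
    pd_branch="",
)
print(href_val)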
@@ -1807,7 +3420,7 @@ class ArgoWorkflows(object):
  Use Slack's Block Kit to add general information about the environment and
  execution metadata, including a link to the UI and an optional message.
  """
- ui_link = "%s%s/argo-{{workflow.name}}" % (UI_URL, self.flow.name)
+ ui_link = "%s/%s/argo-{{workflow.name}}" % (UI_URL.rstrip("/"), self.flow.name)
  # fmt: off
  if getattr(current, "project_name", None):
  # Add @project metadata when available.
@@ -1815,12 +3428,12 @@ class ArgoWorkflows(object):
  "type": "section",
  "text": {
  "type": "mrkdwn",
- "text": ":metaflow: Environment details"
+ "text": "Environment details"
  },
  "fields": [
  {
  "type": "mrkdwn",
- "text": "*Project:* %s" % current.project_name
+ "text": "*Project:* %s" % current.project_name
  },
  {
  "type": "mrkdwn",
@@ -1833,7 +3446,7 @@ class ArgoWorkflows(object):
  "type": "section",
  "text": {
  "type": "mrkdwn",
- "text": ":metaflow: Environment details"
+ "text": "Environment details"
  }
  }

@@ -1878,8 +3491,12 @@ class ArgoWorkflows(object):
  blocks = self._get_slack_blocks(message)
  payload = {"text": message, "blocks": blocks}

- return Template("notify-slack-on-error").http(
- Http("POST").url(self.notify_slack_webhook_url).body(json.dumps(payload))
+ return HttpExitHook(
+ name="notify-slack-on-error",
+ method="POST",
+ url=self.notify_slack_webhook_url,
+ body=json.dumps(payload),
+ on_error=True,
  )

  def _slack_success_template(self):
@@ -1894,8 +3511,178 @@ class ArgoWorkflows(object):
  blocks = self._get_slack_blocks(message)
  payload = {"text": message, "blocks": blocks}

- return Template("notify-slack-on-success").http(
- Http("POST").url(self.notify_slack_webhook_url).body(json.dumps(payload))
+ return HttpExitHook(
+ name="notify-slack-on-success",
+ method="POST",
+ url=self.notify_slack_webhook_url,
+ body=json.dumps(payload),
+ on_success=True,
+ )
+
+ def _heartbeat_daemon_template(self):
+ # Use all the affordances available to _parameters task
+ executable = self.environment.executable("_parameters")
+ run_id = "argo-{{workflow.name}}"
+ script_name = os.path.basename(sys.argv[0])
+ entrypoint = [executable, script_name]
+ # FlowDecorators can define their own top-level options. These might affect run level information
+ # so it is important to pass these to the heartbeat process as well, as it might be the first task to register a run.
+ top_opts_dict = {}
+ for deco in flow_decorators(self.flow):
+ top_opts_dict.update(deco.get_top_level_options())
+
+ top_level = list(dict_to_cli_options(top_opts_dict)) + [
+ "--quiet",
+ "--metadata=%s" % self.metadata.TYPE,
+ "--environment=%s" % self.environment.TYPE,
+ "--datastore=%s" % self.flow_datastore.TYPE,
+ "--datastore-root=%s" % self.flow_datastore.datastore_root,
+ "--event-logger=%s" % self.event_logger.TYPE,
+ "--monitor=%s" % self.monitor.TYPE,
+ "--no-pylint",
+ "--with=argo_workflows_internal:auto-emit-argo-events=%i"
+ % self.auto_emit_argo_events,
+ ]
+ heartbeat_cmds = "{entrypoint} {top_level} argo-workflows heartbeat --run_id {run_id} {tags}".format(
+ entrypoint=" ".join(entrypoint),
+ top_level=" ".join(top_level) if top_level else "",
+ run_id=run_id,
+ tags=" ".join(["--tag %s" % t for t in self.tags]) if self.tags else "",
+ )
+
+ # TODO: we do not really need MFLOG logging for the daemon at the moment, but might be good for the future.
+ # Consider if we can do without this setup.
+ # Configure log capture.
+ mflog_expr = export_mflog_env_vars(
+ datastore_type=self.flow_datastore.TYPE,
+ stdout_path="$PWD/.logs/mflog_stdout",
+ stderr_path="$PWD/.logs/mflog_stderr",
+ flow_name=self.flow.name,
+ run_id=run_id,
+ step_name="_run_heartbeat_daemon",
+ task_id="1",
+ retry_count="0",
+ )
+ # TODO: Can the init be trimmed down?
+ # Can we do without get_package_commands fetching the whole code package?
+ init_cmds = " && ".join(
+ [
+ # For supporting sandboxes, ensure that a custom script is executed
+ # before anything else is executed. The script is passed in as an
+ # env var.
+ '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"}',
+ "mkdir -p $PWD/.logs",
+ mflog_expr,
+ ]
+ + self.environment.get_package_commands(
+ self.code_package_url,
+ self.flow_datastore.TYPE,
+ )[:-1]
+ # Replace the line 'Task is starting'
+ # FIXME: this can be brittle.
+ + ["mflog 'Heartbeat daemon is starting.'"]
+ )
+
+ cmd_str = " && ".join([init_cmds, heartbeat_cmds])
+ cmds = shlex.split('bash -c "%s"' % cmd_str)
+
+ # Env required for sending heartbeats to the metadata service, nothing extra.
+ # prod token / runtime info is required to correctly register flow branches
+ env = {
+ # These values are needed by Metaflow to set its internal
+ # state appropriately.
+ "METAFLOW_CODE_METADATA": self.code_package_metadata,
+ "METAFLOW_CODE_URL": self.code_package_url,
+ "METAFLOW_CODE_SHA": self.code_package_sha,
+ "METAFLOW_CODE_DS": self.flow_datastore.TYPE,
+ "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
+ "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
+ "METAFLOW_USER": "argo-workflows",
+ "METAFLOW_DATASTORE_SYSROOT_S3": DATASTORE_SYSROOT_S3,
+ "METAFLOW_DATATOOLS_S3ROOT": DATATOOLS_S3ROOT,
+ "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
+ "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+ "METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
+ "METAFLOW_KUBERNETES_WORKLOAD": 1,
+ "METAFLOW_KUBERNETES_FETCH_EC2_METADATA": KUBERNETES_FETCH_EC2_METADATA,
+ "METAFLOW_RUNTIME_ENVIRONMENT": "kubernetes",
+ "METAFLOW_OWNER": self.username,
+ "METAFLOW_PRODUCTION_TOKEN": self.production_token, # Used in identity resolving. This affects system tags.
+ }
+ # support Metaflow sandboxes
+ env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
+
+ # cleanup env values
+ env = {
+ k: v
+ for k, v in env.items()
+ if v is not None
+ and k not in set(ARGO_WORKFLOWS_ENV_VARS_TO_SKIP.split(","))
+ }
+
+ # We want to grab the base image used by the start step, as this is known to be pullable from within the cluster,
+ # and it might contain the required libraries, allowing us to start up faster.
+ start_step = next(step for step in self.flow if step.name == "start")
+ resources = dict(
+ [deco for deco in start_step.decorators if deco.name == "kubernetes"][
+ 0
+ ].attributes
+ )
+ from kubernetes import client as kubernetes_sdk
+
+ return (
+ DaemonTemplate("heartbeat-daemon")
+ # NOTE: Even though a retry strategy does not work for Argo daemon containers,
+ # this has the side-effect of protecting the exit hooks of the workflow from failing in case the daemon container errors out.
+ .retry_strategy(10, 1)
+ .service_account_name(resources["service_account"])
+ .container(
+ to_camelcase(
+ kubernetes_sdk.V1Container(
+ name="main",
+ # TODO: Make the image configurable
+ image=resources["image"],
+ command=cmds,
+ env=[
+ kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+ for k, v in env.items()
+ ],
+ env_from=[
+ kubernetes_sdk.V1EnvFromSource(
+ secret_ref=kubernetes_sdk.V1SecretEnvSource(
+ name=str(k),
+ # optional=True
+ )
+ )
+ for k in list(
+ []
+ if not resources.get("secrets")
+ else (
+ [resources.get("secrets")]
+ if isinstance(resources.get("secrets"), str)
+ else resources.get("secrets")
+ )
+ )
+ + KUBERNETES_SECRETS.split(",")
+ + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+ if k
+ ],
+ resources=kubernetes_sdk.V1ResourceRequirements(
+ # NOTE: base resources for this are kept to a minimum to save on running costs.
+ # This has an adverse effect on startup time for the daemon, which can be completely
+ # alleviated by using a base image that has the required dependencies pre-installed
+ requests={
+ "cpu": "200m",
+ "memory": "100Mi",
+ },
+ limits={
+ "cpu": "200m",
+ "memory": "100Mi",
+ },
+ ),
+ )
+ ).to_dict()
+ )
  )

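For orientation, roughly what heartbeat_cmds expands to for a hypothetical flow file and tag; the format string is the one used above, while the substituted values are made up:

heartbeat_cmds = "{entrypoint} {top_level} argo-workflows heartbeat --run_id {run_id} {tags}".format(
    entrypoint="python helloflow.py",  # hypothetical executable + script name
    top_level="--quiet --metadata=service --datastore=s3 --no-pylint",  # illustrative subset
    run_id="argo-{{workflow.name}}",
    tags="--tag team:data",
)
# -> python helloflow.py --quiet ... argo-workflows heartbeat --run_id argo-{{workflow.name}} --tag team:data
print(heartbeat_cmds)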
  def _compile_sensor(self):
@@ -1997,44 +3784,16 @@ class ArgoWorkflows(object):
  "sdk (https://pypi.org/project/kubernetes/) first."
  )

- labels = {"app.kubernetes.io/part-of": "metaflow"}
-
- annotations = {
- "metaflow/production_token": self.production_token,
- "metaflow/owner": self.username,
- "metaflow/user": "argo-workflows",
- "metaflow/flow_name": self.flow.name,
- }
- if current.get("project_name"):
- annotations.update(
- {
- "metaflow/project_name": current.project_name,
- "metaflow/branch_name": current.branch_name,
- "metaflow/project_flow_name": current.project_flow_name,
- }
- )
-
- # Useful to paint the UI
- trigger_annotations = {
- "metaflow/triggered_by": json.dumps(
- [
- {key: trigger.get(key) for key in ["name", "type"]}
- for trigger in self.triggers
- ]
- )
- }
-
  return (
  Sensor()
  .metadata(
  # Sensor metadata.
  ObjectMeta()
- .name(self.name.replace(".", "-"))
- .namespace(KUBERNETES_NAMESPACE)
+ .name(ArgoWorkflows._sensor_name(self.name))
+ .namespace(ARGO_EVENTS_SENSOR_NAMESPACE)
+ .labels(self._base_labels)
  .label("app.kubernetes.io/name", "metaflow-sensor")
- .label("app.kubernetes.io/part-of", "metaflow")
- .labels(self.kubernetes_labels)
- .annotations(annotations)
+ .annotations(self._base_annotations)
  )
  .spec(
  SensorSpec().template(
@@ -2044,7 +3803,7 @@ class ArgoWorkflows(object):
  ObjectMeta()
  .label("app.kubernetes.io/name", "metaflow-sensor")
  .label("app.kubernetes.io/part-of", "metaflow")
- .annotations(annotations)
+ .annotations(self._base_annotations)
  )
  .container(
  # Run sensor in guaranteed QoS. The sensor isn't doing a lot
@@ -2064,7 +3823,7 @@ class ArgoWorkflows(object):
  "memory": "250Mi",
  },
  ),
- )
+ ).to_dict()
  )
  )
  .service_account_name(ARGO_EVENTS_SERVICE_ACCOUNT)
@@ -2081,8 +3840,8 @@ class ArgoWorkflows(object):
  Trigger().template(
  TriggerTemplate(self.name)
  # Trigger a deployed workflow template
- .argo_workflow_trigger(
- ArgoWorkflowTrigger()
+ .k8s_trigger(
+ StandardK8STrigger()
  .source(
  {
  "resource": {
@@ -2091,6 +3850,7 @@ class ArgoWorkflows(object):
  "metadata": {
  "generateName": "%s-" % self.name,
  "namespace": KUBERNETES_NAMESPACE,
+ # Useful to paint the UI
  "annotations": {
  "metaflow/triggered_by": json.dumps(
  [
@@ -2139,8 +3899,21 @@ class ArgoWorkflows(object):
  # everything within the body.
  # NOTE: We need the conditional logic in order to successfully fall back to the default value
  # when the event payload does not contain a key for a parameter.
- data_template='{{ if (hasKey $.Input.body.payload "%s") }}{{- (.Input.body.payload.%s | toJson) -}}{{- else -}}{{ (fail "use-default-instead") }}{{- end -}}'
- % (v, v),
+ # NOTE: Keys might contain dashes, so use the safer 'get' for fetching the value
+ data_template='{{ if (hasKey $.Input.body.payload "%s") }}%s{{- else -}}{{ (fail "use-default-instead") }}{{- end -}}'
+ % (
+ v,
+ (
+ '{{- $pv:=(get $.Input.body.payload "%s") -}}{{ if kindIs "string" $pv }}{{- $pv | toRawJson -}}{{- else -}}{{ $pv | toRawJson | toRawJson }}{{- end -}}'
+ % v
+ if self.parameters[
+ parameter_name
+ ]["type"]
+ == "JSON"
+ else '{{- (get $.Input.body.payload "%s" | toRawJson) -}}'
+ % v
+ ),
+ ),
  # Unfortunately the sensor needs to
  # record the default values for
  # the parameters - there doesn't seem
@@ -2351,6 +4124,38 @@ class ObjectMeta(object):
  return json.dumps(self.to_json(), indent=4)


+ class WorkflowStep(object):
+ def __init__(self):
+ tree = lambda: defaultdict(tree)
+ self.payload = tree()
+
+ def name(self, name):
+ self.payload["name"] = str(name)
+ return self
+
+ def template(self, template):
+ self.payload["template"] = str(template)
+ return self
+
+ def arguments(self, arguments):
+ self.payload["arguments"] = arguments.to_json()
+ return self
+
+ def when(self, condition):
+ self.payload["when"] = str(condition)
+ return self
+
+ def step(self, expression):
+ self.payload["expression"] = str(expression)
+ return self
+
+ def to_json(self):
+ return self.payload
+
+ def __str__(self):
+ return json.dumps(self.to_json(), indent=4)
+
+
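A sketch of the payload shape that Template(...).steps([WorkflowStep()...]) is expected to produce for the capture-error-hook-fn-preflight template defined earlier in this diff; the dict below is hand-written for illustration (note that "steps" is a list of lists):

# Hand-written approximation of the serialized template payload.
expected = {
    "name": "capture-error-hook-fn-preflight",
    "steps": [
        [
            {
                "name": "capture-error-hook-fn-preflight",
                "template": "error-msg-capture-hook",
                "when": "{{workflow.status}} != Succeeded",
            }
        ]
    ],
}
print(expected)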
  class WorkflowSpec(object):
  # https://argoproj.github.io/argo-workflows/fields/#workflowspec
  # This object sets all Workflow level properties.
@@ -2381,6 +4186,11 @@ class WorkflowSpec(object):
  self.payload["entrypoint"] = entrypoint
  return self

+ def onExit(self, on_exit_template):
+ if on_exit_template:
+ self.payload["onExit"] = on_exit_template
+ return self
+
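A tiny sketch of what the new onExit() setter does to the workflow spec payload, mimicking the builder's defaultdict storage; the "exit-hook-hack" name is the template referenced earlier in the diff, while the entrypoint value is hypothetical:

from collections import defaultdict

tree = lambda: defaultdict(tree)
payload = tree()
payload["entrypoint"] = "mydag"  # hypothetical entrypoint template name
on_exit_template = "exit-hook-hack"
if on_exit_template:  # mirrors WorkflowSpec.onExit above: None is a no-op
    payload["onExit"] = on_exit_template
print(dict(payload))  # {'entrypoint': 'mydag', 'onExit': 'exit-hook-hack'}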
  def parallelism(self, parallelism):
  # Set parallelism at Workflow level
  self.payload["parallelism"] = int(parallelism)
@@ -2469,6 +4279,38 @@ class Metadata(object):
  return json.dumps(self.to_json(), indent=4)


+ class DaemonTemplate(object):
+ def __init__(self, name):
+ tree = lambda: defaultdict(tree)
+ self.name = name
+ self.payload = tree()
+ self.payload["daemon"] = True
+ self.payload["name"] = name
+
+ def container(self, container):
+ self.payload["container"] = container
+ return self
+
+ def service_account_name(self, service_account_name):
+ self.payload["serviceAccountName"] = service_account_name
+ return self
+
+ def retry_strategy(self, times, minutes_between_retries):
+ if times > 0:
+ self.payload["retryStrategy"] = {
+ "retryPolicy": "Always",
+ "limit": times,
+ "backoff": {"duration": "%sm" % minutes_between_retries},
+ }
+ return self
+
+ def to_json(self):
+ return self.payload
+
+ def __str__(self):
+ return json.dumps(self.payload, indent=4)
+
+
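For reference, the retryStrategy payload that DaemonTemplate("heartbeat-daemon").retry_strategy(10, 1) produces per the code above (values are the ones passed by the heartbeat daemon template earlier in this diff):

retry_strategy = {
    "retryPolicy": "Always",
    "limit": 10,
    "backoff": {"duration": "1m"},
}
print(retry_strategy)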
  class Template(object):
  # https://argoproj.github.io/argo-workflows/fields/#template

@@ -2487,6 +4329,18 @@ class Template(object):
  self.payload["dag"] = dag_template.to_json()
  return self

+ def steps(self, steps):
+ if "steps" not in self.payload:
+ self.payload["steps"] = []
+ # steps is a list of lists.
+ # hence we go over every item in the incoming list
+ # serialize it and then append the list to the payload
+ step_list = []
+ for step in steps:
+ step_list.append(step.to_json())
+ self.payload["steps"].append(step_list)
+ return self
+
  def container(self, container):
  # Luckily this can simply be V1Container and we are spared from writing more
  # boilerplate - https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Container.md.
@@ -2579,6 +4433,14 @@ class Template(object):
  )
  return self

+ def pod_spec_patch(self, pod_spec_patch=None):
+ if pod_spec_patch is None:
+ return self
+
+ self.payload["podSpecPatch"] = json.dumps(pod_spec_patch)
+
+ return self
+
  def node_selectors(self, node_selectors):
  if "nodeSelector" not in self.payload:
  self.payload["nodeSelector"] = {}
@@ -2593,6 +4455,15 @@ class Template(object):
  def to_json(self):
  return self.payload

+ def resource(self, action, manifest, success_criteria, failure_criteria):
+ self.payload["resource"] = {}
+ self.payload["resource"]["action"] = action
+ self.payload["resource"]["setOwnerReference"] = True
+ self.payload["resource"]["successCondition"] = success_criteria
+ self.payload["resource"]["failureCondition"] = failure_criteria
+ self.payload["resource"]["manifest"] = manifest
+ return self
+
  def __str__(self):
  return json.dumps(self.payload, indent=4)

@@ -2712,6 +4583,10 @@ class DAGTask(object):
  self.payload["dependencies"] = dependencies
  return self

+ def depends(self, depends: str):
+ self.payload["depends"] = depends
+ return self
+
  def template(self, template):
  # Template reference
  self.payload["template"] = template
@@ -2723,6 +4598,10 @@ class DAGTask(object):
  self.payload["inline"] = template.to_json()
  return self

+ def when(self, when: str):
+ self.payload["when"] = when
+ return self
+
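The new depends()/when() setters forward Argo's enhanced-dependency and conditional-execution expressions verbatim; the expressions below are hypothetical examples of the strings a caller might pass:

# Hypothetical expressions; Argo evaluates them, these setters just store the strings.
depends_expr = "train.Succeeded && validate.Succeeded"
when_expr = "{{workflow.parameters.deploy}} == true"
payload = {"depends": depends_expr, "when": when_expr}
print(payload)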
  def with_param(self, with_param):
  self.payload["withParam"] = with_param
  return self
@@ -2942,6 +4821,10 @@ class TriggerTemplate(object):
  self.payload = tree()
  self.payload["name"] = name

+ def k8s_trigger(self, k8s_trigger):
+ self.payload["k8s"] = k8s_trigger.to_json()
+ return self
+
  def argo_workflow_trigger(self, argo_workflow_trigger):
  self.payload["argoWorkflow"] = argo_workflow_trigger.to_json()
  return self
@@ -3018,51 +4901,51 @@ class TriggerParameter(object):
  return json.dumps(self.payload, indent=4)


- class Http(object):
- # https://argoproj.github.io/argo-workflows/fields/#http
+ class StandardK8STrigger(object):
+ # https://pkg.go.dev/github.com/argoproj/argo-events/pkg/apis/sensor/v1alpha1#StandardK8STrigger

- def __init__(self, method):
+ def __init__(self):
  tree = lambda: defaultdict(tree)
  self.payload = tree()
- self.payload["method"] = method
- self.payload["headers"] = []
+ self.payload["operation"] = "create"

- def header(self, header, value):
- self.payload["headers"].append({"name": header, "value": value})
+ def operation(self, operation):
+ self.payload["operation"] = operation
  return self

- def body(self, body):
- self.payload["body"] = str(body)
+ def group(self, group):
+ self.payload["group"] = group
  return self

- def url(self, url):
- self.payload["url"] = url
+ def version(self, version):
+ self.payload["version"] = version
  return self

- def success_condition(self, success_condition):
- self.payload["successCondition"] = success_condition
+ def resource(self, resource):
+ self.payload["resource"] = resource
  return self

- def to_json(self):
- return self.payload
-
- def __str__(self):
- return json.dumps(self.payload, indent=4)
-
+ def namespace(self, namespace):
+ self.payload["namespace"] = namespace
+ return self

- class LifecycleHook(object):
- # https://argoproj.github.io/argo-workflows/fields/#lifecyclehook
+ def source(self, source):
+ self.payload["source"] = source
+ return self

- def __init__(self):
- tree = lambda: defaultdict(tree)
- self.payload = tree()
+ def parameters(self, trigger_parameters):
+ if "parameters" not in self.payload:
+ self.payload["parameters"] = []
+ for trigger_parameter in trigger_parameters:
+ self.payload["parameters"].append(trigger_parameter.to_json())
+ return self

- def expression(self, expression):
- self.payload["expression"] = str(expression)
+ def live_object(self, live_object=True):
+ self.payload["liveObject"] = live_object
  return self

- def template(self, template):
- self.payload["template"] = template
+ def patch_strategy(self, patch_strategy):
+ self.payload["patchStrategy"] = patch_strategy
  return self

  def to_json(self):