ob-metaflow 2.12.10.1rc2__py2.py3-none-any.whl → 2.12.11.0__py2.py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow might be problematic.
- metaflow/client/core.py +6 -6
- metaflow/client/filecache.py +16 -3
- metaflow/cmd/develop/stub_generator.py +62 -47
- metaflow/datastore/content_addressed_store.py +1 -1
- metaflow/datastore/task_datastore.py +1 -1
- metaflow/decorators.py +2 -4
- metaflow/extension_support/__init__.py +3 -3
- metaflow/extension_support/plugins.py +3 -3
- metaflow/metaflow_config.py +35 -18
- metaflow/parameters.py +3 -3
- metaflow/plugins/airflow/airflow.py +6 -6
- metaflow/plugins/airflow/airflow_utils.py +5 -3
- metaflow/plugins/argo/argo_workflows.py +555 -192
- metaflow/plugins/argo/argo_workflows_cli.py +27 -4
- metaflow/plugins/argo/argo_workflows_decorator.py +6 -13
- metaflow/plugins/argo/capture_error.py +70 -0
- metaflow/plugins/argo/daemon.py +59 -0
- metaflow/plugins/aws/step_functions/step_functions.py +3 -3
- metaflow/plugins/cards/card_modules/basic.py +5 -3
- metaflow/plugins/cards/card_modules/convert_to_native_type.py +2 -2
- metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
- metaflow/plugins/cards/card_modules/test_cards.py +0 -2
- metaflow/plugins/datastores/gs_storage.py +3 -10
- metaflow/plugins/datatools/s3/s3op.py +5 -3
- metaflow/plugins/kubernetes/kubernetes.py +1 -0
- metaflow/plugins/kubernetes/kubernetes_job.py +32 -42
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +16 -14
- metaflow/plugins/logs_cli.py +1 -0
- metaflow/plugins/pypi/conda_environment.py +1 -3
- metaflow/plugins/pypi/pip.py +3 -3
- metaflow/plugins/storage_executor.py +1 -5
- metaflow/plugins/tag_cli.py +3 -3
- metaflow/procpoll.py +1 -1
- metaflow/runtime.py +1 -0
- metaflow/tracing/__init__.py +0 -5
- metaflow/tracing/tracing_modules.py +1 -4
- metaflow/util.py +6 -6
- metaflow/version.py +1 -1
- {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/METADATA +2 -2
- {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/RECORD +44 -43
- metaflow/tracing/threadpool.py +0 -30
- {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/WHEEL +0 -0
- {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.12.10.1rc2.dist-info → ob_metaflow-2.12.11.0.dist-info}/top_level.txt +0 -0
metaflow/plugins/argo/argo_workflows.py

@@ -4,15 +4,15 @@ import os
 import re
 import shlex
 import sys
-from typing import Tuple, List
 from collections import defaultdict
 from hashlib import sha1
 from math import inf
+from typing import List, Tuple

 from metaflow import JSONType, current
-from metaflow.graph import DAGNode
 from metaflow.decorators import flow_decorators
 from metaflow.exception import MetaflowException
+from metaflow.graph import DAGNode, FlowGraph
 from metaflow.includefile import FilePathClass
 from metaflow.metaflow_config import (
     ARGO_EVENTS_EVENT,
@@ -21,10 +21,12 @@ from metaflow.metaflow_config import (
     ARGO_EVENTS_INTERNAL_WEBHOOK_URL,
     ARGO_EVENTS_SERVICE_ACCOUNT,
     ARGO_EVENTS_WEBHOOK_AUTH,
+    ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT,
     ARGO_WORKFLOWS_ENV_VARS_TO_SKIP,
     ARGO_WORKFLOWS_KUBERNETES_SECRETS,
     ARGO_WORKFLOWS_UI_URL,
     AWS_SECRETS_MANAGER_DEFAULT_REGION,
+    AZURE_KEY_VAULT_PREFIX,
     AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
     CARD_AZUREROOT,
     CARD_GSROOT,
@@ -36,7 +38,6 @@ from metaflow.metaflow_config import (
     DEFAULT_METADATA,
     DEFAULT_SECRETS_BACKEND_TYPE,
     GCP_SECRET_MANAGER_PREFIX,
-    AZURE_KEY_VAULT_PREFIX,
     KUBERNETES_FETCH_EC2_METADATA,
     KUBERNETES_LABELS,
     KUBERNETES_NAMESPACE,
@@ -50,7 +51,6 @@ from metaflow.metaflow_config import (
     UI_URL,
     PAGERDUTY_TEMPLATE_URL,
 )
-from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
 from metaflow.metaflow_config_funcs import config_values
 from metaflow.mflog import BASH_SAVE_LOGS, bash_capture_logs, export_mflog_env_vars
 from metaflow.parameters import deploy_time_eval
@@ -58,7 +58,8 @@ from metaflow.plugins.kubernetes.kubernetes import (
     parse_kube_keyvalue_list,
     validate_kube_labels,
 )
-from metaflow.
+from metaflow.plugins.kubernetes.kubernetes_jobsets import KubernetesArgoJobSet
+from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
 from metaflow.util import (
     compress_list,
     dict_to_cli_options,
@@ -66,9 +67,6 @@ from metaflow.util import (
     to_camelcase,
     to_unicode,
 )
-from metaflow.plugins.kubernetes.kubernetes_jobsets import (
-    KubernetesArgoJobSet,
-)

 from .argo_client import ArgoClient
@@ -118,6 +116,8 @@ class ArgoWorkflows(object):
         notify_on_success=False,
         notify_slack_webhook_url=None,
         notify_pager_duty_integration_key=None,
+        enable_heartbeat_daemon=True,
+        enable_error_msg_capture=False,
     ):
         # Some high-level notes -
         #
@@ -165,7 +165,8 @@ class ArgoWorkflows(object):
         self.notify_on_success = notify_on_success
         self.notify_slack_webhook_url = notify_slack_webhook_url
         self.notify_pager_duty_integration_key = notify_pager_duty_integration_key
-
+        self.enable_heartbeat_daemon = enable_heartbeat_daemon
+        self.enable_error_msg_capture = enable_error_msg_capture
         self.parameters = self._process_parameters()
         self.triggers, self.trigger_options = self._process_triggers()
         self._schedule, self._timezone = self._get_schedule()
@@ -785,6 +786,12 @@ class ArgoWorkflows(object):
             )
             # Set the entrypoint to flow name
             .entrypoint(self.flow.name)
+            # OnExit hooks
+            .onExit(
+                "capture-error-hook-fn-preflight"
+                if self.enable_error_msg_capture
+                else None
+            )
             # Set exit hook handlers if notifications are enabled
             .hooks(
                 {
@@ -854,6 +861,8 @@ class ArgoWorkflows(object):
                 .templates(self._container_templates())
                 # Exit hook template(s)
                 .templates(self._exit_hook_templates())
+                # Sidecar templates (Daemon Containers)
+                .templates(self._daemon_templates())
             )
         )
@@ -1060,7 +1069,7 @@ class ArgoWorkflows(object):
                     "%s-foreach-%s"
                     % (
                         node.name,
-                        "parallel" if node.parallel_foreach else node.foreach_param
+                        "parallel" if node.parallel_foreach else node.foreach_param,
                     # Since foreach's are derived based on `self.next(self.a, foreach="<varname>")`
                     # vs @parallel foreach are done based on `self.next(self.a, num_parallel="<some-number>")`,
                     # we need to ensure that `foreach_template_name` suffix is appropriately set based on the kind
@@ -1266,7 +1275,13 @@ class ArgoWorkflows(object):
                     "Argo Workflows." % (node.type, node.name)
                 )

-        templates, _ = _visit(node=self.graph["start"])
+        # Generate daemon tasks
+        daemon_tasks = [
+            DAGTask("%s-task" % daemon_template.name).template(daemon_template.name)
+            for daemon_template in self._daemon_templates()
+        ]
+
+        templates, _ = _visit(node=self.graph["start"], dag_tasks=daemon_tasks)
         return templates

     # Visit every node and yield ContainerTemplates.
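For orientation, each seeded daemon task above serializes to a very small object; a sketch of what one of them looks like, assuming the heartbeat daemon template defined later in this diff is the only daemon template:

# Sketch only: DAGTask is the builder already used elsewhere in this module.
seed = DAGTask("heartbeat-daemon-task").template("heartbeat-daemon")
# seed serializes to roughly {"name": "heartbeat-daemon-task", "template": "heartbeat-daemon"},
# and the per-step DAG tasks produced by _visit() are added on top of these seeds.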
@@ -1351,7 +1366,7 @@ class ArgoWorkflows(object):
             task_str = "-".join(
                 [
                     "$TASK_ID_PREFIX",
-                    "{{inputs.parameters.task-id-entropy}}",
+                    "{{inputs.parameters.task-id-entropy}}",
                     "$TASK_ID_SUFFIX",
                 ]
             )
@@ -1382,8 +1397,6 @@ class ArgoWorkflows(object):
         user_code_retries = max_user_code_retries
         total_retries = max_user_code_retries + max_error_retries
         # {{retries}} is only available if retryStrategy is specified
-        # and they are only available in the container templates NOT for custom
-        # Kubernetes manifests like Jobsets.
         # For custom kubernetes manifests, we will pass the retryCount as a parameter
         # and use that in the manifest.
         retry_count = (
@@ -1510,8 +1523,7 @@ class ArgoWorkflows(object):
                 )
             )
         else:
-            #
-            # because our current strategy of using volume mounts for outputs won't work with Jobsets
+            # Handle @parallel where output from volume mount isn't accessible
             input_paths = (
                 "$(python -m metaflow.plugins.argo.jobset_input_paths %s %s {{inputs.parameters.task-id-entropy}} {{inputs.parameters.num-parallel}})"
                 % (
@@ -1650,16 +1662,16 @@ class ArgoWorkflows(object):

         # support for @secret
         env["METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE"] = DEFAULT_SECRETS_BACKEND_TYPE
-        env[
-            "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION"
-        ] = AWS_SECRETS_MANAGER_DEFAULT_REGION
+        env["METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION"] = (
+            AWS_SECRETS_MANAGER_DEFAULT_REGION
+        )
         env["METAFLOW_GCP_SECRET_MANAGER_PREFIX"] = GCP_SECRET_MANAGER_PREFIX
         env["METAFLOW_AZURE_KEY_VAULT_PREFIX"] = AZURE_KEY_VAULT_PREFIX

         # support for Azure
-        env[
-            "METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT"
-        ] = AZURE_STORAGE_BLOB_SERVICE_ENDPOINT
+        env["METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT"] = (
+            AZURE_STORAGE_BLOB_SERVICE_ENDPOINT
+        )
         env["METAFLOW_DATASTORE_SYSROOT_AZURE"] = DATASTORE_SYSROOT_AZURE
         env["METAFLOW_CARD_AZUREROOT"] = CARD_AZUREROOT

@@ -1724,9 +1736,7 @@ class ArgoWorkflows(object):
             else:
                 # append this only for joins of foreaches, not static splits
                 inputs.append(Parameter("split-cardinality"))
-            #
-            # a foreach join node, hence we can safely assume that if that condition fails then
-            # we can check if the node is a @parallel node.
+        # check if the node is a @parallel node.
         elif node.parallel_step:
             inputs.extend(
                 [
@@ -1781,7 +1791,7 @@ class ArgoWorkflows(object):
                     ),
                 ]
             )
-        # Outputs should be defined over here
+        # Outputs should be defined over here and not in the _dag_template for @parallel.

         # It makes no sense to set env vars to None (shows up as "None" string)
         # Also we skip some env vars (e.g. in case we want to pull them from KUBERNETES_SECRETS)
@@ -1808,20 +1818,20 @@ class ArgoWorkflows(object):

         if tmpfs_enabled and tmpfs_tempdir:
             env["METAFLOW_TEMPDIR"] = tmpfs_path
+
         # Create a ContainerTemplate for this node. Ideally, we would have
         # liked to inline this ContainerTemplate and avoid scanning the workflow
         # twice, but due to issues with variable substitution, we will have to
         # live with this routine.
         if node.parallel_step:
-
             # Explicitly add the task-id-hint label. This is important because this label
-            # is returned as an Output parameter of this step and is used subsequently an
-            # an input in the join step.
+            # is returned as an Output parameter of this step and is used subsequently as an
+            # an input in the join step.
             kubernetes_labels = self.kubernetes_labels.copy()
             jobset_name = "{{inputs.parameters.jobset-name}}"
-            kubernetes_labels[
-                "task_id_entropy"
-            ] = "{{inputs.parameters.task-id-entropy}}"
+            kubernetes_labels["task_id_entropy"] = (
+                "{{inputs.parameters.task-id-entropy}}"
+            )
             kubernetes_labels["num_parallel"] = "{{inputs.parameters.num-parallel}}"
             jobset = KubernetesArgoJobSet(
                 kubernetes_sdk=kubernetes_sdk,
@@ -1845,9 +1855,11 @@ class ArgoWorkflows(object):
                         list(
                             []
                             if not resources.get("secrets")
-                            else [resources.get("secrets")]
-                            if isinstance(resources.get("secrets"), str)
-                            else resources.get("secrets")
+                            else (
+                                [resources.get("secrets")]
+                                if isinstance(resources.get("secrets"), str)
+                                else resources.get("secrets")
+                            )
                         )
                         + KUBERNETES_SECRETS.split(",")
                         + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
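Note: the secrets handling above (repeated in the container, error-hook, and daemon templates below) only normalizes a string-or-list attribute into a list before appending the globally configured secrets. A standalone sketch of the same logic, with hypothetical stand-ins for the config values:

def normalize_secrets(step_secrets, kubernetes_secrets="", argo_workflows_secrets=""):
    # step_secrets mirrors resources.get("secrets"): None, a single name, or a list.
    if not step_secrets:
        base = []
    elif isinstance(step_secrets, str):
        base = [step_secrets]
    else:
        base = list(step_secrets)
    # The config values are comma-separated strings; empty entries are dropped,
    # matching the trailing `if k` filter in the diff above.
    return [
        k
        for k in base + kubernetes_secrets.split(",") + argo_workflows_secrets.split(",")
        if k
    ]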
@@ -1878,7 +1890,6 @@ class ArgoWorkflows(object):
             for k, v in kubernetes_labels.items():
                 jobset.label(k, v)

-            ## -----Jobset specific env vars START here-----
             jobset.environment_variable(
                 "MF_MASTER_ADDR", jobset.jobset_control_addr
             )
@@ -1897,7 +1908,6 @@ class ArgoWorkflows(object):
                     "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
                     "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
                     "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-                    # `TASK_ID_SUFFIX` is needed for the construction of the task-ids
                     "TASK_ID_SUFFIX": "metadata.annotations['jobset.sigs.k8s.io/job-index']",
                 }
             )
@@ -1922,8 +1932,7 @@ class ArgoWorkflows(object):
             )
             for k, v in annotations.items():
                 jobset.annotation(k, v)
-
-            ## ---- Jobset control/workers specific vars START here ----
+
             jobset.control.replicas(1)
             jobset.worker.replicas("{{=asInt(inputs.parameters.workerCount)}}")
             jobset.control.environment_variable("UBF_CONTEXT", UBF_CONTROL)
@@ -1934,7 +1943,6 @@ class ArgoWorkflows(object):
             jobset.control.environment_variable("TASK_ID_PREFIX", "control")
             jobset.worker.environment_variable("TASK_ID_PREFIX", "worker")

-            ## ---- Jobset control/workers specific vars END here ----
             yield (
                 Template(ArgoWorkflows._sanitize(node.name))
                 .resource(
@@ -1961,167 +1969,185 @@ class ArgoWorkflows(object):
                     minutes_between_retries=minutes_between_retries,
                 )
             )
[removed lines 1964-2023 of the old file: content not captured in this rendering]
-                            kubernetes_sdk.V1EnvVar(name=k, value=str(v))
-                            for k, v in env.items()
-                        ]
-                        # Add environment variables for book-keeping.
-                        # https://argoproj.github.io/argo-workflows/fields/#fields_155
-                        + [
-                            kubernetes_sdk.V1EnvVar(
-                                name=k,
-                                value_from=kubernetes_sdk.V1EnvVarSource(
-                                    field_ref=kubernetes_sdk.V1ObjectFieldSelector(
-                                        field_path=str(v)
+        else:
+            yield (
+                Template(self._sanitize(node.name))
+                # Set @timeout values
+                .active_deadline_seconds(run_time_limit)
+                # Set service account
+                .service_account_name(resources["service_account"])
+                # Configure template input
+                .inputs(Inputs().parameters(inputs))
+                # Configure template output
+                .outputs(Outputs().parameters(outputs))
+                # Fail fast!
+                .fail_fast()
+                # Set @retry/@catch values
+                .retry_strategy(
+                    times=total_retries,
+                    minutes_between_retries=minutes_between_retries,
+                )
+                .metadata(
+                    ObjectMeta().annotation("metaflow/step_name", node.name)
+                    # Unfortunately, we can't set the task_id since it is generated
+                    # inside the pod. However, it can be inferred from the annotation
+                    # set by argo-workflows - `workflows.argoproj.io/outputs` - refer
+                    # the field 'task-id' in 'parameters'
+                    # .annotation("metaflow/task_id", ...)
+                    .annotation("metaflow/attempt", retry_count)
+                )
+                # Set emptyDir volume for state management
+                .empty_dir_volume("out")
+                # Set tmpfs emptyDir volume if enabled
+                .empty_dir_volume(
+                    "tmpfs-ephemeral-volume",
+                    medium="Memory",
+                    size_limit=tmpfs_size if tmpfs_enabled else 0,
+                )
+                .empty_dir_volume("dhsm", medium="Memory", size_limit=shared_memory)
+                .pvc_volumes(resources.get("persistent_volume_claims"))
+                # Set node selectors
+                .node_selectors(resources.get("node_selector"))
+                # Set tolerations
+                .tolerations(resources.get("tolerations"))
+                # Set container
+                .container(
+                    # TODO: Unify the logic with kubernetes.py
+                    # Important note - Unfortunately, V1Container uses snakecase while
+                    # Argo Workflows uses camel. For most of the attributes, both cases
+                    # are indistinguishable, but unfortunately, not for all - (
+                    # env_from, value_from, etc.) - so we need to handle the conversion
+                    # ourselves using to_camelcase. We need to be vigilant about
+                    # resources attributes in particular where the keys maybe user
+                    # defined.
+                    to_camelcase(
+                        kubernetes_sdk.V1Container(
+                            name=self._sanitize(node.name),
+                            command=cmds,
+                            termination_message_policy="FallbackToLogsOnError",
+                            ports=(
+                                [
+                                    kubernetes_sdk.V1ContainerPort(
+                                        container_port=port
                                     )
[removed lines 2036-2053 of the old file: content not captured in this rendering]
-                                limits={
-                                    "%s.com/gpu".lower()
-                                    % resources["gpu_vendor"]: str(resources["gpu"])
-                                    for k in [0]
-                                    if resources["gpu"] is not None
-                                },
-                            ),
-                            # Configure secrets
-                            env_from=[
-                                kubernetes_sdk.V1EnvFromSource(
-                                    secret_ref=kubernetes_sdk.V1SecretEnvSource(
-                                        name=str(k),
-                                        # optional=True
+                                ]
+                                if port
+                                else None
+                            ),
+                            env=[
+                                kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+                                for k, v in env.items()
+                            ]
+                            # Add environment variables for book-keeping.
+                            # https://argoproj.github.io/argo-workflows/fields/#fields_155
+                            + [
+                                kubernetes_sdk.V1EnvVar(
+                                    name=k,
+                                    value_from=kubernetes_sdk.V1EnvVarSource(
+                                        field_ref=kubernetes_sdk.V1ObjectFieldSelector(
+                                            field_path=str(v)
+                                        )
+                                    ),
                                 )
[removed lines 2068-2091 of the old file: content not captured in this rendering]
+                                for k, v in {
+                                    "METAFLOW_KUBERNETES_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                    "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                    "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                    "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                }.items()
+                            ],
+                            image=resources["image"],
+                            image_pull_policy=resources["image_pull_policy"],
+                            resources=kubernetes_sdk.V1ResourceRequirements(
+                                requests={
+                                    "cpu": str(resources["cpu"]),
+                                    "memory": "%sM" % str(resources["memory"]),
+                                    "ephemeral-storage": "%sM"
+                                    % str(resources["disk"]),
+                                },
+                                limits={
+                                    "%s.com/gpu".lower()
+                                    % resources["gpu_vendor"]: str(resources["gpu"])
+                                    for k in [0]
+                                    if resources["gpu"] is not None
+                                },
+                            ),
+                            # Configure secrets
+                            env_from=[
+                                kubernetes_sdk.V1EnvFromSource(
+                                    secret_ref=kubernetes_sdk.V1SecretEnvSource(
+                                        name=str(k),
+                                        # optional=True
+                                    )
                                 )
[removed lines 2093-2100 of the old file: content not captured in this rendering]
-                                    name="dhsm",
-                                    mount_path="/dev/shm",
+                                for k in list(
+                                    []
+                                    if not resources.get("secrets")
+                                    else (
+                                        [resources.get("secrets")]
+                                        if isinstance(resources.get("secrets"), str)
+                                        else resources.get("secrets")
+                                    )
                                 )
[removed lines 2104-2109 of the old file: content not captured in this rendering]
-                            [
+                                + KUBERNETES_SECRETS.split(",")
+                                + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+                                if k
+                            ],
+                            volume_mounts=[
+                                # Assign a volume mount to pass state to the next task.
                                 kubernetes_sdk.V1VolumeMount(
-                                    name=
+                                    name="out", mount_path="/mnt/out"
                                 )
-                                for claim, path in resources.get(
-                                    "persistent_volume_claims"
-                                ).items()
                             ]
[removed lines 2118-2121 of the old file: content not captured in this rendering]
+                            # Support tmpfs.
+                            + (
+                                [
+                                    kubernetes_sdk.V1VolumeMount(
+                                        name="tmpfs-ephemeral-volume",
+                                        mount_path=tmpfs_path,
+                                    )
+                                ]
+                                if tmpfs_enabled
+                                else []
+                            )
+                            # Support shared_memory
+                            + (
+                                [
+                                    kubernetes_sdk.V1VolumeMount(
+                                        name="dhsm",
+                                        mount_path="/dev/shm",
+                                    )
+                                ]
+                                if shared_memory
+                                else []
+                            )
+                            # Support persistent volume claims.
+                            + (
+                                [
+                                    kubernetes_sdk.V1VolumeMount(
+                                        name=claim, mount_path=path
+                                    )
+                                    for claim, path in resources.get(
+                                        "persistent_volume_claims"
+                                    ).items()
+                                ]
+                                if resources.get("persistent_volume_claims")
+                                is not None
+                                else []
+                            ),
+                        ).to_dict()
+                    )
                 )
             )
-
+
+    # Return daemon container templates for workflow execution notifications.
+    def _daemon_templates(self):
+        templates = []
+        if self.enable_heartbeat_daemon:
+            templates.append(self._heartbeat_daemon_template())
+        return templates

     # Return exit hook templates for workflow execution notifications.
     def _exit_hook_templates(self):
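For orientation: the new _daemon_templates helper leans on Argo Workflows' daemon-container semantics, where a template marked daemon: true keeps running in the background and is torn down automatically when the workflow finishes, so the heartbeat sidecar needs no exit handling of its own. A rough sketch of the fragment it contributes to the workflow spec (shape only; the real container body is built in _heartbeat_daemon_template further down in this diff):

# Rough shape only - values abbreviated, not the exact serialized output.
heartbeat_template = {
    "name": "heartbeat-daemon",
    "daemon": True,
    "container": {
        "name": "main",
        "image": "<image of the start step's @kubernetes decorator>",
        "command": ["bash", "-c", "<init commands && heartbeat command>"],
    },
}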
@@ -2149,8 +2175,150 @@ class ArgoWorkflows(object):
                 .success_condition("true == true")
             )
         )
+        if self.enable_error_msg_capture:
+            templates.extend(self._error_msg_capture_hook_templates())
         return templates

+    def _error_msg_capture_hook_templates(self):
+        from kubernetes import client as kubernetes_sdk
+
+        start_step = [step for step in self.graph if step.name == "start"][0]
+        # We want to grab the base image used by the start step, as this is known to be pullable from within the cluster,
+        # and it might contain the required libraries, allowing us to start up faster.
+        resources = dict(
+            [deco for deco in start_step.decorators if deco.name == "kubernetes"][
+                0
+            ].attributes
+        )
+
+        run_id_template = "argo-{{workflow.name}}"
+        metaflow_version = self.environment.get_environment_info()
+        metaflow_version["flow_name"] = self.graph.name
+        metaflow_version["production_token"] = self.production_token
+
+        mflog_expr = export_mflog_env_vars(
+            datastore_type=self.flow_datastore.TYPE,
+            stdout_path="$PWD/.logs/mflog_stdout",
+            stderr_path="$PWD/.logs/mflog_stderr",
+            flow_name=self.flow.name,
+            run_id=run_id_template,
+            step_name="_run_capture_error",
+            task_id="1",
+            retry_count="0",
+        )
+
+        cmds = " && ".join(
+            [
+                # For supporting sandboxes, ensure that a custom script is executed
+                # before anything else is executed. The script is passed in as an
+                # env var.
+                '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"}',
+                "mkdir -p $PWD/.logs",
+                mflog_expr,
+            ]
+            + self.environment.get_package_commands(
+                self.code_package_url, self.flow_datastore.TYPE
+            )[:-1]
+            # Replace the line 'Task in starting'
+            # FIXME: this can be brittle.
+            + ["mflog 'Error capture hook is starting.'"]
+            + ["argo_error=$(python -m 'metaflow.plugins.argo.capture_error')"]
+            + ["export METAFLOW_ARGO_ERROR=$argo_error"]
+            + [
+                """python -c 'import json, os; error_obj=os.getenv(\\"METAFLOW_ARGO_ERROR\\");data=json.loads(error_obj); print(data[\\"message\\"])'"""
+            ]
+            + [
+                'if [ -n \\"${ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT}\\" ]; then eval \\"${ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT}\\"; fi'
+            ]
+        )
+
+        # TODO: Also capture the first failed task id
+        cmds = shlex.split('bash -c "%s"' % cmds)
+        env = {
+            # These values are needed by Metaflow to set it's internal
+            # state appropriately.
+            "METAFLOW_CODE_URL": self.code_package_url,
+            "METAFLOW_CODE_SHA": self.code_package_sha,
+            "METAFLOW_CODE_DS": self.flow_datastore.TYPE,
+            "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
+            "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
+            "METAFLOW_USER": "argo-workflows",
+            "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
+            "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+            "METAFLOW_OWNER": self.username,
+        }
+        # support Metaflow sandboxes
+        env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
+        env["METAFLOW_ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT"] = (
+            ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT
+        )
+
+        env["METAFLOW_WORKFLOW_NAME"] = "{{workflow.name}}"
+        env["METAFLOW_WORKFLOW_NAMESPACE"] = "{{workflow.namespace}}"
+        env["METAFLOW_ARGO_WORKFLOW_FAILURES"] = "{{workflow.failures}}"
+        env = {
+            k: v
+            for k, v in env.items()
+            if v is not None
+            and k not in set(ARGO_WORKFLOWS_ENV_VARS_TO_SKIP.split(","))
+        }
+        return [
+            Template("error-msg-capture-hook").container(
+                to_camelcase(
+                    kubernetes_sdk.V1Container(
+                        name="main",
+                        command=cmds,
+                        image=resources["image"],
+                        env=[
+                            kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+                            for k, v in env.items()
+                        ],
+                        env_from=[
+                            kubernetes_sdk.V1EnvFromSource(
+                                secret_ref=kubernetes_sdk.V1SecretEnvSource(
+                                    name=str(k),
+                                    # optional=True
+                                )
+                            )
+                            for k in list(
+                                []
+                                if not resources.get("secrets")
+                                else (
+                                    [resources.get("secrets")]
+                                    if isinstance(resources.get("secrets"), str)
+                                    else resources.get("secrets")
+                                )
+                            )
+                            + KUBERNETES_SECRETS.split(",")
+                            + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+                            if k
+                        ],
+                        resources=kubernetes_sdk.V1ResourceRequirements(
+                            # NOTE: base resources for this are kept to a minimum to save on running costs.
+                            # This has an adverse effect on startup time for the daemon, which can be completely
+                            # alleviated by using a base image that has the required dependencies pre-installed
+                            requests={
+                                "cpu": "200m",
+                                "memory": "100Mi",
+                            },
+                            limits={
+                                "cpu": "200m",
+                                "memory": "500Mi",
+                            },
+                        ),
+                    )
+                )
+            ),
+            Template("capture-error-hook-fn-preflight").steps(
+                [
+                    WorkflowStep()
+                    .name("capture-error-hook-fn-preflight")
+                    .template("error-msg-capture-hook")
+                    .when("{{workflow.status}} != Succeeded")
+                ]
+            ),
+        ]
+
     def _pager_duty_alert_template(self):
         # https://developer.pagerduty.com/docs/ZG9jOjExMDI5NTgx-send-an-alert-event
         if self.notify_pager_duty_integration_key is None:
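The shell pipeline above exports the output of metaflow.plugins.argo.capture_error as METAFLOW_ARGO_ERROR and then prints its message field via an inline python -c. The same parsing, spelled out (assuming, as the one-liner does, that the captured value is a JSON object with a "message" key):

import json
import os

# Equivalent of the inline `python -c` in the hook command above.
error_obj = os.getenv("METAFLOW_ARGO_ERROR", "{}")
data = json.loads(error_obj)
print(data.get("message"))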
@@ -2337,6 +2505,137 @@ class ArgoWorkflows(object):
             Http("POST").url(self.notify_slack_webhook_url).body(json.dumps(payload))
         )

+    def _heartbeat_daemon_template(self):
+        # Use all the affordances available to _parameters task
+        executable = self.environment.executable("_parameters")
+        run_id = "argo-{{workflow.name}}"
+        entrypoint = [executable, "-m metaflow.plugins.argo.daemon"]
+        heartbeat_cmds = "{entrypoint} --flow_name {flow_name} --run_id {run_id} {tags} heartbeat".format(
+            entrypoint=" ".join(entrypoint),
+            flow_name=self.flow.name,
+            run_id=run_id,
+            tags=" ".join(["--tag %s" % t for t in self.tags]) if self.tags else "",
+        )
+
+        # TODO: we do not really need MFLOG logging for the daemon at the moment, but might be good for the future.
+        # Consider if we can do without this setup.
+        # Configure log capture.
+        mflog_expr = export_mflog_env_vars(
+            datastore_type=self.flow_datastore.TYPE,
+            stdout_path="$PWD/.logs/mflog_stdout",
+            stderr_path="$PWD/.logs/mflog_stderr",
+            flow_name=self.flow.name,
+            run_id=run_id,
+            step_name="_run_heartbeat_daemon",
+            task_id="1",
+            retry_count="0",
+        )
+        # TODO: Can the init be trimmed down?
+        # Can we do without get_package_commands fetching the whole code package?
+        init_cmds = " && ".join(
+            [
+                # For supporting sandboxes, ensure that a custom script is executed
+                # before anything else is executed. The script is passed in as an
+                # env var.
+                '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"}',
+                "mkdir -p $PWD/.logs",
+                mflog_expr,
+            ]
+            + self.environment.get_package_commands(
+                self.code_package_url, self.flow_datastore.TYPE
+            )[:-1]
+            # Replace the line 'Task in starting'
+            # FIXME: this can be brittle.
+            + ["mflog 'Heartbeat daemon is starting.'"]
+        )
+
+        cmd_str = " && ".join([init_cmds, heartbeat_cmds])
+        cmds = shlex.split('bash -c "%s"' % cmd_str)
+
+        # TODO: Check that this is the minimal env.
+        # Env required for sending heartbeats to the metadata service, nothing extra.
+        env = {
+            # These values are needed by Metaflow to set it's internal
+            # state appropriately.
+            "METAFLOW_CODE_URL": self.code_package_url,
+            "METAFLOW_CODE_SHA": self.code_package_sha,
+            "METAFLOW_CODE_DS": self.flow_datastore.TYPE,
+            "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
+            "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
+            "METAFLOW_USER": "argo-workflows",
+            "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
+            "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+            "METAFLOW_OWNER": self.username,
+        }
+        # support Metaflow sandboxes
+        env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
+
+        # cleanup env values
+        env = {
+            k: v
+            for k, v in env.items()
+            if v is not None
+            and k not in set(ARGO_WORKFLOWS_ENV_VARS_TO_SKIP.split(","))
+        }
+
+        # We want to grab the base image used by the start step, as this is known to be pullable from within the cluster,
+        # and it might contain the required libraries, allowing us to start up faster.
+        start_step = next(step for step in self.flow if step.name == "start")
+        resources = dict(
+            [deco for deco in start_step.decorators if deco.name == "kubernetes"][
+                0
+            ].attributes
+        )
+        from kubernetes import client as kubernetes_sdk
+
+        return DaemonTemplate("heartbeat-daemon").container(
+            to_camelcase(
+                kubernetes_sdk.V1Container(
+                    name="main",
+                    # TODO: Make the image configurable
+                    image=resources["image"],
+                    command=cmds,
+                    env=[
+                        kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+                        for k, v in env.items()
+                    ],
+                    env_from=[
+                        kubernetes_sdk.V1EnvFromSource(
+                            secret_ref=kubernetes_sdk.V1SecretEnvSource(
+                                name=str(k),
+                                # optional=True
+                            )
+                        )
+                        for k in list(
+                            []
+                            if not resources.get("secrets")
+                            else (
+                                [resources.get("secrets")]
+                                if isinstance(resources.get("secrets"), str)
+                                else resources.get("secrets")
+                            )
+                        )
+                        + KUBERNETES_SECRETS.split(",")
+                        + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+                        if k
+                    ],
+                    resources=kubernetes_sdk.V1ResourceRequirements(
+                        # NOTE: base resources for this are kept to a minimum to save on running costs.
+                        # This has an adverse effect on startup time for the daemon, which can be completely
+                        # alleviated by using a base image that has the required dependencies pre-installed
+                        requests={
+                            "cpu": "200m",
+                            "memory": "100Mi",
+                        },
+                        limits={
+                            "cpu": "200m",
+                            "memory": "100Mi",
+                        },
+                    ),
+                )
+            )
+        )
+
     def _compile_sensor(self):
         # This method compiles a Metaflow @trigger decorator into Argo Events Sensor.
         #
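Given the format string above, the daemon container ends up running a single shell command that boots the code package and then starts the heartbeat loop; with illustrative values it would look roughly like:

# Illustrative values only - the real run id comes from "argo-{{workflow.name}}"
# and the executable from self.environment.executable("_parameters").
heartbeat_cmds = (
    "python -m metaflow.plugins.argo.daemon "
    "--flow_name HelloFlow --run_id argo-helloflow-abc12 --tag demo heartbeat"
)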
@@ -2791,6 +3090,34 @@ class ObjectMeta(object):
         return json.dumps(self.to_json(), indent=4)


+class WorkflowStep(object):
+    def __init__(self):
+        tree = lambda: defaultdict(tree)
+        self.payload = tree()
+
+    def name(self, name):
+        self.payload["name"] = str(name)
+        return self
+
+    def template(self, template):
+        self.payload["template"] = str(template)
+        return self
+
+    def when(self, condition):
+        self.payload["when"] = str(condition)
+        return self
+
+    def step(self, expression):
+        self.payload["expression"] = str(expression)
+        return self
+
+    def to_json(self):
+        return self.payload
+
+    def __str__(self):
+        return json.dumps(self.to_json(), indent=4)
+
+
 class WorkflowSpec(object):
     # https://argoproj.github.io/argo-workflows/fields/#workflowspec
     # This object sets all Workflow level properties.
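WorkflowStep is a small builder over a nested defaultdict; used the way the error-capture hook above uses it, it reduces to a plain mapping:

step = (
    WorkflowStep()
    .name("capture-error-hook-fn-preflight")
    .template("error-msg-capture-hook")
    .when("{{workflow.status}} != Succeeded")
)
# step.to_json() then contains:
# {
#     "name": "capture-error-hook-fn-preflight",
#     "template": "error-msg-capture-hook",
#     "when": "{{workflow.status}} != Succeeded",
# }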
@@ -2821,6 +3148,11 @@ class WorkflowSpec(object):
         self.payload["entrypoint"] = entrypoint
         return self

+    def onExit(self, on_exit_template):
+        if on_exit_template:
+            self.payload["onExit"] = on_exit_template
+        return self
+
     def parallelism(self, parallelism):
         # Set parallelism at Workflow level
         self.payload["parallelism"] = int(parallelism)
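Because of the guard in onExit, passing None (the enable_error_msg_capture=False case earlier in this diff) leaves the spec untouched; otherwise the Argo spec.onExit field is set to the exit-handler template name. A minimal sketch, assuming the builder is constructed bare as elsewhere in this module:

spec = WorkflowSpec().entrypoint("helloflow")
spec.onExit("capture-error-hook-fn-preflight")
# payload now includes {"entrypoint": "helloflow", "onExit": "capture-error-hook-fn-preflight"}
spec.onExit(None)  # no-op: "onExit" is only written when a template name is given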
@@ -2909,6 +3241,25 @@ class Metadata(object):
         return json.dumps(self.to_json(), indent=4)


+class DaemonTemplate(object):
+    def __init__(self, name):
+        tree = lambda: defaultdict(tree)
+        self.name = name
+        self.payload = tree()
+        self.payload["daemon"] = True
+        self.payload["name"] = name
+
+    def container(self, container):
+        self.payload["container"] = container
+        return self
+
+    def to_json(self):
+        return self.payload
+
+    def __str__(self):
+        return json.dumps(self.payload, indent=4)
+
+
 class Template(object):
     # https://argoproj.github.io/argo-workflows/fields/#template

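DaemonTemplate mirrors the other builders in this module but pre-sets daemon: true. A minimal usage sketch (the container dict here is a placeholder; the real one is the camel-cased V1Container built in _heartbeat_daemon_template):

tpl = DaemonTemplate("heartbeat-daemon").container({"name": "main", "image": "python:3.11"})
# str(tpl) serializes the payload roughly as:
# {
#     "daemon": true,
#     "name": "heartbeat-daemon",
#     "container": {"name": "main", "image": "python:3.11"}
# }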
@@ -2927,6 +3278,18 @@ class Template(object):
         self.payload["dag"] = dag_template.to_json()
         return self

+    def steps(self, steps):
+        if "steps" not in self.payload:
+            self.payload["steps"] = []
+        # steps is a list of lists.
+        # hence we go over every item in the incoming list
+        # serialize it and then append the list to the payload
+        step_list = []
+        for step in steps:
+            step_list.append(step.to_json())
+        self.payload["steps"].append(step_list)
+        return self
+
     def container(self, container):
         # Luckily this can simply be V1Container and we are spared from writing more
         # boilerplate - https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Container.md.
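Argo's steps field is a list of parallel groups (a list of lists), which is why steps() appends one serialized group per call. Combined with WorkflowStep above, the exit-hook template from this diff serializes along these lines (names taken from the diff; output shape approximate):

tpl = Template("capture-error-hook-fn-preflight").steps(
    [
        WorkflowStep()
        .name("capture-error-hook-fn-preflight")
        .template("error-msg-capture-hook")
        .when("{{workflow.status}} != Succeeded")
    ]
)
# tpl.to_json()["steps"] is now a single group:
# [[{"name": "capture-error-hook-fn-preflight",
#    "template": "error-msg-capture-hook",
#    "when": "{{workflow.status}} != Succeeded"}]]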