metaflow 2.12.9__py2.py3-none-any.whl → 2.12.11__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. metaflow/client/core.py +6 -6
  2. metaflow/client/filecache.py +16 -3
  3. metaflow/cmd/develop/stub_generator.py +62 -47
  4. metaflow/datastore/content_addressed_store.py +1 -1
  5. metaflow/datastore/task_datastore.py +1 -1
  6. metaflow/decorators.py +2 -4
  7. metaflow/extension_support/__init__.py +3 -3
  8. metaflow/extension_support/plugins.py +3 -3
  9. metaflow/metaflow_config.py +35 -18
  10. metaflow/parameters.py +3 -3
  11. metaflow/plugins/airflow/airflow.py +6 -6
  12. metaflow/plugins/airflow/airflow_utils.py +5 -3
  13. metaflow/plugins/argo/argo_workflows.py +555 -191
  14. metaflow/plugins/argo/argo_workflows_cli.py +27 -4
  15. metaflow/plugins/argo/argo_workflows_decorator.py +6 -13
  16. metaflow/plugins/argo/capture_error.py +70 -0
  17. metaflow/plugins/argo/daemon.py +59 -0
  18. metaflow/plugins/aws/step_functions/step_functions.py +3 -3
  19. metaflow/plugins/cards/card_modules/basic.py +5 -3
  20. metaflow/plugins/cards/card_modules/convert_to_native_type.py +2 -2
  21. metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
  22. metaflow/plugins/cards/card_modules/test_cards.py +0 -2
  23. metaflow/plugins/datatools/s3/s3op.py +5 -3
  24. metaflow/plugins/kubernetes/kubernetes.py +1 -0
  25. metaflow/plugins/kubernetes/kubernetes_job.py +11 -8
  26. metaflow/plugins/kubernetes/kubernetes_jobsets.py +16 -14
  27. metaflow/plugins/logs_cli.py +1 -0
  28. metaflow/plugins/pypi/conda_environment.py +1 -3
  29. metaflow/plugins/pypi/pip.py +3 -3
  30. metaflow/plugins/tag_cli.py +3 -3
  31. metaflow/procpoll.py +1 -1
  32. metaflow/runtime.py +1 -0
  33. metaflow/util.py +6 -6
  34. metaflow/version.py +1 -1
  35. {metaflow-2.12.9.dist-info → metaflow-2.12.11.dist-info}/METADATA +2 -2
  36. {metaflow-2.12.9.dist-info → metaflow-2.12.11.dist-info}/RECORD +40 -38
  37. {metaflow-2.12.9.dist-info → metaflow-2.12.11.dist-info}/WHEEL +1 -1
  38. {metaflow-2.12.9.dist-info → metaflow-2.12.11.dist-info}/LICENSE +0 -0
  39. {metaflow-2.12.9.dist-info → metaflow-2.12.11.dist-info}/entry_points.txt +0 -0
  40. {metaflow-2.12.9.dist-info → metaflow-2.12.11.dist-info}/top_level.txt +0 -0
@@ -4,15 +4,15 @@ import os
  import re
  import shlex
  import sys
- from typing import Tuple, List
  from collections import defaultdict
  from hashlib import sha1
  from math import inf
+ from typing import List, Tuple

  from metaflow import JSONType, current
- from metaflow.graph import DAGNode
  from metaflow.decorators import flow_decorators
  from metaflow.exception import MetaflowException
+ from metaflow.graph import DAGNode, FlowGraph
  from metaflow.includefile import FilePathClass
  from metaflow.metaflow_config import (
  ARGO_EVENTS_EVENT,
@@ -21,10 +21,12 @@ from metaflow.metaflow_config import (
  ARGO_EVENTS_INTERNAL_WEBHOOK_URL,
  ARGO_EVENTS_SERVICE_ACCOUNT,
  ARGO_EVENTS_WEBHOOK_AUTH,
+ ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT,
  ARGO_WORKFLOWS_ENV_VARS_TO_SKIP,
  ARGO_WORKFLOWS_KUBERNETES_SECRETS,
  ARGO_WORKFLOWS_UI_URL,
  AWS_SECRETS_MANAGER_DEFAULT_REGION,
+ AZURE_KEY_VAULT_PREFIX,
  AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
  CARD_AZUREROOT,
  CARD_GSROOT,
@@ -36,7 +38,6 @@ from metaflow.metaflow_config import (
  DEFAULT_METADATA,
  DEFAULT_SECRETS_BACKEND_TYPE,
  GCP_SECRET_MANAGER_PREFIX,
- AZURE_KEY_VAULT_PREFIX,
  KUBERNETES_FETCH_EC2_METADATA,
  KUBERNETES_LABELS,
  KUBERNETES_NAMESPACE,
@@ -49,7 +50,6 @@ from metaflow.metaflow_config import (
  SERVICE_INTERNAL_URL,
  UI_URL,
  )
- from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
  from metaflow.metaflow_config_funcs import config_values
  from metaflow.mflog import BASH_SAVE_LOGS, bash_capture_logs, export_mflog_env_vars
  from metaflow.parameters import deploy_time_eval
@@ -57,7 +57,8 @@ from metaflow.plugins.kubernetes.kubernetes import (
  parse_kube_keyvalue_list,
  validate_kube_labels,
  )
- from metaflow.graph import FlowGraph
+ from metaflow.plugins.kubernetes.kubernetes_jobsets import KubernetesArgoJobSet
+ from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
  from metaflow.util import (
  compress_list,
  dict_to_cli_options,
@@ -65,9 +66,6 @@ from metaflow.util import (
  to_camelcase,
  to_unicode,
  )
- from metaflow.plugins.kubernetes.kubernetes_jobsets import (
- KubernetesArgoJobSet,
- )

  from .argo_client import ArgoClient

@@ -117,6 +115,8 @@ class ArgoWorkflows(object):
  notify_on_success=False,
  notify_slack_webhook_url=None,
  notify_pager_duty_integration_key=None,
+ enable_heartbeat_daemon=True,
+ enable_error_msg_capture=False,
  ):
  # Some high-level notes -
  #
@@ -164,7 +164,8 @@ class ArgoWorkflows(object):
  self.notify_on_success = notify_on_success
  self.notify_slack_webhook_url = notify_slack_webhook_url
  self.notify_pager_duty_integration_key = notify_pager_duty_integration_key
-
+ self.enable_heartbeat_daemon = enable_heartbeat_daemon
+ self.enable_error_msg_capture = enable_error_msg_capture
  self.parameters = self._process_parameters()
  self.triggers, self.trigger_options = self._process_triggers()
  self._schedule, self._timezone = self._get_schedule()
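
The two constructor arguments added above are the switches for this release's new Argo features: enable_heartbeat_daemon (on by default) runs a daemon sidecar that keeps emitting run-level heartbeats, and enable_error_msg_capture (off by default) wires up an exit hook that records the failure message of an unsuccessful run. A minimal sketch of how to confirm the new signature in an installed copy, assuming metaflow==2.12.11 is importable in the current environment:

    # Sketch: confirm the new constructor switches and their defaults.
    # Assumes metaflow==2.12.11 is importable; purely illustrative.
    import inspect

    from metaflow.plugins.argo.argo_workflows import ArgoWorkflows

    params = inspect.signature(ArgoWorkflows.__init__).parameters
    print(params["enable_heartbeat_daemon"].default)   # True
    print(params["enable_error_msg_capture"].default)  # False
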
@@ -784,6 +785,12 @@ class ArgoWorkflows(object):
  )
  # Set the entrypoint to flow name
  .entrypoint(self.flow.name)
+ # OnExit hooks
+ .onExit(
+ "capture-error-hook-fn-preflight"
+ if self.enable_error_msg_capture
+ else None
+ )
  # Set exit hook handlers if notifications are enabled
  .hooks(
  {
@@ -853,6 +860,8 @@ class ArgoWorkflows(object):
  .templates(self._container_templates())
  # Exit hook template(s)
  .templates(self._exit_hook_templates())
+ # Sidecar templates (Daemon Containers)
+ .templates(self._daemon_templates())
  )
  )

@@ -1059,7 +1068,7 @@ class ArgoWorkflows(object):
  "%s-foreach-%s"
  % (
  node.name,
- "parallel" if node.parallel_foreach else node.foreach_param
+ "parallel" if node.parallel_foreach else node.foreach_param,
  # Since foreach's are derived based on `self.next(self.a, foreach="<varname>")`
  # vs @parallel foreach are done based on `self.next(self.a, num_parallel="<some-number>")`,
  # we need to ensure that `foreach_template_name` suffix is appropriately set based on the kind
@@ -1265,7 +1274,13 @@ class ArgoWorkflows(object):
  "Argo Workflows." % (node.type, node.name)
  )

- templates, _ = _visit(node=self.graph["start"])
+ # Generate daemon tasks
+ daemon_tasks = [
+ DAGTask("%s-task" % daemon_template.name).template(daemon_template.name)
+ for daemon_template in self._daemon_templates()
+ ]
+
+ templates, _ = _visit(node=self.graph["start"], dag_tasks=daemon_tasks)
  return templates

  # Visit every node and yield ContainerTemplates.
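
As the hunk above shows, every daemon template is also registered in the workflow DAG as a task of its own, so Argo schedules it next to the tasks generated from the flow's steps. A minimal sketch of that builder call, assuming metaflow==2.12.11 is importable and that DAGTask exposes the same to_json() helper as the other builders in this module:

    # Sketch: a daemon template named "heartbeat-daemon" becomes a DAG task
    # that simply points back at that template.
    from metaflow.plugins.argo.argo_workflows import DAGTask

    daemon_task = DAGTask("heartbeat-daemon-task").template("heartbeat-daemon")
    # A plain name/template mapping that Argo schedules like any other task.
    print(daemon_task.to_json())
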
@@ -1350,7 +1365,7 @@ class ArgoWorkflows(object):
  task_str = "-".join(
  [
  "$TASK_ID_PREFIX",
- "{{inputs.parameters.task-id-entropy}}", # id_base is addition entropy to based on node-name of the workflow
+ "{{inputs.parameters.task-id-entropy}}",
  "$TASK_ID_SUFFIX",
  ]
  )
@@ -1381,8 +1396,6 @@ class ArgoWorkflows(object):
  user_code_retries = max_user_code_retries
  total_retries = max_user_code_retries + max_error_retries
  # {{retries}} is only available if retryStrategy is specified
- # and they are only available in the container templates NOT for custom
- # Kubernetes manifests like Jobsets.
  # For custom kubernetes manifests, we will pass the retryCount as a parameter
  # and use that in the manifest.
  retry_count = (
@@ -1509,8 +1522,7 @@ class ArgoWorkflows(object):
  )
  )
  else:
- # When we run Jobsets with Argo Workflows we need to ensure that `input_paths` are generated using the a formulaic approach
- # because our current strategy of using volume mounts for outputs won't work with Jobsets
+ # Handle @parallel where output from volume mount isn't accessible
  input_paths = (
  "$(python -m metaflow.plugins.argo.jobset_input_paths %s %s {{inputs.parameters.task-id-entropy}} {{inputs.parameters.num-parallel}})"
  % (
@@ -1649,16 +1661,16 @@ class ArgoWorkflows(object):

  # support for @secret
  env["METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE"] = DEFAULT_SECRETS_BACKEND_TYPE
- env[
- "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION"
- ] = AWS_SECRETS_MANAGER_DEFAULT_REGION
+ env["METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION"] = (
+ AWS_SECRETS_MANAGER_DEFAULT_REGION
+ )
  env["METAFLOW_GCP_SECRET_MANAGER_PREFIX"] = GCP_SECRET_MANAGER_PREFIX
  env["METAFLOW_AZURE_KEY_VAULT_PREFIX"] = AZURE_KEY_VAULT_PREFIX

  # support for Azure
- env[
- "METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT"
- ] = AZURE_STORAGE_BLOB_SERVICE_ENDPOINT
+ env["METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT"] = (
+ AZURE_STORAGE_BLOB_SERVICE_ENDPOINT
+ )
  env["METAFLOW_DATASTORE_SYSROOT_AZURE"] = DATASTORE_SYSROOT_AZURE
  env["METAFLOW_CARD_AZUREROOT"] = CARD_AZUREROOT

@@ -1723,9 +1735,7 @@ class ArgoWorkflows(object):
  else:
  # append this only for joins of foreaches, not static splits
  inputs.append(Parameter("split-cardinality"))
- # We can use an `elif` condition because the first `if` condition validates if its
- # a foreach join node, hence we can safely assume that if that condition fails then
- # we can check if the node is a @parallel node.
+ # check if the node is a @parallel node.
  elif node.parallel_step:
  inputs.extend(
  [
@@ -1780,7 +1790,7 @@ class ArgoWorkflows(object):
  ),
  ]
  )
- # Outputs should be defined over here, Not in the _dag_template for the `num_parallel` stuff.
+ # Outputs should be defined over here and not in the _dag_template for @parallel.

  # It makes no sense to set env vars to None (shows up as "None" string)
  # Also we skip some env vars (e.g. in case we want to pull them from KUBERNETES_SECRETS)
@@ -1807,20 +1817,20 @@ class ArgoWorkflows(object):

  if tmpfs_enabled and tmpfs_tempdir:
  env["METAFLOW_TEMPDIR"] = tmpfs_path
+
  # Create a ContainerTemplate for this node. Ideally, we would have
  # liked to inline this ContainerTemplate and avoid scanning the workflow
  # twice, but due to issues with variable substitution, we will have to
  # live with this routine.
  if node.parallel_step:
-
  # Explicitly add the task-id-hint label. This is important because this label
- # is returned as an Output parameter of this step and is used subsequently an
- # an input in the join step. Even the num_parallel is used as an output parameter
+ # is returned as an Output parameter of this step and is used subsequently as an
+ # an input in the join step.
  kubernetes_labels = self.kubernetes_labels.copy()
  jobset_name = "{{inputs.parameters.jobset-name}}"
- kubernetes_labels[
- "task_id_entropy"
- ] = "{{inputs.parameters.task-id-entropy}}"
+ kubernetes_labels["task_id_entropy"] = (
+ "{{inputs.parameters.task-id-entropy}}"
+ )
  kubernetes_labels["num_parallel"] = "{{inputs.parameters.num-parallel}}"
  jobset = KubernetesArgoJobSet(
  kubernetes_sdk=kubernetes_sdk,
@@ -1844,9 +1854,11 @@ class ArgoWorkflows(object):
  list(
  []
  if not resources.get("secrets")
- else [resources.get("secrets")]
- if isinstance(resources.get("secrets"), str)
- else resources.get("secrets")
+ else (
+ [resources.get("secrets")]
+ if isinstance(resources.get("secrets"), str)
+ else resources.get("secrets")
+ )
  )
  + KUBERNETES_SECRETS.split(",")
  + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
@@ -1877,7 +1889,6 @@ class ArgoWorkflows(object):
  for k, v in kubernetes_labels.items():
  jobset.label(k, v)

- ## -----Jobset specific env vars START here-----
  jobset.environment_variable(
  "MF_MASTER_ADDR", jobset.jobset_control_addr
  )
@@ -1896,7 +1907,6 @@ class ArgoWorkflows(object):
  "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
  "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
  "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
- # `TASK_ID_SUFFIX` is needed for the construction of the task-ids
  "TASK_ID_SUFFIX": "metadata.annotations['jobset.sigs.k8s.io/job-index']",
  }
  )
@@ -1921,8 +1931,7 @@ class ArgoWorkflows(object):
  )
  for k, v in annotations.items():
  jobset.annotation(k, v)
- ## -----Jobset specific env vars END here-----
- ## ---- Jobset control/workers specific vars START here ----
+
  jobset.control.replicas(1)
  jobset.worker.replicas("{{=asInt(inputs.parameters.workerCount)}}")
  jobset.control.environment_variable("UBF_CONTEXT", UBF_CONTROL)
@@ -1933,7 +1942,6 @@ class ArgoWorkflows(object):
  jobset.control.environment_variable("TASK_ID_PREFIX", "control")
  jobset.worker.environment_variable("TASK_ID_PREFIX", "worker")

- ## ---- Jobset control/workers specific vars END here ----
  yield (
  Template(ArgoWorkflows._sanitize(node.name))
  .resource(
@@ -1960,166 +1968,185 @@ class ArgoWorkflows(object):
  minutes_between_retries=minutes_between_retries,
  )
  )
- continue
- yield (
- Template(self._sanitize(node.name))
- # Set @timeout values
- .active_deadline_seconds(run_time_limit)
- # Set service account
- .service_account_name(resources["service_account"])
- # Configure template input
- .inputs(Inputs().parameters(inputs))
- # Configure template output
- .outputs(Outputs().parameters(outputs))
- # Fail fast!
- .fail_fast()
- # Set @retry/@catch values
- .retry_strategy(
- times=total_retries,
- minutes_between_retries=minutes_between_retries,
- )
- .metadata(
- ObjectMeta().annotation("metaflow/step_name", node.name)
- # Unfortunately, we can't set the task_id since it is generated
- # inside the pod. However, it can be inferred from the annotation
- # set by argo-workflows - `workflows.argoproj.io/outputs` - refer
- # the field 'task-id' in 'parameters'
- # .annotation("metaflow/task_id", ...)
- .annotation("metaflow/attempt", retry_count)
- )
- # Set emptyDir volume for state management
- .empty_dir_volume("out")
- # Set tmpfs emptyDir volume if enabled
- .empty_dir_volume(
- "tmpfs-ephemeral-volume",
- medium="Memory",
- size_limit=tmpfs_size if tmpfs_enabled else 0,
- )
- .empty_dir_volume("dhsm", medium="Memory", size_limit=shared_memory)
- .pvc_volumes(resources.get("persistent_volume_claims"))
- # Set node selectors
- .node_selectors(resources.get("node_selector"))
- # Set tolerations
- .tolerations(resources.get("tolerations"))
- # Set container
- .container(
- # TODO: Unify the logic with kubernetes.py
- # Important note - Unfortunately, V1Container uses snakecase while
- # Argo Workflows uses camel. For most of the attributes, both cases
- # are indistinguishable, but unfortunately, not for all - (
- # env_from, value_from, etc.) - so we need to handle the conversion
- # ourselves using to_camelcase. We need to be vigilant about
- # resources attributes in particular where the keys maybe user
- # defined.
- to_camelcase(
- kubernetes_sdk.V1Container(
- name=self._sanitize(node.name),
- command=cmds,
- ports=[kubernetes_sdk.V1ContainerPort(container_port=port)]
- if port
- else None,
- env=[
- kubernetes_sdk.V1EnvVar(name=k, value=str(v))
- for k, v in env.items()
- ]
- # Add environment variables for book-keeping.
- # https://argoproj.github.io/argo-workflows/fields/#fields_155
- + [
- kubernetes_sdk.V1EnvVar(
- name=k,
- value_from=kubernetes_sdk.V1EnvVarSource(
- field_ref=kubernetes_sdk.V1ObjectFieldSelector(
- field_path=str(v)
+ else:
+ yield (
+ Template(self._sanitize(node.name))
+ # Set @timeout values
+ .active_deadline_seconds(run_time_limit)
+ # Set service account
+ .service_account_name(resources["service_account"])
+ # Configure template input
+ .inputs(Inputs().parameters(inputs))
+ # Configure template output
+ .outputs(Outputs().parameters(outputs))
+ # Fail fast!
+ .fail_fast()
+ # Set @retry/@catch values
+ .retry_strategy(
+ times=total_retries,
+ minutes_between_retries=minutes_between_retries,
+ )
+ .metadata(
+ ObjectMeta().annotation("metaflow/step_name", node.name)
+ # Unfortunately, we can't set the task_id since it is generated
+ # inside the pod. However, it can be inferred from the annotation
+ # set by argo-workflows - `workflows.argoproj.io/outputs` - refer
+ # the field 'task-id' in 'parameters'
+ # .annotation("metaflow/task_id", ...)
+ .annotation("metaflow/attempt", retry_count)
+ )
+ # Set emptyDir volume for state management
+ .empty_dir_volume("out")
+ # Set tmpfs emptyDir volume if enabled
+ .empty_dir_volume(
+ "tmpfs-ephemeral-volume",
+ medium="Memory",
+ size_limit=tmpfs_size if tmpfs_enabled else 0,
+ )
+ .empty_dir_volume("dhsm", medium="Memory", size_limit=shared_memory)
+ .pvc_volumes(resources.get("persistent_volume_claims"))
+ # Set node selectors
+ .node_selectors(resources.get("node_selector"))
+ # Set tolerations
+ .tolerations(resources.get("tolerations"))
+ # Set container
+ .container(
+ # TODO: Unify the logic with kubernetes.py
+ # Important note - Unfortunately, V1Container uses snakecase while
+ # Argo Workflows uses camel. For most of the attributes, both cases
+ # are indistinguishable, but unfortunately, not for all - (
+ # env_from, value_from, etc.) - so we need to handle the conversion
+ # ourselves using to_camelcase. We need to be vigilant about
+ # resources attributes in particular where the keys maybe user
+ # defined.
+ to_camelcase(
+ kubernetes_sdk.V1Container(
+ name=self._sanitize(node.name),
+ command=cmds,
+ termination_message_policy="FallbackToLogsOnError",
+ ports=(
+ [
+ kubernetes_sdk.V1ContainerPort(
+ container_port=port
  )
- ),
- )
- for k, v in {
- "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
- "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
- "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
- "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
- "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
- }.items()
- ],
- image=resources["image"],
- image_pull_policy=resources["image_pull_policy"],
- resources=kubernetes_sdk.V1ResourceRequirements(
- requests={
- "cpu": str(resources["cpu"]),
- "memory": "%sM" % str(resources["memory"]),
- "ephemeral-storage": "%sM" % str(resources["disk"]),
- },
- limits={
- "%s.com/gpu".lower()
- % resources["gpu_vendor"]: str(resources["gpu"])
- for k in [0]
- if resources["gpu"] is not None
- },
- ),
- # Configure secrets
- env_from=[
- kubernetes_sdk.V1EnvFromSource(
- secret_ref=kubernetes_sdk.V1SecretEnvSource(
- name=str(k),
- # optional=True
+ ]
+ if port
+ else None
+ ),
+ env=[
+ kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+ for k, v in env.items()
+ ]
+ # Add environment variables for book-keeping.
+ # https://argoproj.github.io/argo-workflows/fields/#fields_155
+ + [
+ kubernetes_sdk.V1EnvVar(
+ name=k,
+ value_from=kubernetes_sdk.V1EnvVarSource(
+ field_ref=kubernetes_sdk.V1ObjectFieldSelector(
+ field_path=str(v)
+ )
+ ),
  )
- )
- for k in list(
- []
- if not resources.get("secrets")
- else [resources.get("secrets")]
- if isinstance(resources.get("secrets"), str)
- else resources.get("secrets")
- )
- + KUBERNETES_SECRETS.split(",")
- + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
- if k
- ],
- volume_mounts=[
- # Assign a volume mount to pass state to the next task.
- kubernetes_sdk.V1VolumeMount(
- name="out", mount_path="/mnt/out"
- )
- ]
- # Support tmpfs.
- + (
- [
- kubernetes_sdk.V1VolumeMount(
- name="tmpfs-ephemeral-volume",
- mount_path=tmpfs_path,
+ for k, v in {
+ "METAFLOW_KUBERNETES_NAMESPACE": "metadata.namespace",
+ "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+ "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+ "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+ "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+ "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+ }.items()
+ ],
+ image=resources["image"],
+ image_pull_policy=resources["image_pull_policy"],
+ resources=kubernetes_sdk.V1ResourceRequirements(
+ requests={
+ "cpu": str(resources["cpu"]),
+ "memory": "%sM" % str(resources["memory"]),
+ "ephemeral-storage": "%sM"
+ % str(resources["disk"]),
+ },
+ limits={
+ "%s.com/gpu".lower()
+ % resources["gpu_vendor"]: str(resources["gpu"])
+ for k in [0]
+ if resources["gpu"] is not None
+ },
+ ),
+ # Configure secrets
+ env_from=[
+ kubernetes_sdk.V1EnvFromSource(
+ secret_ref=kubernetes_sdk.V1SecretEnvSource(
+ name=str(k),
+ # optional=True
+ )
  )
- ]
- if tmpfs_enabled
- else []
- )
- # Support shared_memory
- + (
- [
- kubernetes_sdk.V1VolumeMount(
- name="dhsm",
- mount_path="/dev/shm",
+ for k in list(
+ []
+ if not resources.get("secrets")
+ else (
+ [resources.get("secrets")]
+ if isinstance(resources.get("secrets"), str)
+ else resources.get("secrets")
+ )
  )
- ]
- if shared_memory
- else []
- )
- # Support persistent volume claims.
- + (
- [
+ + KUBERNETES_SECRETS.split(",")
+ + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+ if k
+ ],
+ volume_mounts=[
+ # Assign a volume mount to pass state to the next task.
  kubernetes_sdk.V1VolumeMount(
- name=claim, mount_path=path
+ name="out", mount_path="/mnt/out"
  )
- for claim, path in resources.get(
- "persistent_volume_claims"
- ).items()
  ]
- if resources.get("persistent_volume_claims") is not None
- else []
- ),
- ).to_dict()
+ # Support tmpfs.
+ + (
+ [
+ kubernetes_sdk.V1VolumeMount(
+ name="tmpfs-ephemeral-volume",
+ mount_path=tmpfs_path,
+ )
+ ]
+ if tmpfs_enabled
+ else []
+ )
+ # Support shared_memory
+ + (
+ [
+ kubernetes_sdk.V1VolumeMount(
+ name="dhsm",
+ mount_path="/dev/shm",
+ )
+ ]
+ if shared_memory
+ else []
+ )
+ # Support persistent volume claims.
+ + (
+ [
+ kubernetes_sdk.V1VolumeMount(
+ name=claim, mount_path=path
+ )
+ for claim, path in resources.get(
+ "persistent_volume_claims"
+ ).items()
+ ]
+ if resources.get("persistent_volume_claims")
+ is not None
+ else []
+ ),
+ ).to_dict()
+ )
  )
  )
- )
+
+ # Return daemon container templates for workflow execution notifications.
+ def _daemon_templates(self):
+ templates = []
+ if self.enable_heartbeat_daemon:
+ templates.append(self._heartbeat_daemon_template())
+ return templates

  # Return exit hook templates for workflow execution notifications.
  def _exit_hook_templates(self):
@@ -2147,8 +2174,150 @@ class ArgoWorkflows(object):
  .success_condition("true == true")
  )
  )
+ if self.enable_error_msg_capture:
+ templates.extend(self._error_msg_capture_hook_templates())
  return templates

+ def _error_msg_capture_hook_templates(self):
+ from kubernetes import client as kubernetes_sdk
+
+ start_step = [step for step in self.graph if step.name == "start"][0]
+ # We want to grab the base image used by the start step, as this is known to be pullable from within the cluster,
+ # and it might contain the required libraries, allowing us to start up faster.
+ resources = dict(
+ [deco for deco in start_step.decorators if deco.name == "kubernetes"][
+ 0
+ ].attributes
+ )
+
+ run_id_template = "argo-{{workflow.name}}"
+ metaflow_version = self.environment.get_environment_info()
+ metaflow_version["flow_name"] = self.graph.name
+ metaflow_version["production_token"] = self.production_token
+
+ mflog_expr = export_mflog_env_vars(
+ datastore_type=self.flow_datastore.TYPE,
+ stdout_path="$PWD/.logs/mflog_stdout",
+ stderr_path="$PWD/.logs/mflog_stderr",
+ flow_name=self.flow.name,
+ run_id=run_id_template,
+ step_name="_run_capture_error",
+ task_id="1",
+ retry_count="0",
+ )
+
+ cmds = " && ".join(
+ [
+ # For supporting sandboxes, ensure that a custom script is executed
+ # before anything else is executed. The script is passed in as an
+ # env var.
+ '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"}',
+ "mkdir -p $PWD/.logs",
+ mflog_expr,
+ ]
+ + self.environment.get_package_commands(
+ self.code_package_url, self.flow_datastore.TYPE
+ )[:-1]
+ # Replace the line 'Task in starting'
+ # FIXME: this can be brittle.
+ + ["mflog 'Error capture hook is starting.'"]
+ + ["argo_error=$(python -m 'metaflow.plugins.argo.capture_error')"]
+ + ["export METAFLOW_ARGO_ERROR=$argo_error"]
+ + [
+ """python -c 'import json, os; error_obj=os.getenv(\\"METAFLOW_ARGO_ERROR\\");data=json.loads(error_obj); print(data[\\"message\\"])'"""
+ ]
+ + [
+ 'if [ -n \\"${ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT}\\" ]; then eval \\"${ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT}\\"; fi'
+ ]
+ )
+
+ # TODO: Also capture the first failed task id
+ cmds = shlex.split('bash -c "%s"' % cmds)
+ env = {
+ # These values are needed by Metaflow to set it's internal
+ # state appropriately.
+ "METAFLOW_CODE_URL": self.code_package_url,
+ "METAFLOW_CODE_SHA": self.code_package_sha,
+ "METAFLOW_CODE_DS": self.flow_datastore.TYPE,
+ "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
+ "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
+ "METAFLOW_USER": "argo-workflows",
+ "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
+ "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+ "METAFLOW_OWNER": self.username,
+ }
+ # support Metaflow sandboxes
+ env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
+ env["METAFLOW_ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT"] = (
+ ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT
+ )
+
+ env["METAFLOW_WORKFLOW_NAME"] = "{{workflow.name}}"
+ env["METAFLOW_WORKFLOW_NAMESPACE"] = "{{workflow.namespace}}"
+ env["METAFLOW_ARGO_WORKFLOW_FAILURES"] = "{{workflow.failures}}"
+ env = {
+ k: v
+ for k, v in env.items()
+ if v is not None
+ and k not in set(ARGO_WORKFLOWS_ENV_VARS_TO_SKIP.split(","))
+ }
+ return [
+ Template("error-msg-capture-hook").container(
+ to_camelcase(
+ kubernetes_sdk.V1Container(
+ name="main",
+ command=cmds,
+ image=resources["image"],
+ env=[
+ kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+ for k, v in env.items()
+ ],
+ env_from=[
+ kubernetes_sdk.V1EnvFromSource(
+ secret_ref=kubernetes_sdk.V1SecretEnvSource(
+ name=str(k),
+ # optional=True
+ )
+ )
+ for k in list(
+ []
+ if not resources.get("secrets")
+ else (
+ [resources.get("secrets")]
+ if isinstance(resources.get("secrets"), str)
+ else resources.get("secrets")
+ )
+ )
+ + KUBERNETES_SECRETS.split(",")
+ + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+ if k
+ ],
+ resources=kubernetes_sdk.V1ResourceRequirements(
+ # NOTE: base resources for this are kept to a minimum to save on running costs.
+ # This has an adverse effect on startup time for the daemon, which can be completely
+ # alleviated by using a base image that has the required dependencies pre-installed
+ requests={
+ "cpu": "200m",
+ "memory": "100Mi",
+ },
+ limits={
+ "cpu": "200m",
+ "memory": "500Mi",
+ },
+ ),
+ )
+ )
+ ),
+ Template("capture-error-hook-fn-preflight").steps(
+ [
+ WorkflowStep()
+ .name("capture-error-hook-fn-preflight")
+ .template("error-msg-capture-hook")
+ .when("{{workflow.status}} != Succeeded")
+ ]
+ ),
+ ]
+
  def _pager_duty_alert_template(self):
  # https://developer.pagerduty.com/docs/ZG9jOjExMDI5NTgx-send-an-alert-event
  if self.notify_pager_duty_integration_key is None:
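
The error-capture hook added above exports the captured failure as JSON in METAFLOW_ARGO_ERROR and then evaluates an optional, user-supplied ARGO_WORKFLOWS_CAPTURE_ERROR_SCRIPT. A hypothetical consumer script could read the payload the same way the inline `python -c` command does; only the "message" field appears in this diff, so any other fields would be an assumption:

    # Hypothetical consumer of the captured error; runnable as-is.
    import json
    import os

    error_blob = os.getenv("METAFLOW_ARGO_ERROR")
    if error_blob:
        error = json.loads(error_blob)
        print("workflow failed:", error["message"])
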
@@ -2327,6 +2496,137 @@ class ArgoWorkflows(object):
  Http("POST").url(self.notify_slack_webhook_url).body(json.dumps(payload))
  )

+ def _heartbeat_daemon_template(self):
+ # Use all the affordances available to _parameters task
+ executable = self.environment.executable("_parameters")
+ run_id = "argo-{{workflow.name}}"
+ entrypoint = [executable, "-m metaflow.plugins.argo.daemon"]
+ heartbeat_cmds = "{entrypoint} --flow_name {flow_name} --run_id {run_id} {tags} heartbeat".format(
+ entrypoint=" ".join(entrypoint),
+ flow_name=self.flow.name,
+ run_id=run_id,
+ tags=" ".join(["--tag %s" % t for t in self.tags]) if self.tags else "",
+ )
+
+ # TODO: we do not really need MFLOG logging for the daemon at the moment, but might be good for the future.
+ # Consider if we can do without this setup.
+ # Configure log capture.
+ mflog_expr = export_mflog_env_vars(
+ datastore_type=self.flow_datastore.TYPE,
+ stdout_path="$PWD/.logs/mflog_stdout",
+ stderr_path="$PWD/.logs/mflog_stderr",
+ flow_name=self.flow.name,
+ run_id=run_id,
+ step_name="_run_heartbeat_daemon",
+ task_id="1",
+ retry_count="0",
+ )
+ # TODO: Can the init be trimmed down?
+ # Can we do without get_package_commands fetching the whole code package?
+ init_cmds = " && ".join(
+ [
+ # For supporting sandboxes, ensure that a custom script is executed
+ # before anything else is executed. The script is passed in as an
+ # env var.
+ '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"}',
+ "mkdir -p $PWD/.logs",
+ mflog_expr,
+ ]
+ + self.environment.get_package_commands(
+ self.code_package_url, self.flow_datastore.TYPE
+ )[:-1]
+ # Replace the line 'Task in starting'
+ # FIXME: this can be brittle.
+ + ["mflog 'Heartbeat daemon is starting.'"]
+ )
+
+ cmd_str = " && ".join([init_cmds, heartbeat_cmds])
+ cmds = shlex.split('bash -c "%s"' % cmd_str)
+
+ # TODO: Check that this is the minimal env.
+ # Env required for sending heartbeats to the metadata service, nothing extra.
+ env = {
+ # These values are needed by Metaflow to set it's internal
+ # state appropriately.
+ "METAFLOW_CODE_URL": self.code_package_url,
+ "METAFLOW_CODE_SHA": self.code_package_sha,
+ "METAFLOW_CODE_DS": self.flow_datastore.TYPE,
+ "METAFLOW_SERVICE_URL": SERVICE_INTERNAL_URL,
+ "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
+ "METAFLOW_USER": "argo-workflows",
+ "METAFLOW_DEFAULT_DATASTORE": self.flow_datastore.TYPE,
+ "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+ "METAFLOW_OWNER": self.username,
+ }
+ # support Metaflow sandboxes
+ env["METAFLOW_INIT_SCRIPT"] = KUBERNETES_SANDBOX_INIT_SCRIPT
+
+ # cleanup env values
+ env = {
+ k: v
+ for k, v in env.items()
+ if v is not None
+ and k not in set(ARGO_WORKFLOWS_ENV_VARS_TO_SKIP.split(","))
+ }
+
+ # We want to grab the base image used by the start step, as this is known to be pullable from within the cluster,
+ # and it might contain the required libraries, allowing us to start up faster.
+ start_step = next(step for step in self.flow if step.name == "start")
+ resources = dict(
+ [deco for deco in start_step.decorators if deco.name == "kubernetes"][
+ 0
+ ].attributes
+ )
+ from kubernetes import client as kubernetes_sdk
+
+ return DaemonTemplate("heartbeat-daemon").container(
+ to_camelcase(
+ kubernetes_sdk.V1Container(
+ name="main",
+ # TODO: Make the image configurable
+ image=resources["image"],
+ command=cmds,
+ env=[
+ kubernetes_sdk.V1EnvVar(name=k, value=str(v))
+ for k, v in env.items()
+ ],
+ env_from=[
+ kubernetes_sdk.V1EnvFromSource(
+ secret_ref=kubernetes_sdk.V1SecretEnvSource(
+ name=str(k),
+ # optional=True
+ )
+ )
+ for k in list(
+ []
+ if not resources.get("secrets")
+ else (
+ [resources.get("secrets")]
+ if isinstance(resources.get("secrets"), str)
+ else resources.get("secrets")
+ )
+ )
+ + KUBERNETES_SECRETS.split(",")
+ + ARGO_WORKFLOWS_KUBERNETES_SECRETS.split(",")
+ if k
+ ],
+ resources=kubernetes_sdk.V1ResourceRequirements(
+ # NOTE: base resources for this are kept to a minimum to save on running costs.
+ # This has an adverse effect on startup time for the daemon, which can be completely
+ # alleviated by using a base image that has the required dependencies pre-installed
+ requests={
+ "cpu": "200m",
+ "memory": "100Mi",
+ },
+ limits={
+ "cpu": "200m",
+ "memory": "100Mi",
+ },
+ ),
+ )
+ )
+ )
+
  def _compile_sensor(self):
  # This method compiles a Metaflow @trigger decorator into Argo Events Sensor.
  #
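
The heartbeat daemon template added above ultimately runs the `heartbeat` command of the new metaflow.plugins.argo.daemon module, built from the format string shown in that hunk. A small sketch of the rendered command line, with illustrative flow name, run id, and tags:

    # Sketch: the heartbeat command the daemon container executes
    # (flow name, run id, and tags below are illustrative values).
    flow_name, run_id, tags = "MyFlow", "argo-myflow-abc123", ["team:ml"]
    heartbeat_cmd = "{entrypoint} --flow_name {flow_name} --run_id {run_id} {tags} heartbeat".format(
        entrypoint="python -m metaflow.plugins.argo.daemon",
        flow_name=flow_name,
        run_id=run_id,
        tags=" ".join("--tag %s" % t for t in tags),
    )
    print(heartbeat_cmd)
    # python -m metaflow.plugins.argo.daemon --flow_name MyFlow --run_id argo-myflow-abc123 --tag team:ml heartbeat
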
@@ -2781,6 +3081,34 @@ class ObjectMeta(object):
  return json.dumps(self.to_json(), indent=4)


+ class WorkflowStep(object):
+ def __init__(self):
+ tree = lambda: defaultdict(tree)
+ self.payload = tree()
+
+ def name(self, name):
+ self.payload["name"] = str(name)
+ return self
+
+ def template(self, template):
+ self.payload["template"] = str(template)
+ return self
+
+ def when(self, condition):
+ self.payload["when"] = str(condition)
+ return self
+
+ def step(self, expression):
+ self.payload["expression"] = str(expression)
+ return self
+
+ def to_json(self):
+ return self.payload
+
+ def __str__(self):
+ return json.dumps(self.to_json(), indent=4)
+
+
  class WorkflowSpec(object):
  # https://argoproj.github.io/argo-workflows/fields/#workflowspec
  # This object sets all Workflow level properties.
@@ -2811,6 +3139,11 @@ class WorkflowSpec(object):
  self.payload["entrypoint"] = entrypoint
  return self

+ def onExit(self, on_exit_template):
+ if on_exit_template:
+ self.payload["onExit"] = on_exit_template
+ return self
+
  def parallelism(self, parallelism):
  # Set parallelism at Workflow level
  self.payload["parallelism"] = int(parallelism)
@@ -2899,6 +3232,25 @@ class Metadata(object):
  return json.dumps(self.to_json(), indent=4)


+ class DaemonTemplate(object):
+ def __init__(self, name):
+ tree = lambda: defaultdict(tree)
+ self.name = name
+ self.payload = tree()
+ self.payload["daemon"] = True
+ self.payload["name"] = name
+
+ def container(self, container):
+ self.payload["container"] = container
+ return self
+
+ def to_json(self):
+ return self.payload
+
+ def __str__(self):
+ return json.dumps(self.payload, indent=4)
+
+
  class Template(object):
  # https://argoproj.github.io/argo-workflows/fields/#template

@@ -2917,6 +3269,18 @@ class Template(object):
  self.payload["dag"] = dag_template.to_json()
  return self

+ def steps(self, steps):
+ if "steps" not in self.payload:
+ self.payload["steps"] = []
+ # steps is a list of lists.
+ # hence we go over every item in the incoming list
+ # serialize it and then append the list to the payload
+ step_list = []
+ for step in steps:
+ step_list.append(step.to_json())
+ self.payload["steps"].append(step_list)
+ return self
+
  def container(self, container):
  # Luckily this can simply be V1Container and we are spared from writing more
  # boilerplate - https://github.com/kubernetes-client/python/blob/master/kubernetes/docs/V1Container.md.
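
Taken together, the new builders compose the features above: WorkflowStep describes one conditional step, Template.steps() wraps it in Argo's list-of-lists steps layout, and DaemonTemplate serializes the heartbeat sidecar with daemon set to true. A minimal sketch, assuming metaflow==2.12.11 is importable and that Template exposes the module's usual to_json() helper; the container dict below is illustrative rather than a full V1Container:

    # Sketch: composing the builder classes added in this diff.
    from metaflow.plugins.argo.argo_workflows import DaemonTemplate, Template, WorkflowStep

    preflight = Template("capture-error-hook-fn-preflight").steps(
        [
            WorkflowStep()
            .name("capture-error-hook-fn-preflight")
            .template("error-msg-capture-hook")
            .when("{{workflow.status}} != Succeeded")
        ]
    )
    heartbeat = DaemonTemplate("heartbeat-daemon").container(
        {"name": "main", "image": "python:3.11"}  # illustrative container spec
    )
    print(preflight.to_json())  # steps serialized as a list of lists
    print(heartbeat)            # payload includes "daemon": true
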