ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (289) hide show
  1. metaflow/R.py +10 -7
  2. metaflow/__init__.py +40 -25
  3. metaflow/_vendor/imghdr/__init__.py +186 -0
  4. metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
  5. metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
  6. metaflow/_vendor/importlib_metadata/_collections.py +30 -0
  7. metaflow/_vendor/importlib_metadata/_compat.py +71 -0
  8. metaflow/_vendor/importlib_metadata/_functools.py +104 -0
  9. metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
  10. metaflow/_vendor/importlib_metadata/_meta.py +48 -0
  11. metaflow/_vendor/importlib_metadata/_text.py +99 -0
  12. metaflow/_vendor/importlib_metadata/py.typed +0 -0
  13. metaflow/_vendor/typeguard/__init__.py +48 -0
  14. metaflow/_vendor/typeguard/_checkers.py +1070 -0
  15. metaflow/_vendor/typeguard/_config.py +108 -0
  16. metaflow/_vendor/typeguard/_decorators.py +233 -0
  17. metaflow/_vendor/typeguard/_exceptions.py +42 -0
  18. metaflow/_vendor/typeguard/_functions.py +308 -0
  19. metaflow/_vendor/typeguard/_importhook.py +213 -0
  20. metaflow/_vendor/typeguard/_memo.py +48 -0
  21. metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
  22. metaflow/_vendor/typeguard/_suppression.py +86 -0
  23. metaflow/_vendor/typeguard/_transformer.py +1229 -0
  24. metaflow/_vendor/typeguard/_union_transformer.py +55 -0
  25. metaflow/_vendor/typeguard/_utils.py +173 -0
  26. metaflow/_vendor/typeguard/py.typed +0 -0
  27. metaflow/_vendor/typing_extensions.py +3641 -0
  28. metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
  29. metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
  30. metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
  31. metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
  32. metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
  33. metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
  34. metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
  35. metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
  36. metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
  37. metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
  38. metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
  39. metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
  40. metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
  41. metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
  42. metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
  43. metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
  44. metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
  45. metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
  46. metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
  47. metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
  48. metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
  49. metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
  50. metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
  51. metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
  52. metaflow/_vendor/yaml/__init__.py +427 -0
  53. metaflow/_vendor/yaml/composer.py +139 -0
  54. metaflow/_vendor/yaml/constructor.py +748 -0
  55. metaflow/_vendor/yaml/cyaml.py +101 -0
  56. metaflow/_vendor/yaml/dumper.py +62 -0
  57. metaflow/_vendor/yaml/emitter.py +1137 -0
  58. metaflow/_vendor/yaml/error.py +75 -0
  59. metaflow/_vendor/yaml/events.py +86 -0
  60. metaflow/_vendor/yaml/loader.py +63 -0
  61. metaflow/_vendor/yaml/nodes.py +49 -0
  62. metaflow/_vendor/yaml/parser.py +589 -0
  63. metaflow/_vendor/yaml/reader.py +185 -0
  64. metaflow/_vendor/yaml/representer.py +389 -0
  65. metaflow/_vendor/yaml/resolver.py +227 -0
  66. metaflow/_vendor/yaml/scanner.py +1435 -0
  67. metaflow/_vendor/yaml/serializer.py +111 -0
  68. metaflow/_vendor/yaml/tokens.py +104 -0
  69. metaflow/cards.py +5 -0
  70. metaflow/cli.py +331 -785
  71. metaflow/cli_args.py +17 -0
  72. metaflow/cli_components/__init__.py +0 -0
  73. metaflow/cli_components/dump_cmd.py +96 -0
  74. metaflow/cli_components/init_cmd.py +52 -0
  75. metaflow/cli_components/run_cmds.py +546 -0
  76. metaflow/cli_components/step_cmd.py +334 -0
  77. metaflow/cli_components/utils.py +140 -0
  78. metaflow/client/__init__.py +1 -0
  79. metaflow/client/core.py +467 -73
  80. metaflow/client/filecache.py +75 -35
  81. metaflow/clone_util.py +7 -1
  82. metaflow/cmd/code/__init__.py +231 -0
  83. metaflow/cmd/develop/stub_generator.py +756 -288
  84. metaflow/cmd/develop/stubs.py +12 -28
  85. metaflow/cmd/main_cli.py +6 -4
  86. metaflow/cmd/make_wrapper.py +78 -0
  87. metaflow/datastore/__init__.py +1 -0
  88. metaflow/datastore/content_addressed_store.py +41 -10
  89. metaflow/datastore/datastore_set.py +11 -2
  90. metaflow/datastore/flow_datastore.py +156 -10
  91. metaflow/datastore/spin_datastore.py +91 -0
  92. metaflow/datastore/task_datastore.py +154 -39
  93. metaflow/debug.py +5 -0
  94. metaflow/decorators.py +404 -78
  95. metaflow/exception.py +8 -2
  96. metaflow/extension_support/__init__.py +527 -376
  97. metaflow/extension_support/_empty_file.py +2 -2
  98. metaflow/extension_support/plugins.py +49 -31
  99. metaflow/flowspec.py +482 -33
  100. metaflow/graph.py +210 -42
  101. metaflow/includefile.py +84 -40
  102. metaflow/lint.py +141 -22
  103. metaflow/meta_files.py +13 -0
  104. metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
  105. metaflow/{metadata → metadata_provider}/metadata.py +86 -1
  106. metaflow/metaflow_config.py +175 -28
  107. metaflow/metaflow_config_funcs.py +51 -3
  108. metaflow/metaflow_current.py +4 -10
  109. metaflow/metaflow_environment.py +139 -53
  110. metaflow/metaflow_git.py +115 -0
  111. metaflow/metaflow_profile.py +18 -0
  112. metaflow/metaflow_version.py +150 -66
  113. metaflow/mflog/__init__.py +4 -3
  114. metaflow/mflog/save_logs.py +2 -2
  115. metaflow/multicore_utils.py +31 -14
  116. metaflow/package/__init__.py +673 -0
  117. metaflow/packaging_sys/__init__.py +880 -0
  118. metaflow/packaging_sys/backend.py +128 -0
  119. metaflow/packaging_sys/distribution_support.py +153 -0
  120. metaflow/packaging_sys/tar_backend.py +99 -0
  121. metaflow/packaging_sys/utils.py +54 -0
  122. metaflow/packaging_sys/v1.py +527 -0
  123. metaflow/parameters.py +149 -28
  124. metaflow/plugins/__init__.py +74 -5
  125. metaflow/plugins/airflow/airflow.py +40 -25
  126. metaflow/plugins/airflow/airflow_cli.py +22 -5
  127. metaflow/plugins/airflow/airflow_decorator.py +1 -1
  128. metaflow/plugins/airflow/airflow_utils.py +5 -3
  129. metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
  130. metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
  131. metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
  132. metaflow/plugins/argo/argo_client.py +78 -33
  133. metaflow/plugins/argo/argo_events.py +6 -6
  134. metaflow/plugins/argo/argo_workflows.py +2410 -527
  135. metaflow/plugins/argo/argo_workflows_cli.py +571 -121
  136. metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
  137. metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
  138. metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
  139. metaflow/plugins/argo/capture_error.py +73 -0
  140. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  141. metaflow/plugins/argo/exit_hooks.py +209 -0
  142. metaflow/plugins/argo/jobset_input_paths.py +15 -0
  143. metaflow/plugins/argo/param_val.py +19 -0
  144. metaflow/plugins/aws/aws_client.py +10 -3
  145. metaflow/plugins/aws/aws_utils.py +55 -2
  146. metaflow/plugins/aws/batch/batch.py +72 -5
  147. metaflow/plugins/aws/batch/batch_cli.py +33 -10
  148. metaflow/plugins/aws/batch/batch_client.py +4 -3
  149. metaflow/plugins/aws/batch/batch_decorator.py +102 -35
  150. metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
  151. metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
  152. metaflow/plugins/aws/step_functions/production_token.py +1 -1
  153. metaflow/plugins/aws/step_functions/step_functions.py +65 -8
  154. metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
  155. metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
  156. metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
  157. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
  158. metaflow/plugins/azure/azure_exceptions.py +1 -1
  159. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
  160. metaflow/plugins/azure/azure_tail.py +1 -1
  161. metaflow/plugins/azure/includefile_support.py +2 -0
  162. metaflow/plugins/cards/card_cli.py +66 -30
  163. metaflow/plugins/cards/card_creator.py +25 -1
  164. metaflow/plugins/cards/card_datastore.py +21 -49
  165. metaflow/plugins/cards/card_decorator.py +132 -8
  166. metaflow/plugins/cards/card_modules/basic.py +112 -17
  167. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  168. metaflow/plugins/cards/card_modules/card.py +16 -1
  169. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  170. metaflow/plugins/cards/card_modules/components.py +665 -28
  171. metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
  172. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  173. metaflow/plugins/cards/card_modules/main.css +1 -0
  174. metaflow/plugins/cards/card_modules/main.js +68 -49
  175. metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
  176. metaflow/plugins/cards/card_modules/test_cards.py +26 -12
  177. metaflow/plugins/cards/card_server.py +39 -14
  178. metaflow/plugins/cards/component_serializer.py +2 -9
  179. metaflow/plugins/cards/metadata.py +22 -0
  180. metaflow/plugins/catch_decorator.py +9 -0
  181. metaflow/plugins/datastores/azure_storage.py +10 -1
  182. metaflow/plugins/datastores/gs_storage.py +6 -2
  183. metaflow/plugins/datastores/local_storage.py +12 -6
  184. metaflow/plugins/datastores/spin_storage.py +12 -0
  185. metaflow/plugins/datatools/local.py +2 -0
  186. metaflow/plugins/datatools/s3/s3.py +126 -75
  187. metaflow/plugins/datatools/s3/s3op.py +254 -121
  188. metaflow/plugins/env_escape/__init__.py +3 -3
  189. metaflow/plugins/env_escape/client_modules.py +102 -72
  190. metaflow/plugins/env_escape/server.py +7 -0
  191. metaflow/plugins/env_escape/stub.py +24 -5
  192. metaflow/plugins/events_decorator.py +343 -185
  193. metaflow/plugins/exit_hook/__init__.py +0 -0
  194. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  195. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  196. metaflow/plugins/gcp/__init__.py +1 -1
  197. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
  198. metaflow/plugins/gcp/gs_tail.py +10 -6
  199. metaflow/plugins/gcp/includefile_support.py +3 -0
  200. metaflow/plugins/kubernetes/kube_utils.py +108 -0
  201. metaflow/plugins/kubernetes/kubernetes.py +411 -130
  202. metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
  203. metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
  204. metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
  205. metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
  206. metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
  207. metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
  208. metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
  209. metaflow/plugins/logs_cli.py +359 -0
  210. metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
  211. metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
  212. metaflow/plugins/metadata_providers/spin.py +16 -0
  213. metaflow/plugins/package_cli.py +36 -24
  214. metaflow/plugins/parallel_decorator.py +128 -11
  215. metaflow/plugins/parsers.py +16 -0
  216. metaflow/plugins/project_decorator.py +51 -5
  217. metaflow/plugins/pypi/bootstrap.py +357 -105
  218. metaflow/plugins/pypi/conda_decorator.py +82 -81
  219. metaflow/plugins/pypi/conda_environment.py +187 -52
  220. metaflow/plugins/pypi/micromamba.py +157 -47
  221. metaflow/plugins/pypi/parsers.py +268 -0
  222. metaflow/plugins/pypi/pip.py +88 -13
  223. metaflow/plugins/pypi/pypi_decorator.py +37 -1
  224. metaflow/plugins/pypi/utils.py +48 -2
  225. metaflow/plugins/resources_decorator.py +2 -2
  226. metaflow/plugins/secrets/__init__.py +3 -0
  227. metaflow/plugins/secrets/secrets_decorator.py +26 -181
  228. metaflow/plugins/secrets/secrets_func.py +49 -0
  229. metaflow/plugins/secrets/secrets_spec.py +101 -0
  230. metaflow/plugins/secrets/utils.py +74 -0
  231. metaflow/plugins/tag_cli.py +4 -7
  232. metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
  233. metaflow/plugins/timeout_decorator.py +3 -3
  234. metaflow/plugins/uv/__init__.py +0 -0
  235. metaflow/plugins/uv/bootstrap.py +128 -0
  236. metaflow/plugins/uv/uv_environment.py +72 -0
  237. metaflow/procpoll.py +1 -1
  238. metaflow/pylint_wrapper.py +5 -1
  239. metaflow/runner/__init__.py +0 -0
  240. metaflow/runner/click_api.py +717 -0
  241. metaflow/runner/deployer.py +470 -0
  242. metaflow/runner/deployer_impl.py +201 -0
  243. metaflow/runner/metaflow_runner.py +714 -0
  244. metaflow/runner/nbdeploy.py +132 -0
  245. metaflow/runner/nbrun.py +225 -0
  246. metaflow/runner/subprocess_manager.py +650 -0
  247. metaflow/runner/utils.py +335 -0
  248. metaflow/runtime.py +1078 -260
  249. metaflow/sidecar/sidecar_worker.py +1 -1
  250. metaflow/system/__init__.py +5 -0
  251. metaflow/system/system_logger.py +85 -0
  252. metaflow/system/system_monitor.py +108 -0
  253. metaflow/system/system_utils.py +19 -0
  254. metaflow/task.py +521 -225
  255. metaflow/tracing/__init__.py +7 -7
  256. metaflow/tracing/span_exporter.py +31 -38
  257. metaflow/tracing/tracing_modules.py +38 -43
  258. metaflow/tuple_util.py +27 -0
  259. metaflow/user_configs/__init__.py +0 -0
  260. metaflow/user_configs/config_options.py +563 -0
  261. metaflow/user_configs/config_parameters.py +598 -0
  262. metaflow/user_decorators/__init__.py +0 -0
  263. metaflow/user_decorators/common.py +144 -0
  264. metaflow/user_decorators/mutable_flow.py +512 -0
  265. metaflow/user_decorators/mutable_step.py +424 -0
  266. metaflow/user_decorators/user_flow_decorator.py +264 -0
  267. metaflow/user_decorators/user_step_decorator.py +749 -0
  268. metaflow/util.py +243 -27
  269. metaflow/vendor.py +23 -7
  270. metaflow/version.py +1 -1
  271. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
  272. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
  273. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
  274. ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
  275. ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
  276. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  277. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
  278. metaflow/_vendor/v3_5/__init__.py +0 -1
  279. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  280. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  281. metaflow/package.py +0 -188
  282. ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
  283. ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
  284. /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
  285. /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
  286. /metaflow/{metadata → metadata_provider}/util.py +0 -0
  287. /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
  288. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
  289. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
@@ -7,26 +7,31 @@ import time
7
7
  from metaflow import current
8
8
  from metaflow.decorators import StepDecorator
9
9
  from metaflow.exception import MetaflowException
10
- from metaflow.metadata import MetaDatum
11
- from metaflow.metadata.util import sync_local_metadata_to_datastore
10
+ from metaflow.metadata_provider import MetaDatum
11
+ from metaflow.metadata_provider.util import sync_local_metadata_to_datastore
12
12
  from metaflow.metaflow_config import (
13
13
  DATASTORE_LOCAL_DIR,
14
+ FEAT_ALWAYS_UPLOAD_CODE_PACKAGE,
14
15
  KUBERNETES_CONTAINER_IMAGE,
15
16
  KUBERNETES_CONTAINER_REGISTRY,
17
+ KUBERNETES_CPU,
18
+ KUBERNETES_DISK,
16
19
  KUBERNETES_FETCH_EC2_METADATA,
17
- KUBERNETES_IMAGE_PULL_POLICY,
18
20
  KUBERNETES_GPU_VENDOR,
21
+ KUBERNETES_IMAGE_PULL_POLICY,
22
+ KUBERNETES_IMAGE_PULL_SECRETS,
23
+ KUBERNETES_MEMORY,
24
+ KUBERNETES_LABELS,
25
+ KUBERNETES_ANNOTATIONS,
19
26
  KUBERNETES_NAMESPACE,
20
27
  KUBERNETES_NODE_SELECTOR,
21
28
  KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
22
- KUBERNETES_TOLERATIONS,
23
- KUBERNETES_SERVICE_ACCOUNT,
24
29
  KUBERNETES_PORT,
30
+ KUBERNETES_SERVICE_ACCOUNT,
25
31
  KUBERNETES_SHARED_MEMORY,
26
- KUBERNETES_PORT,
27
- KUBERNETES_CPU,
28
- KUBERNETES_MEMORY,
29
- KUBERNETES_DISK,
32
+ KUBERNETES_TOLERATIONS,
33
+ KUBERNETES_QOS,
34
+ KUBERNETES_CONDA_ARCH,
30
35
  )
31
36
  from metaflow.plugins.resources_decorator import ResourcesDecorator
32
37
  from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
@@ -34,7 +39,8 @@ from metaflow.sidecar import Sidecar
34
39
  from metaflow.unbounded_foreach import UBF_CONTROL
35
40
 
36
41
  from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
37
- from .kubernetes import KubernetesException, parse_kube_keyvalue_list
42
+ from .kubernetes import KubernetesException
43
+ from .kube_utils import validate_kube_labels, parse_kube_keyvalue_list
38
44
 
39
45
  from metaflow.metaflow_config import MAX_MEMORY_PER_TASK, MAX_CPU_PER_TASK
40
46
 
@@ -44,6 +50,8 @@ except NameError:
44
50
  unicode = str
45
51
  basestring = str
46
52
 
53
+ SUPPORTED_KUBERNETES_QOS_CLASSES = ["Guaranteed", "Burstable"]
54
+
47
55
 
48
56
  class KubernetesDecorator(StepDecorator):
49
57
  """
@@ -68,12 +76,21 @@ class KubernetesDecorator(StepDecorator):
68
76
  not, a default Docker image mapping to the current version of Python is used.
69
77
  image_pull_policy: str, default KUBERNETES_IMAGE_PULL_POLICY
70
78
  If given, the imagePullPolicy to be applied to the Docker image of the step.
79
+ image_pull_secrets: List[str], default []
80
+ The default is extracted from METAFLOW_KUBERNETES_IMAGE_PULL_SECRETS.
81
+ Kubernetes image pull secrets to use when pulling container images
82
+ in Kubernetes.
71
83
  service_account : str, default METAFLOW_KUBERNETES_SERVICE_ACCOUNT
72
84
  Kubernetes service account to use when launching pod in Kubernetes.
73
85
  secrets : List[str], optional, default None
74
86
  Kubernetes secrets to use when launching pod in Kubernetes. These
75
87
  secrets are in addition to the ones defined in `METAFLOW_KUBERNETES_SECRETS`
76
88
  in Metaflow configuration.
89
+ node_selector: Union[Dict[str,str], str], optional, default None
90
+ Kubernetes node selector(s) to apply to the pod running the task.
91
+ Can be passed in as a comma separated string of values e.g.
92
+ 'kubernetes.io/os=linux,kubernetes.io/arch=amd64' or as a dictionary
93
+ {'kubernetes.io/os': 'linux', 'kubernetes.io/arch': 'amd64'}
77
94
  namespace : str, default METAFLOW_KUBERNETES_NAMESPACE
78
95
  Kubernetes namespace to use when launching pod in Kubernetes.
79
96
  gpu : int, optional, default None
@@ -81,9 +98,13 @@ class KubernetesDecorator(StepDecorator):
81
98
  the scheduled node should not have GPUs.
82
99
  gpu_vendor : str, default KUBERNETES_GPU_VENDOR
83
100
  The vendor of the GPUs to be used for this step.
84
- tolerations : List[str], default []
101
+ tolerations : List[Dict[str,str]], default []
85
102
  The default is extracted from METAFLOW_KUBERNETES_TOLERATIONS.
86
103
  Kubernetes tolerations to use when launching pod in Kubernetes.
104
+ labels: Dict[str, str], default: METAFLOW_KUBERNETES_LABELS
105
+ Kubernetes labels to use when launching pod in Kubernetes.
106
+ annotations: Dict[str, str], default: METAFLOW_KUBERNETES_ANNOTATIONS
107
+ Kubernetes annotations to use when launching pod in Kubernetes.
87
108
  use_tmpfs : bool, default False
88
109
  This enables an explicit tmpfs mount for this step.
89
110
  tmpfs_tempdir : bool, default True
@@ -101,6 +122,22 @@ class KubernetesDecorator(StepDecorator):
101
122
  Shared memory size (in MiB) required for this step
102
123
  port: int, optional
103
124
  Port number to specify in the Kubernetes job object
125
+ compute_pool : str, optional, default None
126
+ Compute pool to be used for this step.
127
+ If not specified, any accessible compute pool within the perimeter is used.
128
+ hostname_resolution_timeout: int, default 10 * 60
129
+ Timeout in seconds for the worker tasks in the gang-scheduled cluster to resolve the hostname of the control task.
130
+ Only applicable when @parallel is used.
131
+ qos: str, default: Burstable
132
+ Quality of Service class to assign to the pod. Supported values are: Guaranteed, Burstable
133
+
134
+ security_context: Dict[str, Any], optional, default None
135
+ Container security context. Applies to the task container. Allows the following keys:
136
+ - privileged: bool, optional, default None
137
+ - allow_privilege_escalation: bool, optional, default None
138
+ - run_as_user: int, optional, default None
139
+ - run_as_group: int, optional, default None
140
+ - run_as_non_root: bool, optional, default None
104
141
  """
105
142
 
106
143
  name = "kubernetes"
@@ -110,6 +147,7 @@ class KubernetesDecorator(StepDecorator):
110
147
  "disk": "10240",
111
148
  "image": None,
112
149
  "image_pull_policy": None,
150
+ "image_pull_secrets": None, # e.g., ["regcred"]
113
151
  "service_account": None,
114
152
  "secrets": None, # e.g., mysecret
115
153
  "node_selector": None, # e.g., kubernetes.io/os=linux
@@ -118,6 +156,8 @@ class KubernetesDecorator(StepDecorator):
118
156
  "gpu_vendor": None,
119
157
  "tolerations": None, # e.g., [{"key": "arch", "operator": "Equal", "value": "amd"},
120
158
  # {"key": "foo", "operator": "Equal", "value": "bar"}]
159
+ "labels": None, # e.g. {"test-label": "value", "another-label":"value2"}
160
+ "annotations": None, # e.g. {"note": "value", "another-note": "value2"}
121
161
  "use_tmpfs": None,
122
162
  "tmpfs_tempdir": True,
123
163
  "tmpfs_size": None,
@@ -125,14 +165,22 @@ class KubernetesDecorator(StepDecorator):
125
165
  "persistent_volume_claims": None, # e.g., {"pvc-name": "/mnt/vol", "another-pvc": "/mnt/vol2"}
126
166
  "shared_memory": None,
127
167
  "port": None,
168
+ "compute_pool": None,
169
+ "executable": None,
170
+ "hostname_resolution_timeout": 10 * 60,
171
+ "qos": KUBERNETES_QOS,
172
+ "security_context": None,
128
173
  }
174
+ package_metadata = None
129
175
  package_url = None
130
176
  package_sha = None
131
177
  run_time_limit = None
132
178
 
133
- def __init__(self, attributes=None, statically_defined=False):
134
- super(KubernetesDecorator, self).__init__(attributes, statically_defined)
179
+ # Conda environment support
180
+ supports_conda_environment = True
181
+ target_platform = KUBERNETES_CONDA_ARCH or "linux-64"
135
182
 
183
+ def init(self):
136
184
  if not self.attributes["namespace"]:
137
185
  self.attributes["namespace"] = KUBERNETES_NAMESPACE
138
186
  if not self.attributes["service_account"]:
@@ -152,11 +200,21 @@ class KubernetesDecorator(StepDecorator):
152
200
  )
153
201
  if not self.attributes["image_pull_policy"] and KUBERNETES_IMAGE_PULL_POLICY:
154
202
  self.attributes["image_pull_policy"] = KUBERNETES_IMAGE_PULL_POLICY
203
+ if not self.attributes["image_pull_secrets"] and KUBERNETES_IMAGE_PULL_SECRETS:
204
+ self.attributes["image_pull_secrets"] = json.loads(
205
+ KUBERNETES_IMAGE_PULL_SECRETS
206
+ )
155
207
 
156
208
  if isinstance(self.attributes["node_selector"], str):
157
209
  self.attributes["node_selector"] = parse_kube_keyvalue_list(
158
210
  self.attributes["node_selector"].split(",")
159
211
  )
212
+ if self.attributes["compute_pool"]:
213
+ if self.attributes["node_selector"] is None:
214
+ self.attributes["node_selector"] = {}
215
+ self.attributes["node_selector"].update(
216
+ {"outerbounds.co/compute-pool": self.attributes["compute_pool"]}
217
+ )
160
218
 
161
219
  if self.attributes["tolerations"]:
162
220
  try:
@@ -190,6 +248,36 @@ class KubernetesDecorator(StepDecorator):
190
248
  self.attributes["memory"] = KUBERNETES_MEMORY
191
249
  if self.attributes["disk"] == self.defaults["disk"] and KUBERNETES_DISK:
192
250
  self.attributes["disk"] = KUBERNETES_DISK
251
+ # Label source precedence (decreasing):
252
+ # - System labels (set outside of decorator)
253
+ # - Decorator labels: @kubernetes(labels={})
254
+ # - Environment variable labels: METAFLOW_KUBERNETES_LABELS=
255
+ deco_labels = {}
256
+ if self.attributes["labels"] is not None:
257
+ deco_labels = self.attributes["labels"]
258
+
259
+ env_labels = {}
260
+ if KUBERNETES_LABELS:
261
+ env_labels = parse_kube_keyvalue_list(KUBERNETES_LABELS.split(","), False)
262
+
263
+ self.attributes["labels"] = {**env_labels, **deco_labels}
264
+
265
+ # Annotations
266
+ # annotation precedence (decreasing):
267
+ # - System annotations (set outside of decorator)
268
+ # - Decorator annotations: @kubernetes(annotations={})
269
+ # - Environment annotations: METAFLOW_KUBERNETES_ANNOTATIONS=
270
+ deco_annotations = {}
271
+ if self.attributes["annotations"] is not None:
272
+ deco_annotations = self.attributes["annotations"]
273
+
274
+ env_annotations = {}
275
+ if KUBERNETES_ANNOTATIONS:
276
+ env_annotations = parse_kube_keyvalue_list(
277
+ KUBERNETES_ANNOTATIONS.split(","), False
278
+ )
279
+
280
+ self.attributes["annotations"] = {**env_annotations, **deco_annotations}
193
281
 
194
282
  # If no docker image is explicitly specified, impute a default image.
195
283
  if not self.attributes["image"]:
@@ -238,12 +326,33 @@ class KubernetesDecorator(StepDecorator):
238
326
  self.step = step
239
327
  self.flow_datastore = flow_datastore
240
328
 
329
+ if (
330
+ self.attributes["qos"] is not None
331
+ # case insensitive matching.
332
+ and self.attributes["qos"].lower()
333
+ not in [c.lower() for c in SUPPORTED_KUBERNETES_QOS_CLASSES]
334
+ ):
335
+ raise MetaflowException(
336
+ "*%s* is not a valid Kubernetes QoS class. Choose one of the following: %s"
337
+ % (self.attributes["qos"], ", ".join(SUPPORTED_KUBERNETES_QOS_CLASSES))
338
+ )
339
+
241
340
  if any([deco.name == "batch" for deco in decos]):
242
341
  raise MetaflowException(
243
342
  "Step *{step}* is marked for execution both on AWS Batch and "
244
343
  "Kubernetes. Please use one or the other.".format(step=step)
245
344
  )
246
345
 
346
+ if any([deco.name == "parallel" for deco in decos]) and any(
347
+ [deco.name == "catch" for deco in decos]
348
+ ):
349
+ raise MetaflowException(
350
+ "Step *{step}* contains a @parallel decorator "
351
+ "with the @catch decorator. @catch is not supported with @parallel on Kubernetes.".format(
352
+ step=step
353
+ )
354
+ )
355
+
247
356
  # Set run time limit for the Kubernetes job.
248
357
  self.run_time_limit = get_run_time_limit_for_task(decos)
249
358
  if self.run_time_limit < 60:
@@ -327,7 +436,7 @@ class KubernetesDecorator(StepDecorator):
327
436
 
328
437
  if self.attributes["shared_memory"]:
329
438
  if not (
330
- isinstance(self.attributes["shared_memory"], (int, unicode, basestring))
439
+ isinstance(self.attributes["shared_memory"], int)
331
440
  and int(self.attributes["shared_memory"]) > 0
332
441
  ):
333
442
  raise KubernetesException(
@@ -336,6 +445,9 @@ class KubernetesDecorator(StepDecorator):
336
445
  )
337
446
  )
338
447
 
448
+ validate_kube_labels(self.attributes["labels"])
449
+ # TODO: add validation to annotations as well?
450
+
339
451
  def package_init(self, flow, step_name, environment):
340
452
  try:
341
453
  # Kubernetes is a soft dependency.
@@ -374,12 +486,17 @@ class KubernetesDecorator(StepDecorator):
374
486
  # to execute on Kubernetes anymore. We can execute possible fallback
375
487
  # code locally.
376
488
  cli_args.commands = ["kubernetes", "step"]
489
+ cli_args.command_args.append(self.package_metadata)
377
490
  cli_args.command_args.append(self.package_sha)
378
491
  cli_args.command_args.append(self.package_url)
379
492
 
493
+ # skip certain keys as CLI arguments
494
+ _skip_keys = ["compute_pool", "hostname_resolution_timeout"]
380
495
  # --namespace is used to specify Metaflow namespace (a different
381
496
  # concept from k8s namespace).
382
497
  for k, v in self.attributes.items():
498
+ if k in _skip_keys:
499
+ continue
383
500
  if k == "namespace":
384
501
  cli_args.command_options["k8s_namespace"] = v
385
502
  elif k in {"node_selector"} and v:
@@ -387,7 +504,14 @@ class KubernetesDecorator(StepDecorator):
387
504
  "=".join([key, str(val)]) if val else key
388
505
  for key, val in v.items()
389
506
  ]
390
- elif k in ["tolerations", "persistent_volume_claims"]:
507
+ elif k in [
508
+ "image_pull_secrets",
509
+ "tolerations",
510
+ "persistent_volume_claims",
511
+ "labels",
512
+ "annotations",
513
+ "security_context",
514
+ ]:
391
515
  cli_args.command_options[k] = json.dumps(v)
392
516
  else:
393
517
  cli_args.command_options[k] = v
@@ -422,8 +546,8 @@ class KubernetesDecorator(StepDecorator):
422
546
  # check for the existence of METAFLOW_KUBERNETES_WORKLOAD environment
423
547
  # variable.
424
548
 
549
+ meta = {}
425
550
  if "METAFLOW_KUBERNETES_WORKLOAD" in os.environ:
426
- meta = {}
427
551
  meta["kubernetes-pod-name"] = os.environ["METAFLOW_KUBERNETES_POD_NAME"]
428
552
  meta["kubernetes-pod-namespace"] = os.environ[
429
553
  "METAFLOW_KUBERNETES_POD_NAMESPACE"
@@ -434,10 +558,14 @@ class KubernetesDecorator(StepDecorator):
434
558
  ]
435
559
  meta["kubernetes-node-ip"] = os.environ["METAFLOW_KUBERNETES_NODE_IP"]
436
560
 
561
+ meta["kubernetes-jobset-name"] = os.environ.get(
562
+ "METAFLOW_KUBERNETES_JOBSET_NAME"
563
+ )
564
+
437
565
  # TODO (savin): Introduce equivalent support for Microsoft Azure and
438
566
  # Google Cloud Platform
439
- # TODO: Introduce a way to detect Cloud Provider, so unnecessary requests (and delays)
440
- # can be avoided by not having to try out all providers.
567
+ # TODO: Introduce a way to detect Cloud Provider, so unnecessary requests
568
+ # (and delays) can be avoided by not having to try out all providers.
441
569
  if KUBERNETES_FETCH_EC2_METADATA:
442
570
  instance_meta = get_ec2_instance_metadata()
443
571
  meta.update(instance_meta)
@@ -453,38 +581,51 @@ class KubernetesDecorator(StepDecorator):
453
581
  # "METAFLOW_KUBERNETES_POD_NAME"
454
582
  # ].rpartition("-")[0]
455
583
 
456
- entries = [
457
- MetaDatum(field=k, value=v, type=k, tags=[])
458
- for k, v in meta.items()
459
- if v is not None
460
- ]
461
- # Register book-keeping metadata for debugging.
462
- metadata.register_metadata(run_id, step_name, task_id, entries)
463
-
464
584
  # Start MFLog sidecar to collect task logs.
465
585
  self._save_logs_sidecar = Sidecar("save_logs_periodically")
466
586
  self._save_logs_sidecar.start()
467
587
 
468
- num_parallel = int(os.environ.get("WORLD_SIZE", 0))
469
- if num_parallel >= 1:
588
+ # Start spot termination monitor sidecar.
589
+ current._update_env(
590
+ {"spot_termination_notice": "/tmp/spot_termination_notice"}
591
+ )
592
+ self._spot_monitor_sidecar = Sidecar("spot_termination_monitor")
593
+ self._spot_monitor_sidecar.start()
594
+
595
+ num_parallel = None
596
+ if hasattr(flow, "_parallel_ubf_iter"):
597
+ num_parallel = flow._parallel_ubf_iter.num_parallel
598
+
599
+ if num_parallel and num_parallel > 1:
600
+ _setup_multinode_environment(
601
+ ubf_context, self.attributes["hostname_resolution_timeout"]
602
+ )
603
+ # current.parallel.node_index will be correctly available over here.
604
+ meta.update({"parallel-node-index": current.parallel.node_index})
470
605
  if ubf_context == UBF_CONTROL:
471
- control_task_id = current.task_id
472
- top_task_id = control_task_id.replace("control-", "")
473
- mapper_task_ids = [control_task_id] + [
474
- "%s-node-%d" % (top_task_id, node_idx)
475
- for node_idx in range(1, num_parallel)
476
- ]
477
606
  flow._control_mapper_tasks = [
478
- "%s/%s/%s" % (run_id, step_name, mapper_task_id)
479
- for mapper_task_id in mapper_task_ids
607
+ "{}/{}/{}".format(run_id, step_name, task_id)
608
+ for task_id in [task_id]
609
+ + [
610
+ "%s-worker-%d" % (task_id, idx)
611
+ for idx in range(num_parallel - 1)
612
+ ]
480
613
  ]
481
614
  flow._control_task_is_mapper_zero = True
482
- else:
483
- worker_job_rank = int(os.environ["RANK"])
484
- os.environ["RANK"] = str(worker_job_rank + 1)
485
615
 
486
- if num_parallel >= 1:
487
- _setup_multinode_environment()
616
+ if len(meta) > 0:
617
+ entries = [
618
+ MetaDatum(
619
+ field=k,
620
+ value=v,
621
+ type=k,
622
+ tags=["attempt_id:{0}".format(retry_count)],
623
+ )
624
+ for k, v in meta.items()
625
+ if v is not None
626
+ ]
627
+ # Register book-keeping metadata for debugging.
628
+ metadata.register_metadata(run_id, step_name, task_id, entries)
488
629
 
489
630
  def task_finished(
490
631
  self, step_name, flow, graph, is_task_ok, retry_count, max_retries
@@ -497,10 +638,10 @@ class KubernetesDecorator(StepDecorator):
497
638
  # local file system after the user code has finished execution.
498
639
  # This happens via datastore as a communication bridge.
499
640
 
500
- # TODO: There is no guarantee that task_prestep executes before
501
- # task_finished is invoked. That will result in AttributeError:
502
- # 'KubernetesDecorator' object has no attribute 'metadata' error.
503
- if self.metadata.TYPE == "local":
641
+ # TODO: There is no guarantee that task_pre_step executes before
642
+ # task_finished is invoked.
643
+ # For now we guard against the missing metadata object in this case.
644
+ if hasattr(self, "metadata") and self.metadata.TYPE == "local":
504
645
  # Note that the datastore is *always* Amazon S3 (see
505
646
  # runtime_task_created function).
506
647
  sync_local_metadata_to_datastore(
@@ -509,57 +650,74 @@ class KubernetesDecorator(StepDecorator):
509
650
 
510
651
  try:
511
652
  self._save_logs_sidecar.terminate()
653
+ self._spot_monitor_sidecar.terminate()
512
654
  except:
513
655
  # Best effort kill
514
656
  pass
515
657
 
516
- if is_task_ok and len(getattr(flow, "_control_mapper_tasks", [])) > 1:
517
- self._wait_for_mapper_tasks(flow, step_name)
658
+ @classmethod
659
+ def _save_package_once(cls, flow_datastore, package):
660
+ if cls.package_url is None:
661
+ if not FEAT_ALWAYS_UPLOAD_CODE_PACKAGE:
662
+ cls.package_url, cls.package_sha = flow_datastore.save_data(
663
+ [package.blob], len_hint=1
664
+ )[0]
665
+ cls.package_metadata = package.package_metadata
666
+ else:
667
+ # Blocks until the package is uploaded
668
+ cls.package_url = package.package_url()
669
+ cls.package_sha = package.package_sha()
670
+ cls.package_metadata = package.package_metadata
671
+
672
+
673
+ # TODO: Unify this method with the multi-node setup in @batch
674
+ def _setup_multinode_environment(ubf_context, hostname_resolution_timeout):
675
+ import socket
518
676
 
519
- def _wait_for_mapper_tasks(self, flow, step_name):
677
+ def _wait_for_hostname_resolution(max_wait_timeout=10 * 60):
520
678
  """
521
- When launching multinode task with UBF, need to wait for the secondary
522
- tasks to finish cleanly and produce their output before exiting the
523
- main task. Otherwise, the main task finishing will cause secondary nodes
524
- to terminate immediately, and possibly prematurely.
679
+ keep trying to resolve the hostname of the control task until the hostname is resolved
680
+ or the max_wait_timeout is reached. This is a workaround for the issue where the control
681
+ task is not scheduled before the worker task and the worker task fails because it cannot
682
+ resolve the hostname of the control task.
525
683
  """
526
- from metaflow import Step # avoid circular dependency
527
-
528
- TIMEOUT = 600
529
- last_completion_timeout = time.time() + TIMEOUT
530
- print("Waiting for batch secondary tasks to finish")
531
- while last_completion_timeout > time.time():
532
- time.sleep(2)
684
+ start_time = time.time()
685
+ while True:
533
686
  try:
534
- step_path = "%s/%s/%s" % (flow.name, current.run_id, step_name)
535
- tasks = [task for task in Step(step_path)]
536
- if len(tasks) == len(flow._control_mapper_tasks):
537
- if all(
538
- task.finished_at is not None for task in tasks
539
- ): # for some reason task.finished fails
540
- return True
541
- else:
542
- print(
543
- "Waiting for all parallel tasks to finish. Finished: {}/{}".format(
544
- len(tasks),
545
- len(flow._control_mapper_tasks),
687
+ return socket.gethostbyname(os.environ["MF_MASTER_ADDR"])
688
+ except socket.gaierror:
689
+ if time.time() - start_time > max_wait_timeout:
690
+ raise MetaflowException(
691
+ "Failed to get host by name for MF_MASTER_ADDR after waiting for {} seconds.".format(
692
+ max_wait_timeout
546
693
  )
547
694
  )
548
- except Exception as e:
549
- pass
550
- raise Exception(
551
- "Batch secondary workers did not finish in %s seconds" % TIMEOUT
552
- )
553
-
554
- @classmethod
555
- def _save_package_once(cls, flow_datastore, package):
556
- if cls.package_url is None:
557
- cls.package_url, cls.package_sha = flow_datastore.save_data(
558
- [package.blob], len_hint=1
559
- )[0]
695
+ time.sleep(1)
696
+
697
+ try:
698
+ # Even if Kubernetes may deploy control pods before worker pods, there is always a
699
+ # possibility that the worker pods may start before the control. In the case that this happens,
700
+ # the worker pods will not be able to resolve the control pod's IP address and this will cause
701
+ # the worker pods to fail. So if the worker pods are requesting a hostname resolution, we will
702
+ # make it wait for the name to be resolved within a reasonable timeout period.
703
+ if ubf_context != UBF_CONTROL:
704
+ os.environ["MF_PARALLEL_MAIN_IP"] = _wait_for_hostname_resolution(
705
+ hostname_resolution_timeout
706
+ )
707
+ else:
708
+ os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(
709
+ os.environ["MF_MASTER_ADDR"]
710
+ )
560
711
 
561
- def _setup_multinode_environment():
562
- import socket
563
- os.environ["MF_PARALLEL_MAIN_IP"] = socket.gethostbyname(os.environ["MASTER_ADDR"])
564
- os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["WORLD_SIZE"]
565
- os.environ["MF_PARALLEL_NODE_INDEX"] = os.environ["RANK"]
712
+ os.environ["MF_PARALLEL_NUM_NODES"] = os.environ["MF_WORLD_SIZE"]
713
+ os.environ["MF_PARALLEL_NODE_INDEX"] = (
714
+ str(0)
715
+ if "MF_CONTROL_INDEX" in os.environ
716
+ else str(int(os.environ["MF_WORKER_REPLICA_INDEX"]) + 1)
717
+ )
718
+ except KeyError as e:
719
+ raise MetaflowException("Environment variable {} is missing.".format(e))
720
+ except socket.gaierror:
721
+ raise MetaflowException("Failed to get host by name for MF_MASTER_ADDR.")
722
+ except ValueError:
723
+ raise MetaflowException("Invalid value for MF_WORKER_REPLICA_INDEX.")