ob-metaflow 2.11.13.1__py2.py3-none-any.whl → 2.19.7.1rc0__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (289)
  1. metaflow/R.py +10 -7
  2. metaflow/__init__.py +40 -25
  3. metaflow/_vendor/imghdr/__init__.py +186 -0
  4. metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
  5. metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
  6. metaflow/_vendor/importlib_metadata/_collections.py +30 -0
  7. metaflow/_vendor/importlib_metadata/_compat.py +71 -0
  8. metaflow/_vendor/importlib_metadata/_functools.py +104 -0
  9. metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
  10. metaflow/_vendor/importlib_metadata/_meta.py +48 -0
  11. metaflow/_vendor/importlib_metadata/_text.py +99 -0
  12. metaflow/_vendor/importlib_metadata/py.typed +0 -0
  13. metaflow/_vendor/typeguard/__init__.py +48 -0
  14. metaflow/_vendor/typeguard/_checkers.py +1070 -0
  15. metaflow/_vendor/typeguard/_config.py +108 -0
  16. metaflow/_vendor/typeguard/_decorators.py +233 -0
  17. metaflow/_vendor/typeguard/_exceptions.py +42 -0
  18. metaflow/_vendor/typeguard/_functions.py +308 -0
  19. metaflow/_vendor/typeguard/_importhook.py +213 -0
  20. metaflow/_vendor/typeguard/_memo.py +48 -0
  21. metaflow/_vendor/typeguard/_pytest_plugin.py +127 -0
  22. metaflow/_vendor/typeguard/_suppression.py +86 -0
  23. metaflow/_vendor/typeguard/_transformer.py +1229 -0
  24. metaflow/_vendor/typeguard/_union_transformer.py +55 -0
  25. metaflow/_vendor/typeguard/_utils.py +173 -0
  26. metaflow/_vendor/typeguard/py.typed +0 -0
  27. metaflow/_vendor/typing_extensions.py +3641 -0
  28. metaflow/_vendor/v3_7/importlib_metadata/__init__.py +1063 -0
  29. metaflow/_vendor/v3_7/importlib_metadata/_adapters.py +68 -0
  30. metaflow/_vendor/v3_7/importlib_metadata/_collections.py +30 -0
  31. metaflow/_vendor/v3_7/importlib_metadata/_compat.py +71 -0
  32. metaflow/_vendor/v3_7/importlib_metadata/_functools.py +104 -0
  33. metaflow/_vendor/v3_7/importlib_metadata/_itertools.py +73 -0
  34. metaflow/_vendor/v3_7/importlib_metadata/_meta.py +48 -0
  35. metaflow/_vendor/v3_7/importlib_metadata/_text.py +99 -0
  36. metaflow/_vendor/v3_7/importlib_metadata/py.typed +0 -0
  37. metaflow/_vendor/v3_7/typeguard/__init__.py +48 -0
  38. metaflow/_vendor/v3_7/typeguard/_checkers.py +906 -0
  39. metaflow/_vendor/v3_7/typeguard/_config.py +108 -0
  40. metaflow/_vendor/v3_7/typeguard/_decorators.py +237 -0
  41. metaflow/_vendor/v3_7/typeguard/_exceptions.py +42 -0
  42. metaflow/_vendor/v3_7/typeguard/_functions.py +310 -0
  43. metaflow/_vendor/v3_7/typeguard/_importhook.py +213 -0
  44. metaflow/_vendor/v3_7/typeguard/_memo.py +48 -0
  45. metaflow/_vendor/v3_7/typeguard/_pytest_plugin.py +100 -0
  46. metaflow/_vendor/v3_7/typeguard/_suppression.py +88 -0
  47. metaflow/_vendor/v3_7/typeguard/_transformer.py +1207 -0
  48. metaflow/_vendor/v3_7/typeguard/_union_transformer.py +54 -0
  49. metaflow/_vendor/v3_7/typeguard/_utils.py +169 -0
  50. metaflow/_vendor/v3_7/typeguard/py.typed +0 -0
  51. metaflow/_vendor/v3_7/typing_extensions.py +3072 -0
  52. metaflow/_vendor/yaml/__init__.py +427 -0
  53. metaflow/_vendor/yaml/composer.py +139 -0
  54. metaflow/_vendor/yaml/constructor.py +748 -0
  55. metaflow/_vendor/yaml/cyaml.py +101 -0
  56. metaflow/_vendor/yaml/dumper.py +62 -0
  57. metaflow/_vendor/yaml/emitter.py +1137 -0
  58. metaflow/_vendor/yaml/error.py +75 -0
  59. metaflow/_vendor/yaml/events.py +86 -0
  60. metaflow/_vendor/yaml/loader.py +63 -0
  61. metaflow/_vendor/yaml/nodes.py +49 -0
  62. metaflow/_vendor/yaml/parser.py +589 -0
  63. metaflow/_vendor/yaml/reader.py +185 -0
  64. metaflow/_vendor/yaml/representer.py +389 -0
  65. metaflow/_vendor/yaml/resolver.py +227 -0
  66. metaflow/_vendor/yaml/scanner.py +1435 -0
  67. metaflow/_vendor/yaml/serializer.py +111 -0
  68. metaflow/_vendor/yaml/tokens.py +104 -0
  69. metaflow/cards.py +5 -0
  70. metaflow/cli.py +331 -785
  71. metaflow/cli_args.py +17 -0
  72. metaflow/cli_components/__init__.py +0 -0
  73. metaflow/cli_components/dump_cmd.py +96 -0
  74. metaflow/cli_components/init_cmd.py +52 -0
  75. metaflow/cli_components/run_cmds.py +546 -0
  76. metaflow/cli_components/step_cmd.py +334 -0
  77. metaflow/cli_components/utils.py +140 -0
  78. metaflow/client/__init__.py +1 -0
  79. metaflow/client/core.py +467 -73
  80. metaflow/client/filecache.py +75 -35
  81. metaflow/clone_util.py +7 -1
  82. metaflow/cmd/code/__init__.py +231 -0
  83. metaflow/cmd/develop/stub_generator.py +756 -288
  84. metaflow/cmd/develop/stubs.py +12 -28
  85. metaflow/cmd/main_cli.py +6 -4
  86. metaflow/cmd/make_wrapper.py +78 -0
  87. metaflow/datastore/__init__.py +1 -0
  88. metaflow/datastore/content_addressed_store.py +41 -10
  89. metaflow/datastore/datastore_set.py +11 -2
  90. metaflow/datastore/flow_datastore.py +156 -10
  91. metaflow/datastore/spin_datastore.py +91 -0
  92. metaflow/datastore/task_datastore.py +154 -39
  93. metaflow/debug.py +5 -0
  94. metaflow/decorators.py +404 -78
  95. metaflow/exception.py +8 -2
  96. metaflow/extension_support/__init__.py +527 -376
  97. metaflow/extension_support/_empty_file.py +2 -2
  98. metaflow/extension_support/plugins.py +49 -31
  99. metaflow/flowspec.py +482 -33
  100. metaflow/graph.py +210 -42
  101. metaflow/includefile.py +84 -40
  102. metaflow/lint.py +141 -22
  103. metaflow/meta_files.py +13 -0
  104. metaflow/{metadata → metadata_provider}/heartbeat.py +24 -8
  105. metaflow/{metadata → metadata_provider}/metadata.py +86 -1
  106. metaflow/metaflow_config.py +175 -28
  107. metaflow/metaflow_config_funcs.py +51 -3
  108. metaflow/metaflow_current.py +4 -10
  109. metaflow/metaflow_environment.py +139 -53
  110. metaflow/metaflow_git.py +115 -0
  111. metaflow/metaflow_profile.py +18 -0
  112. metaflow/metaflow_version.py +150 -66
  113. metaflow/mflog/__init__.py +4 -3
  114. metaflow/mflog/save_logs.py +2 -2
  115. metaflow/multicore_utils.py +31 -14
  116. metaflow/package/__init__.py +673 -0
  117. metaflow/packaging_sys/__init__.py +880 -0
  118. metaflow/packaging_sys/backend.py +128 -0
  119. metaflow/packaging_sys/distribution_support.py +153 -0
  120. metaflow/packaging_sys/tar_backend.py +99 -0
  121. metaflow/packaging_sys/utils.py +54 -0
  122. metaflow/packaging_sys/v1.py +527 -0
  123. metaflow/parameters.py +149 -28
  124. metaflow/plugins/__init__.py +74 -5
  125. metaflow/plugins/airflow/airflow.py +40 -25
  126. metaflow/plugins/airflow/airflow_cli.py +22 -5
  127. metaflow/plugins/airflow/airflow_decorator.py +1 -1
  128. metaflow/plugins/airflow/airflow_utils.py +5 -3
  129. metaflow/plugins/airflow/sensors/base_sensor.py +4 -4
  130. metaflow/plugins/airflow/sensors/external_task_sensor.py +2 -2
  131. metaflow/plugins/airflow/sensors/s3_sensor.py +2 -2
  132. metaflow/plugins/argo/argo_client.py +78 -33
  133. metaflow/plugins/argo/argo_events.py +6 -6
  134. metaflow/plugins/argo/argo_workflows.py +2410 -527
  135. metaflow/plugins/argo/argo_workflows_cli.py +571 -121
  136. metaflow/plugins/argo/argo_workflows_decorator.py +43 -12
  137. metaflow/plugins/argo/argo_workflows_deployer.py +106 -0
  138. metaflow/plugins/argo/argo_workflows_deployer_objects.py +453 -0
  139. metaflow/plugins/argo/capture_error.py +73 -0
  140. metaflow/plugins/argo/conditional_input_paths.py +35 -0
  141. metaflow/plugins/argo/exit_hooks.py +209 -0
  142. metaflow/plugins/argo/jobset_input_paths.py +15 -0
  143. metaflow/plugins/argo/param_val.py +19 -0
  144. metaflow/plugins/aws/aws_client.py +10 -3
  145. metaflow/plugins/aws/aws_utils.py +55 -2
  146. metaflow/plugins/aws/batch/batch.py +72 -5
  147. metaflow/plugins/aws/batch/batch_cli.py +33 -10
  148. metaflow/plugins/aws/batch/batch_client.py +4 -3
  149. metaflow/plugins/aws/batch/batch_decorator.py +102 -35
  150. metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
  151. metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
  152. metaflow/plugins/aws/step_functions/production_token.py +1 -1
  153. metaflow/plugins/aws/step_functions/step_functions.py +65 -8
  154. metaflow/plugins/aws/step_functions/step_functions_cli.py +101 -7
  155. metaflow/plugins/aws/step_functions/step_functions_decorator.py +1 -2
  156. metaflow/plugins/aws/step_functions/step_functions_deployer.py +97 -0
  157. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +264 -0
  158. metaflow/plugins/azure/azure_exceptions.py +1 -1
  159. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
  160. metaflow/plugins/azure/azure_tail.py +1 -1
  161. metaflow/plugins/azure/includefile_support.py +2 -0
  162. metaflow/plugins/cards/card_cli.py +66 -30
  163. metaflow/plugins/cards/card_creator.py +25 -1
  164. metaflow/plugins/cards/card_datastore.py +21 -49
  165. metaflow/plugins/cards/card_decorator.py +132 -8
  166. metaflow/plugins/cards/card_modules/basic.py +112 -17
  167. metaflow/plugins/cards/card_modules/bundle.css +1 -1
  168. metaflow/plugins/cards/card_modules/card.py +16 -1
  169. metaflow/plugins/cards/card_modules/chevron/renderer.py +1 -1
  170. metaflow/plugins/cards/card_modules/components.py +665 -28
  171. metaflow/plugins/cards/card_modules/convert_to_native_type.py +36 -7
  172. metaflow/plugins/cards/card_modules/json_viewer.py +232 -0
  173. metaflow/plugins/cards/card_modules/main.css +1 -0
  174. metaflow/plugins/cards/card_modules/main.js +68 -49
  175. metaflow/plugins/cards/card_modules/renderer_tools.py +1 -0
  176. metaflow/plugins/cards/card_modules/test_cards.py +26 -12
  177. metaflow/plugins/cards/card_server.py +39 -14
  178. metaflow/plugins/cards/component_serializer.py +2 -9
  179. metaflow/plugins/cards/metadata.py +22 -0
  180. metaflow/plugins/catch_decorator.py +9 -0
  181. metaflow/plugins/datastores/azure_storage.py +10 -1
  182. metaflow/plugins/datastores/gs_storage.py +6 -2
  183. metaflow/plugins/datastores/local_storage.py +12 -6
  184. metaflow/plugins/datastores/spin_storage.py +12 -0
  185. metaflow/plugins/datatools/local.py +2 -0
  186. metaflow/plugins/datatools/s3/s3.py +126 -75
  187. metaflow/plugins/datatools/s3/s3op.py +254 -121
  188. metaflow/plugins/env_escape/__init__.py +3 -3
  189. metaflow/plugins/env_escape/client_modules.py +102 -72
  190. metaflow/plugins/env_escape/server.py +7 -0
  191. metaflow/plugins/env_escape/stub.py +24 -5
  192. metaflow/plugins/events_decorator.py +343 -185
  193. metaflow/plugins/exit_hook/__init__.py +0 -0
  194. metaflow/plugins/exit_hook/exit_hook_decorator.py +46 -0
  195. metaflow/plugins/exit_hook/exit_hook_script.py +52 -0
  196. metaflow/plugins/gcp/__init__.py +1 -1
  197. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +11 -6
  198. metaflow/plugins/gcp/gs_tail.py +10 -6
  199. metaflow/plugins/gcp/includefile_support.py +3 -0
  200. metaflow/plugins/kubernetes/kube_utils.py +108 -0
  201. metaflow/plugins/kubernetes/kubernetes.py +411 -130
  202. metaflow/plugins/kubernetes/kubernetes_cli.py +168 -36
  203. metaflow/plugins/kubernetes/kubernetes_client.py +104 -2
  204. metaflow/plugins/kubernetes/kubernetes_decorator.py +246 -88
  205. metaflow/plugins/kubernetes/kubernetes_job.py +253 -581
  206. metaflow/plugins/kubernetes/kubernetes_jobsets.py +1071 -0
  207. metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
  208. metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
  209. metaflow/plugins/logs_cli.py +359 -0
  210. metaflow/plugins/{metadata → metadata_providers}/local.py +144 -84
  211. metaflow/plugins/{metadata → metadata_providers}/service.py +103 -26
  212. metaflow/plugins/metadata_providers/spin.py +16 -0
  213. metaflow/plugins/package_cli.py +36 -24
  214. metaflow/plugins/parallel_decorator.py +128 -11
  215. metaflow/plugins/parsers.py +16 -0
  216. metaflow/plugins/project_decorator.py +51 -5
  217. metaflow/plugins/pypi/bootstrap.py +357 -105
  218. metaflow/plugins/pypi/conda_decorator.py +82 -81
  219. metaflow/plugins/pypi/conda_environment.py +187 -52
  220. metaflow/plugins/pypi/micromamba.py +157 -47
  221. metaflow/plugins/pypi/parsers.py +268 -0
  222. metaflow/plugins/pypi/pip.py +88 -13
  223. metaflow/plugins/pypi/pypi_decorator.py +37 -1
  224. metaflow/plugins/pypi/utils.py +48 -2
  225. metaflow/plugins/resources_decorator.py +2 -2
  226. metaflow/plugins/secrets/__init__.py +3 -0
  227. metaflow/plugins/secrets/secrets_decorator.py +26 -181
  228. metaflow/plugins/secrets/secrets_func.py +49 -0
  229. metaflow/plugins/secrets/secrets_spec.py +101 -0
  230. metaflow/plugins/secrets/utils.py +74 -0
  231. metaflow/plugins/tag_cli.py +4 -7
  232. metaflow/plugins/test_unbounded_foreach_decorator.py +41 -6
  233. metaflow/plugins/timeout_decorator.py +3 -3
  234. metaflow/plugins/uv/__init__.py +0 -0
  235. metaflow/plugins/uv/bootstrap.py +128 -0
  236. metaflow/plugins/uv/uv_environment.py +72 -0
  237. metaflow/procpoll.py +1 -1
  238. metaflow/pylint_wrapper.py +5 -1
  239. metaflow/runner/__init__.py +0 -0
  240. metaflow/runner/click_api.py +717 -0
  241. metaflow/runner/deployer.py +470 -0
  242. metaflow/runner/deployer_impl.py +201 -0
  243. metaflow/runner/metaflow_runner.py +714 -0
  244. metaflow/runner/nbdeploy.py +132 -0
  245. metaflow/runner/nbrun.py +225 -0
  246. metaflow/runner/subprocess_manager.py +650 -0
  247. metaflow/runner/utils.py +335 -0
  248. metaflow/runtime.py +1078 -260
  249. metaflow/sidecar/sidecar_worker.py +1 -1
  250. metaflow/system/__init__.py +5 -0
  251. metaflow/system/system_logger.py +85 -0
  252. metaflow/system/system_monitor.py +108 -0
  253. metaflow/system/system_utils.py +19 -0
  254. metaflow/task.py +521 -225
  255. metaflow/tracing/__init__.py +7 -7
  256. metaflow/tracing/span_exporter.py +31 -38
  257. metaflow/tracing/tracing_modules.py +38 -43
  258. metaflow/tuple_util.py +27 -0
  259. metaflow/user_configs/__init__.py +0 -0
  260. metaflow/user_configs/config_options.py +563 -0
  261. metaflow/user_configs/config_parameters.py +598 -0
  262. metaflow/user_decorators/__init__.py +0 -0
  263. metaflow/user_decorators/common.py +144 -0
  264. metaflow/user_decorators/mutable_flow.py +512 -0
  265. metaflow/user_decorators/mutable_step.py +424 -0
  266. metaflow/user_decorators/user_flow_decorator.py +264 -0
  267. metaflow/user_decorators/user_step_decorator.py +749 -0
  268. metaflow/util.py +243 -27
  269. metaflow/vendor.py +23 -7
  270. metaflow/version.py +1 -1
  271. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Makefile +355 -0
  272. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/Tiltfile +726 -0
  273. ob_metaflow-2.19.7.1rc0.data/data/share/metaflow/devtools/pick_services.sh +105 -0
  274. ob_metaflow-2.19.7.1rc0.dist-info/METADATA +87 -0
  275. ob_metaflow-2.19.7.1rc0.dist-info/RECORD +445 -0
  276. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/WHEEL +1 -1
  277. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/entry_points.txt +1 -0
  278. metaflow/_vendor/v3_5/__init__.py +0 -1
  279. metaflow/_vendor/v3_5/importlib_metadata/__init__.py +0 -644
  280. metaflow/_vendor/v3_5/importlib_metadata/_compat.py +0 -152
  281. metaflow/package.py +0 -188
  282. ob_metaflow-2.11.13.1.dist-info/METADATA +0 -85
  283. ob_metaflow-2.11.13.1.dist-info/RECORD +0 -308
  284. /metaflow/_vendor/{v3_5/zipp.py → zipp.py} +0 -0
  285. /metaflow/{metadata → metadata_provider}/__init__.py +0 -0
  286. /metaflow/{metadata → metadata_provider}/util.py +0 -0
  287. /metaflow/plugins/{metadata → metadata_providers}/__init__.py +0 -0
  288. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info/licenses}/LICENSE +0 -0
  289. {ob_metaflow-2.11.13.1.dist-info → ob_metaflow-2.19.7.1rc0.dist-info}/top_level.txt +0 -0
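The hunks that follow are rendered per file by the registry's diff viewer. For inspecting changes locally, here is a minimal sketch, assuming both wheels have been downloaded under the names below and that the member path exists in both archives, that reproduces a member-level diff using only the Python standard library:

    import difflib
    import zipfile

    # Assumed local filenames; adjust to wherever the wheels were downloaded.
    OLD = "ob_metaflow-2.11.13.1-py2.py3-none-any.whl"
    NEW = "ob_metaflow-2.19.7.1rc0-py2.py3-none-any.whl"
    # Assumed member path; pick any file from the list above.
    MEMBER = "metaflow/plugins/kubernetes/kubernetes_job.py"

    def read_member(wheel_path, member):
        # Wheels are zip archives, so members can be read without installing.
        with zipfile.ZipFile(wheel_path) as zf:
            return zf.read(member).decode("utf-8", errors="replace").splitlines()

    for line in difflib.unified_diff(
        read_member(OLD, MEMBER),
        read_member(NEW, MEMBER),
        fromfile=OLD,
        tofile=NEW,
        lineterm="",
    ):
        print(line)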
@@ -2,18 +2,24 @@ import json
 import math
 import random
 import time
-import os
-import socket
-import copy
 
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS
+from metaflow.tracing import inject_tracing_vars
+from metaflow.metaflow_config_funcs import init_config
 
 CLIENT_REFRESH_INTERVAL_SECONDS = 300
 
+from .kube_utils import qos_requests_and_limits
+from .kubernetes_jobsets import (
+    KubernetesJobSet,
+)  # We need this import for Kubernetes Client.
+
+
 class KubernetesJobException(MetaflowException):
     headline = "Kubernetes job error"
 
+
 # Implements truncated exponential backoff from
 # https://cloud.google.com/storage/docs/retry-strategy#exponential-backoff
 def k8s_retry(deadline_seconds=60, max_backoff=32):
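The hunk above stops at the signature of k8s_retry, whose body is unchanged and therefore not shown. For orientation, a hedged sketch of the truncated exponential backoff strategy that the linked document describes; this is illustrative, not the shipped implementation:

    import functools
    import random
    import time

    def retry_with_truncated_backoff(deadline_seconds=60, max_backoff=32):
        # Hedged sketch of the strategy k8s_retry's comment points to; names and
        # details here are assumptions, not the code shipped in this wheel.
        def decorator(function):
            @functools.wraps(function)
            def wrapper(*args, **kwargs):
                deadline = time.time() + deadline_seconds
                attempt = 0
                while True:
                    try:
                        return function(*args, **kwargs)
                    except Exception:
                        # Truncated exponential backoff: 2^n plus jitter, capped.
                        delay = min(2**attempt + random.random(), max_backoff)
                        if time.time() + delay >= deadline:
                            raise  # retry budget exhausted: surface the last error
                        time.sleep(delay)
                        attempt += 1
            return wrapper
        return decorator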
@@ -56,19 +62,7 @@ class KubernetesJob(object):
         self._client = client
         self._kwargs = kwargs
 
-    def create(self):
-        # A discerning eye would notice and question the choice of using the
-        # V1Job construct over the V1Pod construct given that we don't rely much
-        # on any of the V1Job semantics. The major reasons at the moment are -
-        #     1. It makes the Kubernetes UIs (Octant, Lens) a bit easier on
-        #        the eyes, although even that can be questioned.
-        #     2. AWS Step Functions, at the moment (Apr' 22) only supports
-        #        executing Jobs and not Pods as part of it's publicly declared
-        #        API. When we ship the AWS Step Functions integration with EKS,
-        #        it will hopefully lessen our workload.
-        #
-        # Note: This implementation ensures that there is only one unique Pod
-        #       (unique UID) per Metaflow task attempt.
+    def create_job_spec(self):
         client = self._client.get()
 
         # tmpfs variables
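The hunk above renames create() to create_job_spec(): spec construction is split out from V1Job creation, and create() is re-added further down as a thin wrapper. A hedged usage sketch; `k8s_job` stands for a hypothetical, fully-configured KubernetesJob instance:

    def build_job(k8s_job):
        # create_job_spec() returns just the V1JobSpec, which becomes reusable by
        # other launchers (e.g. the JobSet-based multinode path that moved to
        # kubernetes_jobsets.py per the file list above).
        spec = k8s_job.create_job_spec()
        # create() wraps the same spec in a V1Job and returns the KubernetesJob itself.
        return k8s_job.create()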
@@ -80,529 +74,139 @@ class KubernetesJob(object):
             if self._kwargs["shared_memory"]
             else None
         )
-
-        jobset_name = "js-%s" % self._kwargs["attrs"]["metaflow.task_id"].split('-')[-1]
-        main_job_name = "control"
-        main_job_index = 0
-        main_pod_index = 0
-        subdomain = jobset_name
-        master_port = int(self._kwargs['port']) if self._kwargs['port'] else None
-        shared_memory = int(self._kwargs['shared_memory']) if self._kwargs['shared_memory'] else None
-
-        passwordless_ssh = self._kwargs["attrs"]["requires_passwordless_ssh"]
-        if passwordless_ssh:
-            passwordless_ssh_service_name = subdomain
-            passwordless_ssh_service_selector = {
-                "passwordless-ssh-jobset": "true"
-            }
-        else:
-            passwordless_ssh_service_name = None
-            passwordless_ssh_service_selector = {}
-
-        fqdn_suffix = "%s.svc.cluster.local" % self._kwargs["namespace"]
-        jobset_main_addr = "%s-%s-%s-%s.%s.%s" % (
-            jobset_name,
-            main_job_name,
-            main_job_index,
-            main_pod_index,
-            subdomain,
-            fqdn_suffix,
+        qos_requests, qos_limits = qos_requests_and_limits(
+            self._kwargs["qos"],
+            self._kwargs["cpu"],
+            self._kwargs["memory"],
+            self._kwargs["disk"],
         )
-
-        def _install_jobset(
-            repo_url="https://github.com/kubernetes-sigs/jobset",
-            python_sdk_path="jobset/sdk/python",
-        ):
-
-            # TODO (Eddie): Remove this and suggest to user.
-
-            import subprocess
-            import tempfile
-            import shutil
-            import os
-
-            with open(os.devnull, "wb") as devnull:
-                cwd = os.getcwd()
-                tmp_dir = tempfile.mkdtemp()
-                os.chdir(tmp_dir)
-                subprocess.check_call(
-                    ["git", "clone", repo_url], stdout=devnull, stderr=subprocess.STDOUT
-                )
-                tmp_python_sdk_path = os.path.join(tmp_dir, python_sdk_path)
-                os.chdir(tmp_python_sdk_path)
-                subprocess.check_call(
-                    ["pip", "install", "."], stdout=devnull, stderr=subprocess.STDOUT
-                )
-                os.chdir(cwd)
-                shutil.rmtree(tmp_dir)
-
-        def _get_passwordless_ssh_service():
-
-            return client.V1Service(
-                api_version="v1",
-                kind="Service",
-                metadata=client.V1ObjectMeta(
-                    name=passwordless_ssh_service_name,
-                    namespace=self._kwargs["namespace"]
-                ),
-                spec=client.V1ServiceSpec(
-                    cluster_ip="None",
-                    internal_traffic_policy="Cluster",
-                    ip_families=["IPv4"],
-                    ip_family_policy="SingleStack",
-                    selector=passwordless_ssh_service_selector,
-                    session_affinity="None",
-                    type="ClusterIP",
-                    ports=[
-                        client.V1ServicePort(
-                            name="control",
-                            port=22,
-                            protocol="TCP",
-                            target_port=22
-                        )
-                    ]
+        initial_configs = init_config()
+        for entry in ["OBP_PERIMETER", "OBP_INTEGRATIONS_URL"]:
+            if entry not in initial_configs:
+                raise KubernetesJobException(
+                    f"{entry} was not found in metaflow config. Please make sure to run the `outerbounds configure <...>` command, which can be found on the Outerbounds UI, or reach out to your Outerbounds support team."
                 )
-            )
 
-        def _get_replicated_job(job_name, parallelism, command):
-            return jobset.models.jobset_v1alpha2_replicated_job.JobsetV1alpha2ReplicatedJob(
-                name=job_name,
-                template=client.V1JobTemplateSpec(
-                    metadata=client.V1ObjectMeta(
-                        annotations=self._kwargs.get("annotations", {}),
-                        labels=self._kwargs.get("labels", {}),
-                        namespace=self._kwargs["namespace"],
-                    ),
-                    spec=client.V1JobSpec(
-                        parallelism=parallelism,  # how many jobs can run at once
-                        completions=parallelism,  # how many Pods the JobSet creates in total
-                        backoff_limit=0,
-                        ttl_seconds_after_finished=7
-                        * 60
-                        * 60
-                        * 24,
-                        template=client.V1PodTemplateSpec(
-                            metadata=client.V1ObjectMeta(
-                                annotations=self._kwargs.get("annotations", {}),
-                                labels={
-                                    **self._kwargs.get("labels", {}),
-                                    **passwordless_ssh_service_selector,  # TODO: necessary?
-                                    # TODO: cluster-name, app.kubernetes.io/name necessary?
-                                },
-                                namespace=self._kwargs["namespace"],
-                            ),
-                            spec=client.V1PodSpec(
-                                active_deadline_seconds=self._kwargs[
-                                    "timeout_in_seconds"
-                                ],
-                                containers=[
-                                    client.V1Container(
-                                        command=command,
-                                        ports=[client.V1ContainerPort(container_port=master_port)] if master_port and job_name == "control" else [],
-                                        env=[
-                                            client.V1EnvVar(name=k, value=str(v))
-                                            for k, v in self._kwargs.get(
-                                                "environment_variables", {}
-                                            ).items()
-                                        ]
-                                        + [
-                                            client.V1EnvVar(
-                                                name=k,
-                                                value_from=client.V1EnvVarSource(
-                                                    field_ref=client.V1ObjectFieldSelector(
-                                                        field_path=str(v)
-                                                    )
-                                                ),
-                                            )
-                                            for k, v in {
-                                                "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-                                                "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-                                                "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-                                                "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-                                                "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-                                            }.items()
-                                        ]
-                                        # Mimicking the AWS Batch Multinode env vars.
-                                        + [
-                                            client.V1EnvVar(
-                                                name="MASTER_ADDR",
-                                                value=jobset_main_addr,
-                                            ),
-                                            client.V1EnvVar(
-                                                name="MASTER_PORT",
-                                                value=str(master_port),
-                                            ),
-                                            client.V1EnvVar(
-                                                name="RANK",
-                                                value_from=client.V1EnvVarSource(
-                                                    field_ref=client.V1ObjectFieldSelector(
-                                                        field_path="metadata.annotations['batch.kubernetes.io/job-completion-index']"
-                                                    )
-                                                ),
-                                            ),
-                                            client.V1EnvVar(
-                                                name="WORLD_SIZE",
-                                                value=str(self._kwargs["num_parallel"]),
-                                            ),
-                                            client.V1EnvVar(
-                                                name="PYTHONUNBUFFERED",
-                                                value="0",
-                                            ),
-                                        ],
-                                        env_from=[
-                                            client.V1EnvFromSource(
-                                                secret_ref=client.V1SecretEnvSource(
-                                                    name=str(k),
-                                                    # optional=True
-                                                )
-                                            )
-                                            for k in list(
-                                                self._kwargs.get("secrets", [])
-                                            )
-                                            + KUBERNETES_SECRETS.split(",")
-                                            if k
-                                        ],
-                                        image=self._kwargs["image"],
-                                        image_pull_policy=self._kwargs[
-                                            "image_pull_policy"
-                                        ],
-                                        name=self._kwargs["step_name"].replace(
-                                            "_", "-"
-                                        ),
-                                        resources=client.V1ResourceRequirements(
-                                            requests={
-                                                "cpu": str(self._kwargs["cpu"]),
-                                                "memory": "%sM"
-                                                % str(self._kwargs["memory"]),
-                                                "ephemeral-storage": "%sM"
-                                                % str(self._kwargs["disk"]),
-                                            },
-                                            limits={
-                                                "%s.com/gpu".lower()
-                                                % self._kwargs["gpu_vendor"]: str(
-                                                    self._kwargs["gpu"]
-                                                )
-                                                for k in [0]
-                                                # Don't set GPU limits if gpu isn't specified.
-                                                if self._kwargs["gpu"] is not None
-                                            },
-                                        ),
-                                        volume_mounts=(
-                                            [
-                                                client.V1VolumeMount(
-                                                    mount_path=self._kwargs.get(
-                                                        "tmpfs_path"
-                                                    ),
-                                                    name="tmpfs-ephemeral-volume",
-                                                )
-                                            ]
-                                            if tmpfs_enabled
-                                            else []
-                                        )
-                                        + (
-                                            [
-                                                client.V1VolumeMount(
-                                                    mount_path="/dev/shm",
-                                                    name="dhsm"
-                                                )
-                                            ]
-                                            if shared_memory else []
-                                        )
-                                        + (
-                                            [
-                                                client.V1VolumeMount(
-                                                    mount_path=path, name=claim
-                                                )
-                                                for claim, path in self._kwargs[
-                                                    "persistent_volume_claims"
-                                                ].items()
-                                            ]
-                                            if self._kwargs["persistent_volume_claims"]
-                                            is not None
-                                            else []
-                                        ),
-                                    )
-                                ],
-                                node_selector=self._kwargs.get("node_selector"),
-                                restart_policy="Never",
-
-                                set_hostname_as_fqdn=True,  # configure pod hostname as pod's FQDN
-                                share_process_namespace=False,  # default
-                                subdomain=subdomain,  # FQDN = <hostname>.<subdomain>.<pod namespace>.svc.<cluster domain>
-
-                                service_account_name=self._kwargs["service_account"],
-                                termination_grace_period_seconds=0,
-                                tolerations=[
-                                    client.V1Toleration(**toleration)
-                                    for toleration in self._kwargs.get("tolerations")
-                                    or []
-                                ],
-                                volumes=(
-                                    [
-                                        client.V1Volume(
-                                            name="tmpfs-ephemeral-volume",
-                                            empty_dir=client.V1EmptyDirVolumeSource(
-                                                medium="Memory",
-                                                size_limit="{}Mi".format(tmpfs_size),
-                                            ),
-                                        )
-                                    ]
-                                    if tmpfs_enabled
-                                    else []
-                                )
-                                + (
-                                    [
-                                        client.V1Volume(
-                                            name="dhsm",
-                                            empty_dir=client.V1EmptyDirVolumeSource(
-                                                medium="Memory",
-                                                size_limit="{}Mi".format(shared_memory),
-                                            )
-                                        )
-                                    ]
-                                    if shared_memory else []
-                                )
-                                + (
-                                    [
-                                        client.V1Volume(
-                                            name=claim,
-                                            persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                                                claim_name=claim
-                                            ),
-                                        )
-                                        for claim in self._kwargs[
-                                            "persistent_volume_claims"
-                                        ].keys()
-                                    ]
-                                    if self._kwargs["persistent_volume_claims"]
-                                    is not None
-                                    else []
-                                ),
-                            ),
-                        ),
-                    ),
-                ),
-            )
-
-        if "num_parallel" in self._kwargs and self._kwargs["num_parallel"] >= 1:
-
-            try:
-                import jobset
-            except ImportError:
-                _install_jobset()
-                import jobset
-
-            main_commands = copy.copy(self._kwargs["command"])
-            main_commands[-1] = main_commands[-1].replace(
-                "[multinode-args]", "--split-index 0"
-            )
-
-            task_id = self._kwargs["attrs"]["metaflow.task_id"]
-            secondary_commands = copy.copy(self._kwargs["command"])
-            # RANK needs +1 because control node is not in the worker index group, yet we want global nodes.
-            # Technically, control and worker could be same replicated job type, but cleaner to separate for future use cases.
-            secondary_commands[-1] = secondary_commands[-1].replace(
-                "[multinode-args]", "--split-index `expr $RANK + 1`"
-            )
-            secondary_commands[-1] = secondary_commands[-1].replace(
-                "ubf_control", "ubf_task"
-            )
-            secondary_commands[-1] = secondary_commands[-1].replace(
-                task_id,
-                task_id.replace("control-", "") + "-node-`expr $RANK + 1`",
-            )
-
-            if passwordless_ssh:
-                if not os.path.exists("/usr/sbin/sshd"):
-                    raise KubernetesJobException(
-                        "This @parallel decorator requires sshd to be installed in the container image."
-                        "Please install OpenSSH."
-                    )
-
-                # run sshd in background
-                main_commands[-1] = "/usr/sbin/sshd -D & %s" % main_commands[-1]
-                secondary_commands[-1] = "/usr/sbin/sshd -D & %s" % secondary_commands[-1]
-
-            replicated_jobs = [_get_replicated_job("control", 1, main_commands)]
-            if self._kwargs["num_parallel"] > 1:
-                replicated_jobs.append(
-                    _get_replicated_job("worker", self._kwargs["num_parallel"] - 1, secondary_commands)
-                )
+        additional_obp_configs = {
+            "OBP_PERIMETER": initial_configs["OBP_PERIMETER"],
+            "OBP_INTEGRATIONS_URL": initial_configs[
+                "OBP_INTEGRATIONS_URL"
+            ],
+        }
+
+        security_context = self._kwargs.get("security_context", {})
+        _security_context = {}
+        if security_context is not None and len(security_context) > 0:
+            _security_context = {
+                "security_context": client.V1SecurityContext(**security_context)
+            }
 
-            self._jobset = jobset.models.jobset_v1alpha2_job_set.JobsetV1alpha2JobSet(
-                api_version="jobset.x-k8s.io/v1alpha2",
-                kind="JobSet",
+        return client.V1JobSpec(
+            # Retries are handled by Metaflow when it is responsible for
+            # executing the flow. The responsibility is moved to Kubernetes
+            # when Argo Workflows is responsible for the execution.
+            backoff_limit=self._kwargs.get("retries", 0),
+            completions=self._kwargs.get("completions", 1),
+            ttl_seconds_after_finished=7
+            * 60
+            * 60  # Remove job after a week. TODO: Make this configurable
+            * 24,
+            template=client.V1PodTemplateSpec(
                 metadata=client.V1ObjectMeta(
                     annotations=self._kwargs.get("annotations", {}),
                     labels=self._kwargs.get("labels", {}),
-                    name=jobset_name,
                     namespace=self._kwargs["namespace"],
                 ),
-                spec=jobset.models.jobset_v1alpha2_job_set_spec.JobsetV1alpha2JobSetSpec(
-                    network=jobset.models.jobset_v1alpha2_network.JobsetV1alpha2Network(
-                        enable_dns_hostnames=True if not self._kwargs['attrs']['requires_passwordless_ssh'] else False,
-                        subdomain=subdomain
-                    ),
-                    replicated_jobs=replicated_jobs
-                ),
-            )
-            self._passwordless_ssh_service = _get_passwordless_ssh_service()
-        else:
-            self._job = client.V1Job(
-                api_version="batch/v1",
-                kind="Job",
-                metadata=client.V1ObjectMeta(
-                    # Annotations are for humans
-                    annotations=self._kwargs.get("annotations", {}),
-                    # While labels are for Kubernetes
-                    labels=self._kwargs.get("labels", {}),
-                    generate_name=self._kwargs["generate_name"],
-                    namespace=self._kwargs["namespace"],  # Defaults to `default`
-                ),
-                spec=client.V1JobSpec(
-                    # Retries are handled by Metaflow when it is responsible for
-                    # executing the flow. The responsibility is moved to Kubernetes
-                    # when Argo Workflows is responsible for the execution.
-                    backoff_limit=self._kwargs.get("retries", 0),
-                    completions=1,  # A single non-indexed pod job
-                    ttl_seconds_after_finished=7
-                    * 60
-                    * 60  # Remove job after a week. TODO: Make this configurable
-                    * 24,
-                    template=client.V1PodTemplateSpec(
-                        metadata=client.V1ObjectMeta(
-                            annotations=self._kwargs.get("annotations", {}),
-                            labels=self._kwargs.get("labels", {}),
-                            namespace=self._kwargs["namespace"],
-                        ),
-                        spec=client.V1PodSpec(
-                            # Timeout is set on the pod and not the job (important!)
-                            active_deadline_seconds=self._kwargs["timeout_in_seconds"],
-                            # TODO (savin): Enable affinities for GPU scheduling.
-                            # affinity=?,
-                            containers=[
-                                client.V1Container(
-                                    command=self._kwargs["command"],
-                                    ports=[
-                                        client.V1ContainerPort(
-                                            container_port=int(self._kwargs["port"])
-                                        )
-                                    ]
-                                    if "port" in self._kwargs and self._kwargs["port"]
-                                    else None,
-                                    env=[
-                                        client.V1EnvVar(name=k, value=str(v))
-                                        for k, v in self._kwargs.get(
-                                            "environment_variables", {}
-                                        ).items()
-                                    ]
-                                    # And some downward API magic. Add (key, value)
-                                    # pairs below to make pod metadata available
-                                    # within Kubernetes container.
-                                    + [
-                                        client.V1EnvVar(
-                                            name=k,
-                                            value_from=client.V1EnvVarSource(
-                                                field_ref=client.V1ObjectFieldSelector(
-                                                    field_path=str(v)
-                                                )
-                                            ),
-                                        )
-                                        for k, v in {
-                                            "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-                                            "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-                                            "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-                                            "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-                                            "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-                                        }.items()
-                                    ],
-                                    env_from=[
-                                        client.V1EnvFromSource(
-                                            secret_ref=client.V1SecretEnvSource(
-                                                name=str(k),
-                                                # optional=True
-                                            )
-                                        )
-                                        for k in list(self._kwargs.get("secrets", []))
-                                        + KUBERNETES_SECRETS.split(",")
-                                        if k
-                                    ],
-                                    image=self._kwargs["image"],
-                                    image_pull_policy=self._kwargs["image_pull_policy"],
-                                    name=self._kwargs["step_name"].replace("_", "-"),
-                                    resources=client.V1ResourceRequirements(
-                                        requests={
-                                            "cpu": str(self._kwargs["cpu"]),
-                                            "memory": "%sM"
-                                            % str(self._kwargs["memory"]),
-                                            "ephemeral-storage": "%sM"
-                                            % str(self._kwargs["disk"]),
-                                        },
-                                        limits={
-                                            "%s.com/gpu".lower()
-                                            % self._kwargs["gpu_vendor"]: str(
-                                                self._kwargs["gpu"]
-                                            )
-                                            for k in [0]
-                                            # Don't set GPU limits if gpu isn't specified.
-                                            if self._kwargs["gpu"] is not None
-                                        },
-                                    ),
-                                    volume_mounts=(
-                                        [
-                                            client.V1VolumeMount(
-                                                mount_path=self._kwargs.get(
-                                                    "tmpfs_path"
-                                                ),
-                                                name="tmpfs-ephemeral-volume",
-                                            )
-                                        ]
-                                        if tmpfs_enabled
-                                        else []
+                spec=client.V1PodSpec(
+                    # Timeout is set on the pod and not the job (important!)
+                    active_deadline_seconds=self._kwargs["timeout_in_seconds"],
+                    # TODO (savin): Enable affinities for GPU scheduling.
+                    # affinity=?,
+                    containers=[
+                        client.V1Container(
+                            command=self._kwargs["command"],
+                            termination_message_policy="FallbackToLogsOnError",
+                            ports=(
+                                []
+                                if self._kwargs["port"] is None
+                                else [
+                                    client.V1ContainerPort(
+                                        container_port=int(self._kwargs["port"])
                                     )
-                                    + (
-                                        [
-                                            client.V1VolumeMount(
-                                                mount_path=path, name=claim
-                                            )
-                                            for claim, path in self._kwargs[
-                                                "persistent_volume_claims"
-                                            ].items()
-                                        ]
-                                        if self._kwargs["persistent_volume_claims"]
-                                        is not None
-                                        else []
+                                ]
+                            ),
+                            env=[
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in self._kwargs.get(
+                                    "environment_variables", {}
+                                ).items()
+                            ]
+                            # And some downward API magic. Add (key, value)
+                            # pairs below to make pod metadata available
+                            # within Kubernetes container.
+                            + [
+                                client.V1EnvVar(
+                                    name=k,
+                                    value_from=client.V1EnvVarSource(
+                                        field_ref=client.V1ObjectFieldSelector(
+                                            field_path=str(v)
+                                        )
                                     ),
                                 )
+                                for k, v in {
+                                    "METAFLOW_KUBERNETES_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                    "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                    "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                    "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                }.items()
+                            ]
+                            + [
+                                client.V1EnvVar(
+                                    name=k,
+                                    value=v,
+                                )
+                                for k, v in additional_obp_configs.items()
+                            ]
+                            + [
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in inject_tracing_vars({}).items()
                             ],
-                            node_selector=self._kwargs.get("node_selector"),
-                            # TODO (savin): Support image_pull_secrets
-                            # image_pull_secrets=?,
-                            # TODO (savin): Support preemption policies
-                            # preemption_policy=?,
-                            #
-                            # A Container in a Pod may fail for a number of
-                            # reasons, such as because the process in it exited
-                            # with a non-zero exit code, or the Container was
-                            # killed due to OOM etc. If this happens, fail the pod
-                            # and let Metaflow handle the retries.
-                            restart_policy="Never",
-                            service_account_name=self._kwargs["service_account"],
-                            # Terminate the container immediately on SIGTERM
-                            termination_grace_period_seconds=0,
-                            tolerations=[
-                                client.V1Toleration(**toleration)
-                                for toleration in self._kwargs.get("tolerations") or []
+                            env_from=[
+                                client.V1EnvFromSource(
+                                    secret_ref=client.V1SecretEnvSource(
+                                        name=str(k),
+                                        # optional=True
+                                    )
+                                )
+                                for k in list(self._kwargs.get("secrets", []))
+                                + KUBERNETES_SECRETS.split(",")
+                                if k
                             ],
-                            volumes=(
+                            image=self._kwargs["image"],
+                            image_pull_policy=self._kwargs["image_pull_policy"],
+                            name=self._kwargs["step_name"].replace("_", "-"),
+                            resources=client.V1ResourceRequirements(
+                                requests=qos_requests,
+                                limits={
+                                    **qos_limits,
+                                    **{
+                                        "%s.com/gpu".lower()
+                                        % self._kwargs["gpu_vendor"]: str(
+                                            self._kwargs["gpu"]
+                                        )
+                                        for k in [0]
+                                        # Don't set GPU limits if gpu isn't specified.
+                                        if self._kwargs["gpu"] is not None
+                                    },
+                                },
+                            ),
+                            volume_mounts=(
                                 [
-                                    client.V1Volume(
+                                    client.V1VolumeMount(
+                                        mount_path=self._kwargs.get("tmpfs_path"),
                                         name="tmpfs-ephemeral-volume",
-                                        empty_dir=client.V1EmptyDirVolumeSource(
-                                            medium="Memory",
-                                            # Add default unit as ours differs from Kubernetes default.
-                                            size_limit="{}Mi".format(tmpfs_size),
-                                        ),
                                     )
                                 ]
                                 if tmpfs_enabled
@@ -610,24 +214,119 @@ class KubernetesJob(object):
                                 else []
                             )
                             + (
                                 [
-                                    client.V1Volume(
-                                        name=claim,
-                                        persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                                            claim_name=claim
-                                        ),
+                                    client.V1VolumeMount(
+                                        mount_path="/dev/shm", name="dhsm"
                                     )
-                                    for claim in self._kwargs[
+                                ]
+                                if shared_memory
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1VolumeMount(mount_path=path, name=claim)
+                                    for claim, path in self._kwargs[
                                         "persistent_volume_claims"
-                                    ].keys()
+                                    ].items()
                                 ]
                                 if self._kwargs["persistent_volume_claims"] is not None
                                 else []
                             ),
-                            # TODO (savin): Set termination_message_policy
-                        ),
+                            **_security_context,
+                        )
+                    ],
+                    node_selector=self._kwargs.get("node_selector"),
+                    image_pull_secrets=[
+                        client.V1LocalObjectReference(secret)
+                        for secret in self._kwargs.get("image_pull_secrets") or []
+                    ],
+                    # TODO (savin): Support preemption policies
+                    # preemption_policy=?,
+                    #
+                    # A Container in a Pod may fail for a number of
+                    # reasons, such as because the process in it exited
+                    # with a non-zero exit code, or the Container was
+                    # killed due to OOM etc. If this happens, fail the pod
+                    # and let Metaflow handle the retries.
+                    restart_policy="Never",
+                    service_account_name=self._kwargs["service_account"],
+                    # Terminate the container immediately on SIGTERM
+                    termination_grace_period_seconds=0,
+                    tolerations=[
+                        client.V1Toleration(**toleration)
+                        for toleration in self._kwargs.get("tolerations") or []
+                    ],
+                    volumes=(
+                        [
+                            client.V1Volume(
+                                name="tmpfs-ephemeral-volume",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    # Add default unit as ours differs from Kubernetes default.
+                                    size_limit="{}Mi".format(tmpfs_size),
+                                ),
+                            )
+                        ]
+                        if tmpfs_enabled
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name="dhsm",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    size_limit="{}Mi".format(shared_memory),
+                                ),
+                            )
+                        ]
+                        if shared_memory
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name=claim,
+                                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                    claim_name=claim
+                                ),
+                            )
+                            for claim in self._kwargs["persistent_volume_claims"].keys()
+                        ]
+                        if self._kwargs["persistent_volume_claims"] is not None
+                        else []
                     ),
                 ),
-        )
+            ),
+        )
+
+    def create(self):
+        # A discerning eye would notice and question the choice of using the
+        # V1Job construct over the V1Pod construct given that we don't rely much
+        # on any of the V1Job semantics. The major reasons at the moment are -
+        #     1. It makes the Kubernetes UIs (Octant, Lens) a bit easier on
+        #        the eyes, although even that can be questioned.
+        #     2. AWS Step Functions, at the moment (Apr' 22) only supports
+        #        executing Jobs and not Pods as part of it's publicly declared
+        #        API. When we ship the AWS Step Functions integration with EKS,
+        #        it will hopefully lessen our workload.
+        #
+        # Note: This implementation ensures that there is only one unique Pod
+        #       (unique UID) per Metaflow task attempt.
+        client = self._client.get()
+
+        self._job = client.V1Job(
+            api_version="batch/v1",
+            kind="Job",
+            metadata=client.V1ObjectMeta(
+                # Annotations are for humans
+                annotations=self._kwargs.get("annotations", {}),
+                # While labels are for Kubernetes
+                labels=self._kwargs.get("labels", {}),
+                generate_name=self._kwargs["generate_name"],
+                namespace=self._kwargs["namespace"],  # Defaults to `default`
+            ),
+            spec=self.create_job_spec(),
+        )
         return self
 
     def execute(self):
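qos_requests_and_limits is imported from the new kube_utils module (entry 200 in the file list); its body is not part of this diff. From the call site, a plausible contract is a (requests, limits) pair of Kubernetes resource dictionaries. The sketch below is a guess at that contract, not the shipped helper:

    def qos_requests_and_limits(qos, cpu, memory, disk):
        # Assumed shape, inferred from the call site above; the real helper lives in
        # metaflow/plugins/kubernetes/kube_utils.py and may differ.
        requests = {
            "cpu": str(cpu),
            "memory": "%sM" % memory,
            "ephemeral-storage": "%sM" % disk,
        }
        # Kubernetes grants the "Guaranteed" QoS class only when limits equal
        # requests, so that class would mirror requests into limits here.
        limits = dict(requests) if str(qos).lower() == "guaranteed" else {}
        return requests, limits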
@@ -638,53 +337,19 @@ class KubernetesJob(object):
             # achieve the guarantees that we are seeking.
             # https://github.com/kubernetes/enhancements/issues/1040
             # Hopefully, we will be able to get creative with kube-batch
-
-            if "num_parallel" in self._kwargs and self._kwargs["num_parallel"] >= 1:
-                # TODO (Eddie): this is kinda gross. fix it.
-                if self._kwargs["attrs"]["requires_passwordless_ssh"]:
-                    api_instance = client.CoreV1Api()
-                    api_response = api_instance.create_namespaced_service(namespace=self._kwargs['namespace'], body=self._passwordless_ssh_service)
-
-                with client.ApiClient() as api_client:
-                    api_instance = client.CustomObjectsApi(api_client)
-
-                    response = api_instance.create_namespaced_custom_object(
-                        body=self._jobset,
-                        group="jobset.x-k8s.io",
-                        version="v1alpha2",
-                        namespace=self._kwargs["namespace"],
-                        plural="jobsets",
-                    )
-
-                    # HACK: Give K8s some time to actually create the job
-                    time.sleep(10)
-
-                    # TODO (Eddie): Remove hack and make RunningJobSet.
-                    # There are many jobs running that should be monitored.
-                    job_name = "%s-control-0" % response["metadata"]["name"]
-                    fake_id = 123
-                    return RunningJob(
-                        client=self._client,
-                        name=job_name,
-                        uid=fake_id,
-                        namespace=response["metadata"]["namespace"],
-                    )
-
-            else:
-                response = (
-                    client.BatchV1Api()
-                    .create_namespaced_job(
-                        body=self._job, namespace=self._kwargs["namespace"]
-                    )
-                    .to_dict()
-                )
-                return RunningJob(
-                    client=self._client,
-                    name=response["metadata"]["name"],
-                    uid=response["metadata"]["uid"],
-                    namespace=response["metadata"]["namespace"],
+            response = (
+                client.BatchV1Api()
+                .create_namespaced_job(
+                    body=self._job, namespace=self._kwargs["namespace"]
                 )
-
+                .to_dict()
+            )
+            return RunningJob(
+                client=self._client,
+                name=response["metadata"]["name"],
+                uid=response["metadata"]["uid"],
+                namespace=response["metadata"]["namespace"],
+            )
         except client.rest.ApiException as e:
             raise KubernetesJobException(
                 "Unable to launch Kubernetes job.\n %s"
@@ -793,7 +458,7 @@ class RunningJob(object):
         def best_effort_kill():
             try:
                 self.kill()
-            except:
+            except Exception:
                 pass
 
         atexit.register(best_effort_kill)
@@ -861,7 +526,6 @@ class RunningJob(object):
         if self.is_running:
             # Case 1.
             from kubernetes.stream import stream
-
             api_instance = client.CoreV1Api
             try:
                 # TODO: stream opens a web-socket connection. It may
@@ -927,6 +591,10 @@ class RunningJob(object):
             return self.id
         return "job %s" % self._name
 
+    @property
+    def is_unschedulable(self):
+        return self._job["metadata"]["annotations"].get("metaflow/job_status", "") == "Unsatisfiable_Resource_Request"
+
     @property
     def is_done(self):
         # Check if the container is done. As a side effect, also refreshes self._job and
@@ -940,6 +608,7 @@ class RunningJob(object):
                 or bool(self._job["status"].get("failed"))
                 or self._are_pod_containers_done
                 or (self._job["spec"]["parallelism"] == 0)
+                or self.is_unschedulable
             )
 
         if not done():
@@ -997,6 +666,7 @@ class RunningJob(object):
                 bool(self._job["status"].get("failed"))
                 or self._has_any_container_failed
                 or (self._job["spec"]["parallelism"] == 0)
+                or self.is_unschedulable
             )
             return retval
 
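The new is_unschedulable property keys off the metaflow/job_status annotation, and the final hunk below reads metaflow/job_status_reason. Which component writes these annotations is outside this diff; a hedged sketch of setting them with the official Kubernetes Python client:

    from kubernetes import client, config

    config.load_kube_config()
    client.BatchV1Api().patch_namespaced_job(
        name="t-abc123",       # hypothetical job name
        namespace="default",
        body={
            "metadata": {
                "annotations": {
                    "metaflow/job_status": "Unsatisfiable_Resource_Request",
                    "metaflow/job_status_reason": "requested resources exceed node capacity",
                }
            }
        },
    )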
@@ -1094,6 +764,8 @@ class RunningJob(object):
                 return 0, None
             # Best effort since Pod object can disappear on us at anytime
             else:
+                if self.is_unschedulable:
+                    return 1, self._job["metadata"]["annotations"].get("metaflow/job_status_reason", "")
                 if self._pod.get("status", {}).get("phase") not in (
                     "Succeeded",
                     "Failed",