metaflow 2.11.15__py2.py3-none-any.whl → 2.12.0__py2.py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (73)
  1. metaflow/__init__.py +8 -0
  2. metaflow/_vendor/importlib_metadata/__init__.py +1063 -0
  3. metaflow/_vendor/importlib_metadata/_adapters.py +68 -0
  4. metaflow/_vendor/importlib_metadata/_collections.py +30 -0
  5. metaflow/_vendor/importlib_metadata/_compat.py +71 -0
  6. metaflow/_vendor/importlib_metadata/_functools.py +104 -0
  7. metaflow/_vendor/importlib_metadata/_itertools.py +73 -0
  8. metaflow/_vendor/importlib_metadata/_meta.py +48 -0
  9. metaflow/_vendor/importlib_metadata/_text.py +99 -0
  10. metaflow/_vendor/importlib_metadata/py.typed +0 -0
  11. metaflow/_vendor/typeguard/__init__.py +48 -0
  12. metaflow/_vendor/typeguard/_checkers.py +906 -0
  13. metaflow/_vendor/typeguard/_config.py +108 -0
  14. metaflow/_vendor/typeguard/_decorators.py +237 -0
  15. metaflow/_vendor/typeguard/_exceptions.py +42 -0
  16. metaflow/_vendor/typeguard/_functions.py +307 -0
  17. metaflow/_vendor/typeguard/_importhook.py +213 -0
  18. metaflow/_vendor/typeguard/_memo.py +48 -0
  19. metaflow/_vendor/typeguard/_pytest_plugin.py +100 -0
  20. metaflow/_vendor/typeguard/_suppression.py +88 -0
  21. metaflow/_vendor/typeguard/_transformer.py +1193 -0
  22. metaflow/_vendor/typeguard/_union_transformer.py +54 -0
  23. metaflow/_vendor/typeguard/_utils.py +169 -0
  24. metaflow/_vendor/typeguard/py.typed +0 -0
  25. metaflow/_vendor/typing_extensions.py +3053 -0
  26. metaflow/cli.py +48 -36
  27. metaflow/clone_util.py +6 -0
  28. metaflow/cmd/develop/stubs.py +2 -0
  29. metaflow/extension_support/__init__.py +2 -0
  30. metaflow/extension_support/plugins.py +2 -0
  31. metaflow/metaflow_config.py +24 -0
  32. metaflow/metaflow_environment.py +2 -2
  33. metaflow/parameters.py +1 -0
  34. metaflow/plugins/__init__.py +19 -0
  35. metaflow/plugins/airflow/airflow.py +7 -0
  36. metaflow/plugins/argo/argo_workflows.py +17 -0
  37. metaflow/plugins/aws/batch/batch_decorator.py +3 -3
  38. metaflow/plugins/azure/__init__.py +3 -0
  39. metaflow/plugins/azure/azure_credential.py +53 -0
  40. metaflow/plugins/azure/azure_exceptions.py +1 -1
  41. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
  42. metaflow/plugins/azure/azure_utils.py +2 -35
  43. metaflow/plugins/azure/blob_service_client_factory.py +4 -2
  44. metaflow/plugins/datastores/azure_storage.py +6 -6
  45. metaflow/plugins/datatools/s3/s3.py +1 -1
  46. metaflow/plugins/gcp/__init__.py +1 -0
  47. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +169 -0
  48. metaflow/plugins/gcp/gs_storage_client_factory.py +52 -1
  49. metaflow/plugins/kubernetes/kubernetes.py +85 -8
  50. metaflow/plugins/kubernetes/kubernetes_cli.py +24 -1
  51. metaflow/plugins/kubernetes/kubernetes_client.py +4 -1
  52. metaflow/plugins/kubernetes/kubernetes_decorator.py +49 -4
  53. metaflow/plugins/kubernetes/kubernetes_job.py +208 -206
  54. metaflow/plugins/kubernetes/kubernetes_jobsets.py +784 -0
  55. metaflow/plugins/timeout_decorator.py +2 -1
  56. metaflow/runner/__init__.py +0 -0
  57. metaflow/runner/click_api.py +406 -0
  58. metaflow/runner/metaflow_runner.py +452 -0
  59. metaflow/runner/nbrun.py +246 -0
  60. metaflow/runner/subprocess_manager.py +552 -0
  61. metaflow/task.py +1 -12
  62. metaflow/tuple_util.py +27 -0
  63. metaflow/util.py +0 -15
  64. metaflow/vendor.py +0 -1
  65. metaflow/version.py +1 -1
  66. {metaflow-2.11.15.dist-info → metaflow-2.12.0.dist-info}/METADATA +2 -2
  67. {metaflow-2.11.15.dist-info → metaflow-2.12.0.dist-info}/RECORD +72 -39
  68. metaflow/_vendor/v3_7/__init__.py +0 -1
  69. /metaflow/_vendor/{v3_7/zipp.py → zipp.py} +0 -0
  70. {metaflow-2.11.15.dist-info → metaflow-2.12.0.dist-info}/LICENSE +0 -0
  71. {metaflow-2.11.15.dist-info → metaflow-2.12.0.dist-info}/WHEEL +0 -0
  72. {metaflow-2.11.15.dist-info → metaflow-2.12.0.dist-info}/entry_points.txt +0 -0
  73. {metaflow-2.11.15.dist-info → metaflow-2.12.0.dist-info}/top_level.txt +0 -0
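
Of the 73 files, the most substantial additions are the vendored typeguard and typing_extensions packages, Kubernetes jobset support (kubernetes_jobsets.py, +784), and the new metaflow/runner package (metaflow_runner.py, click_api.py, nbrun.py, subprocess_manager.py), which introduces a programmatic API for launching flows. A minimal usage sketch of that API, assuming Runner is exported from the top-level metaflow package (an assumption based on the new runner/ files, not something this diff shows directly):

    from metaflow import Runner

    # Blocking execution of a flow file; returns a handle around the new Run.
    # "hello_flow.py" and its "alpha" parameter are made-up examples.
    result = Runner("hello_flow.py").run(alpha=5)
    print(result.status)  # e.g. "successful", "failed", or "running"

The hunks that follow are from metaflow/plugins/kubernetes/kubernetes_job.py (item 53 above); its +208/-206 counts match the additions and removals shown below.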
@@ -2,14 +2,17 @@ import json
 import math
 import random
 import time
-
+import copy
+import sys
 from metaflow.tracing import inject_tracing_vars
-
-
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS
+from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
 
 CLIENT_REFRESH_INTERVAL_SECONDS = 300
+from .kubernetes_jobsets import (
+    KubernetesJobSet,  # We need this import for Kubernetes Client.
+)
 
 
 class KubernetesJobException(MetaflowException):
@@ -58,6 +61,205 @@ class KubernetesJob(object):
         self._client = client
         self._kwargs = kwargs
 
+    def create_job_spec(self):
+        client = self._client.get()
+
+        # tmpfs variables
+        use_tmpfs = self._kwargs["use_tmpfs"]
+        tmpfs_size = self._kwargs["tmpfs_size"]
+        tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
+        shared_memory = (
+            int(self._kwargs["shared_memory"])
+            if self._kwargs["shared_memory"]
+            else None
+        )
+        return client.V1JobSpec(
+            # Retries are handled by Metaflow when it is responsible for
+            # executing the flow. The responsibility is moved to Kubernetes
+            # when Argo Workflows is responsible for the execution.
+            backoff_limit=self._kwargs.get("retries", 0),
+            completions=self._kwargs.get("completions", 1),
+            ttl_seconds_after_finished=7
+            * 60
+            * 60  # Remove job after a week. TODO: Make this configurable
+            * 24,
+            template=client.V1PodTemplateSpec(
+                metadata=client.V1ObjectMeta(
+                    annotations=self._kwargs.get("annotations", {}),
+                    labels=self._kwargs.get("labels", {}),
+                    namespace=self._kwargs["namespace"],
+                ),
+                spec=client.V1PodSpec(
+                    # Timeout is set on the pod and not the job (important!)
+                    active_deadline_seconds=self._kwargs["timeout_in_seconds"],
+                    # TODO (savin): Enable affinities for GPU scheduling.
+                    # affinity=?,
+                    containers=[
+                        client.V1Container(
+                            command=self._kwargs["command"],
+                            ports=[]
+                            if self._kwargs["port"] is None
+                            else [
+                                client.V1ContainerPort(
+                                    container_port=int(self._kwargs["port"])
+                                )
+                            ],
+                            env=[
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in self._kwargs.get(
+                                    "environment_variables", {}
+                                ).items()
+                            ]
+                            # And some downward API magic. Add (key, value)
+                            # pairs below to make pod metadata available
+                            # within Kubernetes container.
+                            + [
+                                client.V1EnvVar(
+                                    name=k,
+                                    value_from=client.V1EnvVarSource(
+                                        field_ref=client.V1ObjectFieldSelector(
+                                            field_path=str(v)
+                                        )
+                                    ),
+                                )
+                                for k, v in {
+                                    "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                    "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                    "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                    "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                }.items()
+                            ]
+                            + [
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in inject_tracing_vars({}).items()
+                            ],
+                            env_from=[
+                                client.V1EnvFromSource(
+                                    secret_ref=client.V1SecretEnvSource(
+                                        name=str(k),
+                                        # optional=True
+                                    )
+                                )
+                                for k in list(self._kwargs.get("secrets", []))
+                                + KUBERNETES_SECRETS.split(",")
+                                if k
+                            ],
+                            image=self._kwargs["image"],
+                            image_pull_policy=self._kwargs["image_pull_policy"],
+                            name=self._kwargs["step_name"].replace("_", "-"),
+                            resources=client.V1ResourceRequirements(
+                                requests={
+                                    "cpu": str(self._kwargs["cpu"]),
+                                    "memory": "%sM" % str(self._kwargs["memory"]),
+                                    "ephemeral-storage": "%sM"
+                                    % str(self._kwargs["disk"]),
+                                },
+                                limits={
+                                    "%s.com/gpu".lower()
+                                    % self._kwargs["gpu_vendor"]: str(
+                                        self._kwargs["gpu"]
+                                    )
+                                    for k in [0]
+                                    # Don't set GPU limits if gpu isn't specified.
+                                    if self._kwargs["gpu"] is not None
+                                },
+                            ),
+                            volume_mounts=(
+                                [
+                                    client.V1VolumeMount(
+                                        mount_path=self._kwargs.get("tmpfs_path"),
+                                        name="tmpfs-ephemeral-volume",
+                                    )
+                                ]
+                                if tmpfs_enabled
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1VolumeMount(
+                                        mount_path="/dev/shm", name="dhsm"
+                                    )
+                                ]
+                                if shared_memory
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1VolumeMount(mount_path=path, name=claim)
+                                    for claim, path in self._kwargs[
+                                        "persistent_volume_claims"
+                                    ].items()
+                                ]
+                                if self._kwargs["persistent_volume_claims"] is not None
+                                else []
+                            ),
+                        )
+                    ],
+                    node_selector=self._kwargs.get("node_selector"),
+                    # TODO (savin): Support image_pull_secrets
+                    # image_pull_secrets=?,
+                    # TODO (savin): Support preemption policies
+                    # preemption_policy=?,
+                    #
+                    # A Container in a Pod may fail for a number of
+                    # reasons, such as because the process in it exited
+                    # with a non-zero exit code, or the Container was
+                    # killed due to OOM etc. If this happens, fail the pod
+                    # and let Metaflow handle the retries.
+                    restart_policy="Never",
+                    service_account_name=self._kwargs["service_account"],
+                    # Terminate the container immediately on SIGTERM
+                    termination_grace_period_seconds=0,
+                    tolerations=[
+                        client.V1Toleration(**toleration)
+                        for toleration in self._kwargs.get("tolerations") or []
+                    ],
+                    volumes=(
+                        [
+                            client.V1Volume(
+                                name="tmpfs-ephemeral-volume",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    # Add default unit as ours differs from Kubernetes default.
+                                    size_limit="{}Mi".format(tmpfs_size),
+                                ),
+                            )
+                        ]
+                        if tmpfs_enabled
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name="dhsm",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    size_limit="{}Mi".format(shared_memory),
+                                ),
+                            )
+                        ]
+                        if shared_memory
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name=claim,
+                                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                    claim_name=claim
+                                ),
+                            )
+                            for claim in self._kwargs["persistent_volume_claims"].keys()
+                        ]
+                        if self._kwargs["persistent_volume_claims"] is not None
+                        else []
+                    ),
+                    # TODO (savin): Set termination_message_policy
+                ),
+            ),
+        )
+
     def create(self):
         # A discerning eye would notice and question the choice of using the
         # V1Job construct over the V1Pod construct given that we don't rely much
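
Two idioms in the new create_job_spec() above are easy to misread: the GPU limits dict is a comprehension over the one-element list [0], which acts as a conditional dict literal, and tmpfs_enabled's use_tmpfs or (tmpfs_size and not use_tmpfs) is truth-equivalent to a plain or. A standalone sketch with made-up values:

    # Conditional-dict idiom from the GPU limits: iterating a one-element
    # dummy list yields an empty dict unless the guard holds.
    gpu, gpu_vendor = "2", "nvidia"  # made-up example values
    limits = {
        "%s.com/gpu".lower() % gpu_vendor: str(gpu)
        for _ in [0]
        if gpu is not None
    }
    assert limits == {"nvidia.com/gpu": "2"}

    # The tmpfs guard has the same truth value as (use_tmpfs or tmpfs_size).
    for use_tmpfs in (True, False):
        for tmpfs_size in (0, 1024):
            old_form = use_tmpfs or (tmpfs_size and not use_tmpfs)
            assert bool(old_form) == bool(use_tmpfs or tmpfs_size)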
@@ -73,16 +275,6 @@ class KubernetesJob(object):
         # (unique UID) per Metaflow task attempt.
         client = self._client.get()
 
-        # tmpfs variables
-        use_tmpfs = self._kwargs["use_tmpfs"]
-        tmpfs_size = self._kwargs["tmpfs_size"]
-        tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
-        shared_memory = (
-            int(self._kwargs["shared_memory"])
-            if self._kwargs["shared_memory"]
-            else None
-        )
-
         self._job = client.V1Job(
             api_version="batch/v1",
             kind="Job",
@@ -94,197 +286,7 @@ class KubernetesJob(object):
                 generate_name=self._kwargs["generate_name"],
                 namespace=self._kwargs["namespace"],  # Defaults to `default`
             ),
-            spec=client.V1JobSpec(
-                # Retries are handled by Metaflow when it is responsible for
-                # executing the flow. The responsibility is moved to Kubernetes
-                # when Argo Workflows is responsible for the execution.
-                backoff_limit=self._kwargs.get("retries", 0),
-                completions=1,  # A single non-indexed pod job
-                ttl_seconds_after_finished=7
-                * 60
-                * 60  # Remove job after a week. TODO: Make this configurable
-                * 24,
-                template=client.V1PodTemplateSpec(
-                    metadata=client.V1ObjectMeta(
-                        annotations=self._kwargs.get("annotations", {}),
-                        labels=self._kwargs.get("labels", {}),
-                        namespace=self._kwargs["namespace"],
-                    ),
-                    spec=client.V1PodSpec(
-                        # Timeout is set on the pod and not the job (important!)
-                        active_deadline_seconds=self._kwargs["timeout_in_seconds"],
-                        # TODO (savin): Enable affinities for GPU scheduling.
-                        # affinity=?,
-                        containers=[
-                            client.V1Container(
-                                command=self._kwargs["command"],
-                                ports=[
-                                    client.V1ContainerPort(
-                                        container_port=int(self._kwargs["port"])
-                                    )
-                                ]
-                                if "port" in self._kwargs and self._kwargs["port"]
-                                else None,
-                                env=[
-                                    client.V1EnvVar(name=k, value=str(v))
-                                    for k, v in self._kwargs.get(
-                                        "environment_variables", {}
-                                    ).items()
-                                ]
-                                # And some downward API magic. Add (key, value)
-                                # pairs below to make pod metadata available
-                                # within Kubernetes container.
-                                + [
-                                    client.V1EnvVar(
-                                        name=k,
-                                        value_from=client.V1EnvVarSource(
-                                            field_ref=client.V1ObjectFieldSelector(
-                                                field_path=str(v)
-                                            )
-                                        ),
-                                    )
-                                    for k, v in {
-                                        "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-                                        "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-                                        "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-                                        "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-                                        "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-                                    }.items()
-                                ]
-                                + [
-                                    client.V1EnvVar(name=k, value=str(v))
-                                    for k, v in inject_tracing_vars({}).items()
-                                ],
-                                env_from=[
-                                    client.V1EnvFromSource(
-                                        secret_ref=client.V1SecretEnvSource(
-                                            name=str(k),
-                                            # optional=True
-                                        )
-                                    )
-                                    for k in list(self._kwargs.get("secrets", []))
-                                    + KUBERNETES_SECRETS.split(",")
-                                    if k
-                                ],
-                                image=self._kwargs["image"],
-                                image_pull_policy=self._kwargs["image_pull_policy"],
-                                name=self._kwargs["step_name"].replace("_", "-"),
-                                resources=client.V1ResourceRequirements(
-                                    requests={
-                                        "cpu": str(self._kwargs["cpu"]),
-                                        "memory": "%sM" % str(self._kwargs["memory"]),
-                                        "ephemeral-storage": "%sM"
-                                        % str(self._kwargs["disk"]),
-                                    },
-                                    limits={
-                                        "%s.com/gpu".lower()
-                                        % self._kwargs["gpu_vendor"]: str(
-                                            self._kwargs["gpu"]
-                                        )
-                                        for k in [0]
-                                        # Don't set GPU limits if gpu isn't specified.
-                                        if self._kwargs["gpu"] is not None
-                                    },
-                                ),
-                                volume_mounts=(
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=self._kwargs.get("tmpfs_path"),
-                                            name="tmpfs-ephemeral-volume",
-                                        )
-                                    ]
-                                    if tmpfs_enabled
-                                    else []
-                                )
-                                + (
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path="/dev/shm", name="dhsm"
-                                        )
-                                    ]
-                                    if shared_memory
-                                    else []
-                                )
-                                + (
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=path, name=claim
-                                        )
-                                        for claim, path in self._kwargs[
-                                            "persistent_volume_claims"
-                                        ].items()
-                                    ]
-                                    if self._kwargs["persistent_volume_claims"]
-                                    is not None
-                                    else []
-                                ),
-                            )
-                        ],
-                        node_selector=self._kwargs.get("node_selector"),
-                        # TODO (savin): Support image_pull_secrets
-                        # image_pull_secrets=?,
-                        # TODO (savin): Support preemption policies
-                        # preemption_policy=?,
-                        #
-                        # A Container in a Pod may fail for a number of
-                        # reasons, such as because the process in it exited
-                        # with a non-zero exit code, or the Container was
-                        # killed due to OOM etc. If this happens, fail the pod
-                        # and let Metaflow handle the retries.
-                        restart_policy="Never",
-                        service_account_name=self._kwargs["service_account"],
-                        # Terminate the container immediately on SIGTERM
-                        termination_grace_period_seconds=0,
-                        tolerations=[
-                            client.V1Toleration(**toleration)
-                            for toleration in self._kwargs.get("tolerations") or []
-                        ],
-                        volumes=(
-                            [
-                                client.V1Volume(
-                                    name="tmpfs-ephemeral-volume",
-                                    empty_dir=client.V1EmptyDirVolumeSource(
-                                        medium="Memory",
-                                        # Add default unit as ours differs from Kubernetes default.
-                                        size_limit="{}Mi".format(tmpfs_size),
-                                    ),
-                                )
-                            ]
-                            if tmpfs_enabled
-                            else []
-                        )
-                        + (
-                            [
-                                client.V1Volume(
-                                    name="dhsm",
-                                    empty_dir=client.V1EmptyDirVolumeSource(
-                                        medium="Memory",
-                                        size_limit="{}Mi".format(shared_memory),
-                                    ),
-                                )
-                            ]
-                            if shared_memory
-                            else []
-                        )
-                        + (
-                            [
-                                client.V1Volume(
-                                    name=claim,
-                                    persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                                        claim_name=claim
-                                    ),
-                                )
-                                for claim in self._kwargs[
-                                    "persistent_volume_claims"
-                                ].keys()
-                            ]
-                            if self._kwargs["persistent_volume_claims"] is not None
-                            else []
-                        ),
-                        # TODO (savin): Set termination_message_policy
-                    ),
-                ),
-            ),
+            spec=self.create_job_spec(),
         )
         return self
 
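Net effect of the three hunks above: the roughly 190-line V1JobSpec expression moves out of create() into the new create_job_spec(), and create() now only wraps the returned spec in a V1Job, presumably so the spec can also be reused by the new kubernetes_jobsets module added in this release. A sketch of the resulting shape (build_job is a hypothetical illustration, not a function in the module):

    # Hypothetical illustration of the refactored call path.
    def build_job(job):  # "job" stands in for a configured KubernetesJob
        client = job._client.get()
        return client.V1Job(
            api_version="batch/v1",
            kind="Job",
            spec=job.create_job_spec(),  # the factored-out, reusable spec
        )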
@@ -418,7 +420,7 @@ class RunningJob(object):
         def best_effort_kill():
             try:
                 self.kill()
-            except:
+            except Exception as ex:
                 pass
 
         atexit.register(best_effort_kill)
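
The except change above is behavioral, not just stylistic: a bare except also traps BaseException subclasses such as SystemExit and KeyboardInterrupt, which a best-effort cleanup hook should generally let propagate. A minimal standalone sketch of the pattern (FakeJob is a stand-in for the real job handle):

    import atexit

    class FakeJob:  # stand-in; the real kill() may raise on API errors
        def kill(self):
            raise RuntimeError("API server unreachable")

    job = FakeJob()

    def best_effort_kill():
        try:
            job.kill()
        except Exception:  # unlike a bare "except:", lets SystemExit et al. through
            pass

    atexit.register(best_effort_kill)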
@@ -482,9 +484,9 @@ class RunningJob(object):
         # 3. If the pod object hasn't shown up yet, we set the parallelism to 0
         # to preempt it.
         client = self._client.get()
+
         if not self.is_done:
             if self.is_running:
-
                 # Case 1.
                 from kubernetes.stream import stream