metaflow-2.11.14-py2.py3-none-any.whl → metaflow-2.11.16-py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. metaflow/__init__.py +3 -0
  2. metaflow/cli.py +0 -120
  3. metaflow/clone_util.py +6 -0
  4. metaflow/datastore/datastore_set.py +1 -1
  5. metaflow/datastore/flow_datastore.py +32 -6
  6. metaflow/datastore/task_datastore.py +50 -0
  7. metaflow/extension_support/plugins.py +2 -0
  8. metaflow/metaflow_config.py +24 -0
  9. metaflow/metaflow_environment.py +2 -2
  10. metaflow/plugins/__init__.py +20 -0
  11. metaflow/plugins/airflow/airflow.py +7 -0
  12. metaflow/plugins/argo/argo_workflows.py +17 -0
  13. metaflow/plugins/aws/batch/batch_cli.py +6 -4
  14. metaflow/plugins/azure/__init__.py +3 -0
  15. metaflow/plugins/azure/azure_credential.py +53 -0
  16. metaflow/plugins/azure/azure_exceptions.py +1 -1
  17. metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
  18. metaflow/plugins/azure/azure_utils.py +2 -35
  19. metaflow/plugins/azure/blob_service_client_factory.py +4 -2
  20. metaflow/plugins/datastores/azure_storage.py +6 -6
  21. metaflow/plugins/datatools/s3/s3.py +9 -9
  22. metaflow/plugins/gcp/__init__.py +1 -0
  23. metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +169 -0
  24. metaflow/plugins/gcp/gs_storage_client_factory.py +52 -1
  25. metaflow/plugins/kubernetes/kubernetes.py +85 -8
  26. metaflow/plugins/kubernetes/kubernetes_cli.py +24 -1
  27. metaflow/plugins/kubernetes/kubernetes_client.py +4 -1
  28. metaflow/plugins/kubernetes/kubernetes_decorator.py +49 -4
  29. metaflow/plugins/kubernetes/kubernetes_job.py +208 -201
  30. metaflow/plugins/kubernetes/kubernetes_jobsets.py +784 -0
  31. metaflow/plugins/logs_cli.py +358 -0
  32. metaflow/plugins/timeout_decorator.py +2 -1
  33. metaflow/task.py +1 -12
  34. metaflow/tuple_util.py +27 -0
  35. metaflow/util.py +0 -15
  36. metaflow/version.py +1 -1
  37. {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/METADATA +2 -2
  38. {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/RECORD +42 -36
  39. {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/LICENSE +0 -0
  40. {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/WHEEL +0 -0
  41. {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/entry_points.txt +0 -0
  42. {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/top_level.txt +0 -0
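To reproduce the file-level summary above locally, here is a minimal sketch (not part of the released package). It assumes both wheels have already been downloaded next to the script, e.g. with `pip download metaflow==2.11.14 --no-deps` and `pip download metaflow==2.11.16 --no-deps`; the wheel filenames below are assumptions and may need adjusting.

import zipfile

OLD_WHEEL = "metaflow-2.11.14-py2.py3-none-any.whl"  # assumed local path
NEW_WHEEL = "metaflow-2.11.16-py2.py3-none-any.whl"  # assumed local path


def members(path):
    # A wheel is a zip archive; map each member name to its CRC so changed
    # files can be told apart from untouched ones.
    with zipfile.ZipFile(path) as zf:
        return {info.filename: info.CRC for info in zf.infolist()}


old, new = members(OLD_WHEEL), members(NEW_WHEEL)
added = sorted(set(new) - set(old))
removed = sorted(set(old) - set(new))
changed = sorted(f for f in set(old) & set(new) if old[f] != new[f])

print("added:", *added, sep="\n  ")
print("removed:", *removed, sep="\n  ")
print("changed:", *changed, sep="\n  ")

Note that comparing CRCs only flags files whose bytes changed; it does not compute the per-file +/- line counts shown above.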
metaflow/plugins/kubernetes/kubernetes_job.py
@@ -2,14 +2,17 @@ import json
 import math
 import random
 import time
-
+import copy
+import sys
 from metaflow.tracing import inject_tracing_vars
-
-
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS
+from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
 
 CLIENT_REFRESH_INTERVAL_SECONDS = 300
+from .kubernetes_jobsets import (
+    KubernetesJobSet,  # We need this import for Kubernetes Client.
+)
 
 
 class KubernetesJobException(MetaflowException):
@@ -58,6 +61,205 @@ class KubernetesJob(object):
         self._client = client
         self._kwargs = kwargs
 
+    def create_job_spec(self):
+        client = self._client.get()
+
+        # tmpfs variables
+        use_tmpfs = self._kwargs["use_tmpfs"]
+        tmpfs_size = self._kwargs["tmpfs_size"]
+        tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
+        shared_memory = (
+            int(self._kwargs["shared_memory"])
+            if self._kwargs["shared_memory"]
+            else None
+        )
+        return client.V1JobSpec(
+            # Retries are handled by Metaflow when it is responsible for
+            # executing the flow. The responsibility is moved to Kubernetes
+            # when Argo Workflows is responsible for the execution.
+            backoff_limit=self._kwargs.get("retries", 0),
+            completions=self._kwargs.get("completions", 1),
+            ttl_seconds_after_finished=7
+            * 60
+            * 60  # Remove job after a week. TODO: Make this configurable
+            * 24,
+            template=client.V1PodTemplateSpec(
+                metadata=client.V1ObjectMeta(
+                    annotations=self._kwargs.get("annotations", {}),
+                    labels=self._kwargs.get("labels", {}),
+                    namespace=self._kwargs["namespace"],
+                ),
+                spec=client.V1PodSpec(
+                    # Timeout is set on the pod and not the job (important!)
+                    active_deadline_seconds=self._kwargs["timeout_in_seconds"],
+                    # TODO (savin): Enable affinities for GPU scheduling.
+                    # affinity=?,
+                    containers=[
+                        client.V1Container(
+                            command=self._kwargs["command"],
+                            ports=[]
+                            if self._kwargs["port"] is None
+                            else [
+                                client.V1ContainerPort(
+                                    container_port=int(self._kwargs["port"])
+                                )
+                            ],
+                            env=[
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in self._kwargs.get(
+                                    "environment_variables", {}
+                                ).items()
+                            ]
+                            # And some downward API magic. Add (key, value)
+                            # pairs below to make pod metadata available
+                            # within Kubernetes container.
+                            + [
+                                client.V1EnvVar(
+                                    name=k,
+                                    value_from=client.V1EnvVarSource(
+                                        field_ref=client.V1ObjectFieldSelector(
+                                            field_path=str(v)
+                                        )
+                                    ),
+                                )
+                                for k, v in {
+                                    "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                    "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                    "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                    "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                }.items()
+                            ]
+                            + [
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in inject_tracing_vars({}).items()
+                            ],
+                            env_from=[
+                                client.V1EnvFromSource(
+                                    secret_ref=client.V1SecretEnvSource(
+                                        name=str(k),
+                                        # optional=True
+                                    )
+                                )
+                                for k in list(self._kwargs.get("secrets", []))
+                                + KUBERNETES_SECRETS.split(",")
+                                if k
+                            ],
+                            image=self._kwargs["image"],
+                            image_pull_policy=self._kwargs["image_pull_policy"],
+                            name=self._kwargs["step_name"].replace("_", "-"),
+                            resources=client.V1ResourceRequirements(
+                                requests={
+                                    "cpu": str(self._kwargs["cpu"]),
+                                    "memory": "%sM" % str(self._kwargs["memory"]),
+                                    "ephemeral-storage": "%sM"
+                                    % str(self._kwargs["disk"]),
+                                },
+                                limits={
+                                    "%s.com/gpu".lower()
+                                    % self._kwargs["gpu_vendor"]: str(
+                                        self._kwargs["gpu"]
+                                    )
+                                    for k in [0]
+                                    # Don't set GPU limits if gpu isn't specified.
+                                    if self._kwargs["gpu"] is not None
+                                },
+                            ),
+                            volume_mounts=(
+                                [
+                                    client.V1VolumeMount(
+                                        mount_path=self._kwargs.get("tmpfs_path"),
+                                        name="tmpfs-ephemeral-volume",
+                                    )
+                                ]
+                                if tmpfs_enabled
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1VolumeMount(
+                                        mount_path="/dev/shm", name="dhsm"
+                                    )
+                                ]
+                                if shared_memory
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1VolumeMount(mount_path=path, name=claim)
+                                    for claim, path in self._kwargs[
+                                        "persistent_volume_claims"
+                                    ].items()
+                                ]
+                                if self._kwargs["persistent_volume_claims"] is not None
+                                else []
+                            ),
+                        )
+                    ],
+                    node_selector=self._kwargs.get("node_selector"),
+                    # TODO (savin): Support image_pull_secrets
+                    # image_pull_secrets=?,
+                    # TODO (savin): Support preemption policies
+                    # preemption_policy=?,
+                    #
+                    # A Container in a Pod may fail for a number of
+                    # reasons, such as because the process in it exited
+                    # with a non-zero exit code, or the Container was
+                    # killed due to OOM etc. If this happens, fail the pod
+                    # and let Metaflow handle the retries.
+                    restart_policy="Never",
+                    service_account_name=self._kwargs["service_account"],
+                    # Terminate the container immediately on SIGTERM
+                    termination_grace_period_seconds=0,
+                    tolerations=[
+                        client.V1Toleration(**toleration)
+                        for toleration in self._kwargs.get("tolerations") or []
+                    ],
+                    volumes=(
+                        [
+                            client.V1Volume(
+                                name="tmpfs-ephemeral-volume",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    # Add default unit as ours differs from Kubernetes default.
+                                    size_limit="{}Mi".format(tmpfs_size),
+                                ),
+                            )
+                        ]
+                        if tmpfs_enabled
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name="dhsm",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    size_limit="{}Mi".format(shared_memory),
+                                ),
+                            )
+                        ]
+                        if shared_memory
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name=claim,
+                                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                    claim_name=claim
+                                ),
+                            )
+                            for claim in self._kwargs["persistent_volume_claims"].keys()
+                        ]
+                        if self._kwargs["persistent_volume_claims"] is not None
+                        else []
+                    ),
+                    # TODO (savin): Set termination_message_policy
+                ),
+            ),
+        )
+
     def create(self):
         # A discerning eye would notice and question the choice of using the
         # V1Job construct over the V1Pod construct given that we don't rely much
@@ -77,11 +279,6 @@ class KubernetesJob(object):
         use_tmpfs = self._kwargs["use_tmpfs"]
         tmpfs_size = self._kwargs["tmpfs_size"]
         tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
-        shared_memory = (
-            int(self._kwargs["shared_memory"])
-            if self._kwargs["shared_memory"]
-            else None
-        )
 
         self._job = client.V1Job(
             api_version="batch/v1",
@@ -94,197 +291,7 @@ class KubernetesJob(object):
                 generate_name=self._kwargs["generate_name"],
                 namespace=self._kwargs["namespace"],  # Defaults to `default`
             ),
-            spec=client.V1JobSpec(
-                # Retries are handled by Metaflow when it is responsible for
-                # executing the flow. The responsibility is moved to Kubernetes
-                # when Argo Workflows is responsible for the execution.
-                backoff_limit=self._kwargs.get("retries", 0),
-                completions=1,  # A single non-indexed pod job
-                ttl_seconds_after_finished=7
-                * 60
-                * 60  # Remove job after a week. TODO: Make this configurable
-                * 24,
-                template=client.V1PodTemplateSpec(
-                    metadata=client.V1ObjectMeta(
-                        annotations=self._kwargs.get("annotations", {}),
-                        labels=self._kwargs.get("labels", {}),
-                        namespace=self._kwargs["namespace"],
-                    ),
-                    spec=client.V1PodSpec(
-                        # Timeout is set on the pod and not the job (important!)
-                        active_deadline_seconds=self._kwargs["timeout_in_seconds"],
-                        # TODO (savin): Enable affinities for GPU scheduling.
-                        # affinity=?,
-                        containers=[
-                            client.V1Container(
-                                command=self._kwargs["command"],
-                                ports=[
-                                    client.V1ContainerPort(
-                                        container_port=int(self._kwargs["port"])
-                                    )
-                                ]
-                                if "port" in self._kwargs and self._kwargs["port"]
-                                else None,
-                                env=[
-                                    client.V1EnvVar(name=k, value=str(v))
-                                    for k, v in self._kwargs.get(
-                                        "environment_variables", {}
-                                    ).items()
-                                ]
-                                # And some downward API magic. Add (key, value)
-                                # pairs below to make pod metadata available
-                                # within Kubernetes container.
-                                + [
-                                    client.V1EnvVar(
-                                        name=k,
-                                        value_from=client.V1EnvVarSource(
-                                            field_ref=client.V1ObjectFieldSelector(
-                                                field_path=str(v)
-                                            )
-                                        ),
-                                    )
-                                    for k, v in {
-                                        "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-                                        "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-                                        "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-                                        "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-                                        "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-                                    }.items()
-                                ]
-                                + [
-                                    client.V1EnvVar(name=k, value=str(v))
-                                    for k, v in inject_tracing_vars({}).items()
-                                ],
-                                env_from=[
-                                    client.V1EnvFromSource(
-                                        secret_ref=client.V1SecretEnvSource(
-                                            name=str(k),
-                                            # optional=True
-                                        )
-                                    )
-                                    for k in list(self._kwargs.get("secrets", []))
-                                    + KUBERNETES_SECRETS.split(",")
-                                    if k
-                                ],
-                                image=self._kwargs["image"],
-                                image_pull_policy=self._kwargs["image_pull_policy"],
-                                name=self._kwargs["step_name"].replace("_", "-"),
-                                resources=client.V1ResourceRequirements(
-                                    requests={
-                                        "cpu": str(self._kwargs["cpu"]),
-                                        "memory": "%sM" % str(self._kwargs["memory"]),
-                                        "ephemeral-storage": "%sM"
-                                        % str(self._kwargs["disk"]),
-                                    },
-                                    limits={
-                                        "%s.com/gpu".lower()
-                                        % self._kwargs["gpu_vendor"]: str(
-                                            self._kwargs["gpu"]
-                                        )
-                                        for k in [0]
-                                        # Don't set GPU limits if gpu isn't specified.
-                                        if self._kwargs["gpu"] is not None
-                                    },
-                                ),
-                                volume_mounts=(
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=self._kwargs.get("tmpfs_path"),
-                                            name="tmpfs-ephemeral-volume",
-                                        )
-                                    ]
-                                    if tmpfs_enabled
-                                    else []
-                                )
-                                + (
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path="/dev/shm", name="dhsm"
-                                        )
-                                    ]
-                                    if shared_memory
-                                    else []
-                                )
-                                + (
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=path, name=claim
-                                        )
-                                        for claim, path in self._kwargs[
-                                            "persistent_volume_claims"
-                                        ].items()
-                                    ]
-                                    if self._kwargs["persistent_volume_claims"]
-                                    is not None
-                                    else []
-                                ),
-                            )
-                        ],
-                        node_selector=self._kwargs.get("node_selector"),
-                        # TODO (savin): Support image_pull_secrets
-                        # image_pull_secrets=?,
-                        # TODO (savin): Support preemption policies
-                        # preemption_policy=?,
-                        #
-                        # A Container in a Pod may fail for a number of
-                        # reasons, such as because the process in it exited
-                        # with a non-zero exit code, or the Container was
-                        # killed due to OOM etc. If this happens, fail the pod
-                        # and let Metaflow handle the retries.
-                        restart_policy="Never",
-                        service_account_name=self._kwargs["service_account"],
-                        # Terminate the container immediately on SIGTERM
-                        termination_grace_period_seconds=0,
-                        tolerations=[
-                            client.V1Toleration(**toleration)
-                            for toleration in self._kwargs.get("tolerations") or []
-                        ],
-                        volumes=(
-                            [
-                                client.V1Volume(
-                                    name="tmpfs-ephemeral-volume",
-                                    empty_dir=client.V1EmptyDirVolumeSource(
-                                        medium="Memory",
-                                        # Add default unit as ours differs from Kubernetes default.
-                                        size_limit="{}Mi".format(tmpfs_size),
-                                    ),
-                                )
-                            ]
-                            if tmpfs_enabled
-                            else []
-                        )
-                        + (
-                            [
-                                client.V1Volume(
-                                    name="dhsm",
-                                    empty_dir=client.V1EmptyDirVolumeSource(
-                                        medium="Memory",
-                                        size_limit="{}Mi".format(shared_memory),
-                                    ),
-                                )
-                            ]
-                            if shared_memory
-                            else []
-                        )
-                        + (
-                            [
-                                client.V1Volume(
-                                    name=claim,
-                                    persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                                        claim_name=claim
-                                    ),
-                                )
-                                for claim in self._kwargs[
-                                    "persistent_volume_claims"
-                                ].keys()
-                            ]
-                            if self._kwargs["persistent_volume_claims"] is not None
-                            else []
-                        ),
-                        # TODO (savin): Set termination_message_policy
-                    ),
-                ),
-            ),
+            spec=self.create_job_spec(),
         )
         return self
 
@@ -418,7 +425,7 @@ class RunningJob(object):
         def best_effort_kill():
             try:
                 self.kill()
-            except:
+            except Exception as ex:
                 pass
 
         atexit.register(best_effort_kill)
@@ -482,9 +489,9 @@ class RunningJob(object):
         # 3. If the pod object hasn't shown up yet, we set the parallelism to 0
         # to preempt it.
         client = self._client.get()
+
         if not self.is_done:
             if self.is_running:
-
                 # Case 1.
                 from kubernetes.stream import stream
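The hunks above cover metaflow/plugins/kubernetes/kubernetes_job.py. To regenerate the full unified diff for this file (or any other file in the list) locally, here is a minimal sketch along the same lines as the earlier one; it again assumes the two wheels have been downloaded and that the filenames below match.

import difflib
import zipfile

OLD_WHEEL = "metaflow-2.11.14-py2.py3-none-any.whl"  # assumed local path
NEW_WHEEL = "metaflow-2.11.16-py2.py3-none-any.whl"  # assumed local path
TARGET = "metaflow/plugins/kubernetes/kubernetes_job.py"


def read_lines(wheel_path, member):
    # Wheels are zip archives, so a module's source can be read directly
    # from the archive without installing anything.
    with zipfile.ZipFile(wheel_path) as zf:
        return zf.read(member).decode("utf-8").splitlines(keepends=True)


diff = difflib.unified_diff(
    read_lines(OLD_WHEEL, TARGET),
    read_lines(NEW_WHEEL, TARGET),
    fromfile="2.11.14/" + TARGET,
    tofile="2.11.16/" + TARGET,
    n=3,  # three lines of context, matching the hunks shown above
)
print("".join(diff), end="")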