metaflow 2.11.14__py2.py3-none-any.whl → 2.11.16__py2.py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in a supported public registry. It is provided for informational purposes only.
- metaflow/__init__.py +3 -0
- metaflow/cli.py +0 -120
- metaflow/clone_util.py +6 -0
- metaflow/datastore/datastore_set.py +1 -1
- metaflow/datastore/flow_datastore.py +32 -6
- metaflow/datastore/task_datastore.py +50 -0
- metaflow/extension_support/plugins.py +2 -0
- metaflow/metaflow_config.py +24 -0
- metaflow/metaflow_environment.py +2 -2
- metaflow/plugins/__init__.py +20 -0
- metaflow/plugins/airflow/airflow.py +7 -0
- metaflow/plugins/argo/argo_workflows.py +17 -0
- metaflow/plugins/aws/batch/batch_cli.py +6 -4
- metaflow/plugins/azure/__init__.py +3 -0
- metaflow/plugins/azure/azure_credential.py +53 -0
- metaflow/plugins/azure/azure_exceptions.py +1 -1
- metaflow/plugins/azure/azure_secret_manager_secrets_provider.py +240 -0
- metaflow/plugins/azure/azure_utils.py +2 -35
- metaflow/plugins/azure/blob_service_client_factory.py +4 -2
- metaflow/plugins/datastores/azure_storage.py +6 -6
- metaflow/plugins/datatools/s3/s3.py +9 -9
- metaflow/plugins/gcp/__init__.py +1 -0
- metaflow/plugins/gcp/gcp_secret_manager_secrets_provider.py +169 -0
- metaflow/plugins/gcp/gs_storage_client_factory.py +52 -1
- metaflow/plugins/kubernetes/kubernetes.py +85 -8
- metaflow/plugins/kubernetes/kubernetes_cli.py +24 -1
- metaflow/plugins/kubernetes/kubernetes_client.py +4 -1
- metaflow/plugins/kubernetes/kubernetes_decorator.py +49 -4
- metaflow/plugins/kubernetes/kubernetes_job.py +208 -201
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +784 -0
- metaflow/plugins/logs_cli.py +358 -0
- metaflow/plugins/timeout_decorator.py +2 -1
- metaflow/task.py +1 -12
- metaflow/tuple_util.py +27 -0
- metaflow/util.py +0 -15
- metaflow/version.py +1 -1
- {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/METADATA +2 -2
- {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/RECORD +42 -36
- {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/LICENSE +0 -0
- {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/WHEEL +0 -0
- {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/entry_points.txt +0 -0
- {metaflow-2.11.14.dist-info → metaflow-2.11.16.dist-info}/top_level.txt +0 -0
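The largest change in the list above is `metaflow/plugins/kubernetes/kubernetes_job.py` (+208 −201), and the hunks reproduced below all come from that file: the inline `V1JobSpec` that `KubernetesJob.create()` used to build is extracted into a new `create_job_spec()` method, next to the newly added `kubernetes_jobsets.py` module. The sketch below is a minimal, self-contained illustration of that refactor pattern only; `FakeJob` and the dict-based "spec" are stand-ins, not Metaflow's real classes.

```python
# Minimal sketch of the refactor shown in the kubernetes_job.py hunks below.
# FakeJob and the plain dicts are hypothetical stand-ins, not Metaflow code.
class FakeJob:
    def __init__(self, **kwargs):
        self._kwargs = kwargs

    def create_job_spec(self):
        # Previously this body lived inline inside create(); extracting it
        # makes the spec construction reusable and testable on its own.
        return {
            "backoff_limit": self._kwargs.get("retries", 0),
            "completions": self._kwargs.get("completions", 1),
        }

    def create(self):
        # create() now only assembles the Job wrapper and delegates the spec.
        self._job = {
            "api_version": "batch/v1",
            "kind": "Job",
            "spec": self.create_job_spec(),
        }
        return self


job = FakeJob(retries=2).create()
assert job._job["spec"] == {"backoff_limit": 2, "completions": 1}
```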
@@ -2,14 +2,17 @@ import json
 import math
 import random
 import time
-
+import copy
+import sys
 from metaflow.tracing import inject_tracing_vars
-
-
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_SECRETS
+from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK

 CLIENT_REFRESH_INTERVAL_SECONDS = 300
+from .kubernetes_jobsets import (
+    KubernetesJobSet,  # We need this import for Kubernetes Client.
+)


 class KubernetesJobException(MetaflowException):
@@ -58,6 +61,205 @@ class KubernetesJob(object):
         self._client = client
         self._kwargs = kwargs

+    def create_job_spec(self):
+        client = self._client.get()
+
+        # tmpfs variables
+        use_tmpfs = self._kwargs["use_tmpfs"]
+        tmpfs_size = self._kwargs["tmpfs_size"]
+        tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
+        shared_memory = (
+            int(self._kwargs["shared_memory"])
+            if self._kwargs["shared_memory"]
+            else None
+        )
+        return client.V1JobSpec(
+            # Retries are handled by Metaflow when it is responsible for
+            # executing the flow. The responsibility is moved to Kubernetes
+            # when Argo Workflows is responsible for the execution.
+            backoff_limit=self._kwargs.get("retries", 0),
+            completions=self._kwargs.get("completions", 1),
+            ttl_seconds_after_finished=7
+            * 60
+            * 60  # Remove job after a week. TODO: Make this configurable
+            * 24,
+            template=client.V1PodTemplateSpec(
+                metadata=client.V1ObjectMeta(
+                    annotations=self._kwargs.get("annotations", {}),
+                    labels=self._kwargs.get("labels", {}),
+                    namespace=self._kwargs["namespace"],
+                ),
+                spec=client.V1PodSpec(
+                    # Timeout is set on the pod and not the job (important!)
+                    active_deadline_seconds=self._kwargs["timeout_in_seconds"],
+                    # TODO (savin): Enable affinities for GPU scheduling.
+                    # affinity=?,
+                    containers=[
+                        client.V1Container(
+                            command=self._kwargs["command"],
+                            ports=[]
+                            if self._kwargs["port"] is None
+                            else [
+                                client.V1ContainerPort(
+                                    container_port=int(self._kwargs["port"])
+                                )
+                            ],
+                            env=[
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in self._kwargs.get(
+                                    "environment_variables", {}
+                                ).items()
+                            ]
+                            # And some downward API magic. Add (key, value)
+                            # pairs below to make pod metadata available
+                            # within Kubernetes container.
+                            + [
+                                client.V1EnvVar(
+                                    name=k,
+                                    value_from=client.V1EnvVarSource(
+                                        field_ref=client.V1ObjectFieldSelector(
+                                            field_path=str(v)
+                                        )
+                                    ),
+                                )
+                                for k, v in {
+                                    "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
+                                    "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
+                                    "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
+                                    "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
+                                    "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
+                                }.items()
+                            ]
+                            + [
+                                client.V1EnvVar(name=k, value=str(v))
+                                for k, v in inject_tracing_vars({}).items()
+                            ],
+                            env_from=[
+                                client.V1EnvFromSource(
+                                    secret_ref=client.V1SecretEnvSource(
+                                        name=str(k),
+                                        # optional=True
+                                    )
+                                )
+                                for k in list(self._kwargs.get("secrets", []))
+                                + KUBERNETES_SECRETS.split(",")
+                                if k
+                            ],
+                            image=self._kwargs["image"],
+                            image_pull_policy=self._kwargs["image_pull_policy"],
+                            name=self._kwargs["step_name"].replace("_", "-"),
+                            resources=client.V1ResourceRequirements(
+                                requests={
+                                    "cpu": str(self._kwargs["cpu"]),
+                                    "memory": "%sM" % str(self._kwargs["memory"]),
+                                    "ephemeral-storage": "%sM"
+                                    % str(self._kwargs["disk"]),
+                                },
+                                limits={
+                                    "%s.com/gpu".lower()
+                                    % self._kwargs["gpu_vendor"]: str(
+                                        self._kwargs["gpu"]
+                                    )
+                                    for k in [0]
+                                    # Don't set GPU limits if gpu isn't specified.
+                                    if self._kwargs["gpu"] is not None
+                                },
+                            ),
+                            volume_mounts=(
+                                [
+                                    client.V1VolumeMount(
+                                        mount_path=self._kwargs.get("tmpfs_path"),
+                                        name="tmpfs-ephemeral-volume",
+                                    )
+                                ]
+                                if tmpfs_enabled
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1VolumeMount(
+                                        mount_path="/dev/shm", name="dhsm"
+                                    )
+                                ]
+                                if shared_memory
+                                else []
+                            )
+                            + (
+                                [
+                                    client.V1VolumeMount(mount_path=path, name=claim)
+                                    for claim, path in self._kwargs[
+                                        "persistent_volume_claims"
+                                    ].items()
+                                ]
+                                if self._kwargs["persistent_volume_claims"] is not None
+                                else []
+                            ),
+                        )
+                    ],
+                    node_selector=self._kwargs.get("node_selector"),
+                    # TODO (savin): Support image_pull_secrets
+                    # image_pull_secrets=?,
+                    # TODO (savin): Support preemption policies
+                    # preemption_policy=?,
+                    #
+                    # A Container in a Pod may fail for a number of
+                    # reasons, such as because the process in it exited
+                    # with a non-zero exit code, or the Container was
+                    # killed due to OOM etc. If this happens, fail the pod
+                    # and let Metaflow handle the retries.
+                    restart_policy="Never",
+                    service_account_name=self._kwargs["service_account"],
+                    # Terminate the container immediately on SIGTERM
+                    termination_grace_period_seconds=0,
+                    tolerations=[
+                        client.V1Toleration(**toleration)
+                        for toleration in self._kwargs.get("tolerations") or []
+                    ],
+                    volumes=(
+                        [
+                            client.V1Volume(
+                                name="tmpfs-ephemeral-volume",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    # Add default unit as ours differs from Kubernetes default.
+                                    size_limit="{}Mi".format(tmpfs_size),
+                                ),
+                            )
+                        ]
+                        if tmpfs_enabled
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name="dhsm",
+                                empty_dir=client.V1EmptyDirVolumeSource(
+                                    medium="Memory",
+                                    size_limit="{}Mi".format(shared_memory),
+                                ),
+                            )
+                        ]
+                        if shared_memory
+                        else []
+                    )
+                    + (
+                        [
+                            client.V1Volume(
+                                name=claim,
+                                persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
+                                    claim_name=claim
+                                ),
+                            )
+                            for claim in self._kwargs["persistent_volume_claims"].keys()
+                        ]
+                        if self._kwargs["persistent_volume_claims"] is not None
+                        else []
+                    ),
+                    # TODO (savin): Set termination_message_policy
+                ),
+            ),
+        )
+
     def create(self):
         # A discerning eye would notice and question the choice of using the
         # V1Job construct over the V1Pod construct given that we don't rely much
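Two idioms in the added `create_job_spec()` above are easy to misread, so here is a standalone illustration (the `kwargs` dict is hypothetical, standing in for `self._kwargs`): the GPU limit is built with a single-iteration dict comprehension so the key is only emitted when a GPU count is actually requested, and tmpfs counts as enabled either when `use_tmpfs` is set or when a `tmpfs_size` is supplied on its own.

```python
# Illustrative only; `kwargs` stands in for the decorator-supplied self._kwargs.
kwargs = {"gpu": None, "gpu_vendor": "nvidia", "use_tmpfs": False, "tmpfs_size": 1024}

# Single-iteration dict comprehension: produces {} when no GPU is requested,
# otherwise {"<vendor>.com/gpu": "<count>"}.
limits = {
    "%s.com/gpu".lower() % kwargs["gpu_vendor"]: str(kwargs["gpu"])
    for _ in [0]
    if kwargs["gpu"] is not None
}
assert limits == {}

kwargs["gpu"] = 2
limits = {
    "%s.com/gpu".lower() % kwargs["gpu_vendor"]: str(kwargs["gpu"])
    for _ in [0]
    if kwargs["gpu"] is not None
}
assert limits == {"nvidia.com/gpu": "2"}

# tmpfs is enabled explicitly (use_tmpfs) or implicitly by supplying a size.
tmpfs_enabled = kwargs["use_tmpfs"] or (kwargs["tmpfs_size"] and not kwargs["use_tmpfs"])
assert bool(tmpfs_enabled)
```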
@@ -77,11 +279,6 @@ class KubernetesJob(object):
         use_tmpfs = self._kwargs["use_tmpfs"]
         tmpfs_size = self._kwargs["tmpfs_size"]
         tmpfs_enabled = use_tmpfs or (tmpfs_size and not use_tmpfs)
-        shared_memory = (
-            int(self._kwargs["shared_memory"])
-            if self._kwargs["shared_memory"]
-            else None
-        )

         self._job = client.V1Job(
             api_version="batch/v1",
@@ -94,197 +291,7 @@ class KubernetesJob(object):
                 generate_name=self._kwargs["generate_name"],
                 namespace=self._kwargs["namespace"],  # Defaults to `default`
             ),
-            spec=client.V1JobSpec(
-                # Retries are handled by Metaflow when it is responsible for
-                # executing the flow. The responsibility is moved to Kubernetes
-                # when Argo Workflows is responsible for the execution.
-                backoff_limit=self._kwargs.get("retries", 0),
-                completions=1,  # A single non-indexed pod job
-                ttl_seconds_after_finished=7
-                * 60
-                * 60  # Remove job after a week. TODO: Make this configurable
-                * 24,
-                template=client.V1PodTemplateSpec(
-                    metadata=client.V1ObjectMeta(
-                        annotations=self._kwargs.get("annotations", {}),
-                        labels=self._kwargs.get("labels", {}),
-                        namespace=self._kwargs["namespace"],
-                    ),
-                    spec=client.V1PodSpec(
-                        # Timeout is set on the pod and not the job (important!)
-                        active_deadline_seconds=self._kwargs["timeout_in_seconds"],
-                        # TODO (savin): Enable affinities for GPU scheduling.
-                        # affinity=?,
-                        containers=[
-                            client.V1Container(
-                                command=self._kwargs["command"],
-                                ports=[
-                                    client.V1ContainerPort(
-                                        container_port=int(self._kwargs["port"])
-                                    )
-                                ]
-                                if "port" in self._kwargs and self._kwargs["port"]
-                                else None,
-                                env=[
-                                    client.V1EnvVar(name=k, value=str(v))
-                                    for k, v in self._kwargs.get(
-                                        "environment_variables", {}
-                                    ).items()
-                                ]
-                                # And some downward API magic. Add (key, value)
-                                # pairs below to make pod metadata available
-                                # within Kubernetes container.
-                                + [
-                                    client.V1EnvVar(
-                                        name=k,
-                                        value_from=client.V1EnvVarSource(
-                                            field_ref=client.V1ObjectFieldSelector(
-                                                field_path=str(v)
-                                            )
-                                        ),
-                                    )
-                                    for k, v in {
-                                        "METAFLOW_KUBERNETES_POD_NAMESPACE": "metadata.namespace",
-                                        "METAFLOW_KUBERNETES_POD_NAME": "metadata.name",
-                                        "METAFLOW_KUBERNETES_POD_ID": "metadata.uid",
-                                        "METAFLOW_KUBERNETES_SERVICE_ACCOUNT_NAME": "spec.serviceAccountName",
-                                        "METAFLOW_KUBERNETES_NODE_IP": "status.hostIP",
-                                    }.items()
-                                ]
-                                + [
-                                    client.V1EnvVar(name=k, value=str(v))
-                                    for k, v in inject_tracing_vars({}).items()
-                                ],
-                                env_from=[
-                                    client.V1EnvFromSource(
-                                        secret_ref=client.V1SecretEnvSource(
-                                            name=str(k),
-                                            # optional=True
-                                        )
-                                    )
-                                    for k in list(self._kwargs.get("secrets", []))
-                                    + KUBERNETES_SECRETS.split(",")
-                                    if k
-                                ],
-                                image=self._kwargs["image"],
-                                image_pull_policy=self._kwargs["image_pull_policy"],
-                                name=self._kwargs["step_name"].replace("_", "-"),
-                                resources=client.V1ResourceRequirements(
-                                    requests={
-                                        "cpu": str(self._kwargs["cpu"]),
-                                        "memory": "%sM" % str(self._kwargs["memory"]),
-                                        "ephemeral-storage": "%sM"
-                                        % str(self._kwargs["disk"]),
-                                    },
-                                    limits={
-                                        "%s.com/gpu".lower()
-                                        % self._kwargs["gpu_vendor"]: str(
-                                            self._kwargs["gpu"]
-                                        )
-                                        for k in [0]
-                                        # Don't set GPU limits if gpu isn't specified.
-                                        if self._kwargs["gpu"] is not None
-                                    },
-                                ),
-                                volume_mounts=(
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=self._kwargs.get("tmpfs_path"),
-                                            name="tmpfs-ephemeral-volume",
-                                        )
-                                    ]
-                                    if tmpfs_enabled
-                                    else []
-                                )
-                                + (
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path="/dev/shm", name="dhsm"
-                                        )
-                                    ]
-                                    if shared_memory
-                                    else []
-                                )
-                                + (
-                                    [
-                                        client.V1VolumeMount(
-                                            mount_path=path, name=claim
-                                        )
-                                        for claim, path in self._kwargs[
-                                            "persistent_volume_claims"
-                                        ].items()
-                                    ]
-                                    if self._kwargs["persistent_volume_claims"]
-                                    is not None
-                                    else []
-                                ),
-                            )
-                        ],
-                        node_selector=self._kwargs.get("node_selector"),
-                        # TODO (savin): Support image_pull_secrets
-                        # image_pull_secrets=?,
-                        # TODO (savin): Support preemption policies
-                        # preemption_policy=?,
-                        #
-                        # A Container in a Pod may fail for a number of
-                        # reasons, such as because the process in it exited
-                        # with a non-zero exit code, or the Container was
-                        # killed due to OOM etc. If this happens, fail the pod
-                        # and let Metaflow handle the retries.
-                        restart_policy="Never",
-                        service_account_name=self._kwargs["service_account"],
-                        # Terminate the container immediately on SIGTERM
-                        termination_grace_period_seconds=0,
-                        tolerations=[
-                            client.V1Toleration(**toleration)
-                            for toleration in self._kwargs.get("tolerations") or []
-                        ],
-                        volumes=(
-                            [
-                                client.V1Volume(
-                                    name="tmpfs-ephemeral-volume",
-                                    empty_dir=client.V1EmptyDirVolumeSource(
-                                        medium="Memory",
-                                        # Add default unit as ours differs from Kubernetes default.
-                                        size_limit="{}Mi".format(tmpfs_size),
-                                    ),
-                                )
-                            ]
-                            if tmpfs_enabled
-                            else []
-                        )
-                        + (
-                            [
-                                client.V1Volume(
-                                    name="dhsm",
-                                    empty_dir=client.V1EmptyDirVolumeSource(
-                                        medium="Memory",
-                                        size_limit="{}Mi".format(shared_memory),
-                                    ),
-                                )
-                            ]
-                            if shared_memory
-                            else []
-                        )
-                        + (
-                            [
-                                client.V1Volume(
-                                    name=claim,
-                                    persistent_volume_claim=client.V1PersistentVolumeClaimVolumeSource(
-                                        claim_name=claim
-                                    ),
-                                )
-                                for claim in self._kwargs[
-                                    "persistent_volume_claims"
-                                ].keys()
-                            ]
-                            if self._kwargs["persistent_volume_claims"] is not None
-                            else []
-                        ),
-                        # TODO (savin): Set termination_message_policy
-                    ),
-                ),
-            ),
+            spec=self.create_job_spec(),
         )
         return self

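Apart from the relocation, the extracted spec differs from the removed inline one in two small ways visible above: `completions` is now read from the keyword arguments (defaulting to 1) instead of being hard-coded, and `ports` is now always a list (empty when no port is given) rather than `None`. A tiny sketch of the new behavior, using a hypothetical `kwargs` dict in place of `self._kwargs`:

```python
# Hypothetical kwargs; mirrors the two behavioral tweaks in create_job_spec().
kwargs = {"port": None}

completions = kwargs.get("completions", 1)  # was hard-coded to 1
ports = [] if kwargs["port"] is None else [int(kwargs["port"])]

assert completions == 1
assert ports == []

kwargs = {"completions": 4, "port": "8080"}
assert kwargs.get("completions", 1) == 4
assert ([] if kwargs["port"] is None else [int(kwargs["port"])]) == [8080]
```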
@@ -418,7 +425,7 @@ class RunningJob(object):
         def best_effort_kill():
             try:
                 self.kill()
-            except:
+            except Exception as ex:
                 pass

         atexit.register(best_effort_kill)
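The `best_effort_kill` change above narrows a bare `except:` to `except Exception`. The difference matters in an atexit handler: a bare `except` also swallows `BaseException` subclasses such as `KeyboardInterrupt` and `SystemExit`, which `except Exception` deliberately lets propagate. A small standalone demonstration (not Metaflow code):

```python
# Standalone demonstration of bare `except` vs `except Exception`.
def swallow_everything():
    try:
        raise KeyboardInterrupt
    except:  # noqa: E722 -- bare except catches BaseException too
        pass  # KeyboardInterrupt is silently discarded here


def swallow_errors_only():
    try:
        raise KeyboardInterrupt
    except Exception:
        pass  # not reached: KeyboardInterrupt is not an Exception subclass


swallow_everything()  # returns normally
try:
    swallow_errors_only()
except KeyboardInterrupt:
    print("KeyboardInterrupt escaped, as intended")
```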
@@ -482,9 +489,9 @@ class RunningJob(object):
         # 3. If the pod object hasn't shown up yet, we set the parallelism to 0
         #    to preempt it.
         client = self._client.get()
+
         if not self.is_done:
             if self.is_running:
-
                 # Case 1.
                 from kubernetes.stream import stream
