ob-metaflow 2.12.30.2__py2.py3-none-any.whl → 2.13.6.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow might be problematic. Click here for more details.
- metaflow/__init__.py +3 -0
- metaflow/cards.py +1 -0
- metaflow/cli.py +185 -717
- metaflow/cli_args.py +17 -0
- metaflow/cli_components/__init__.py +0 -0
- metaflow/cli_components/dump_cmd.py +96 -0
- metaflow/cli_components/init_cmd.py +51 -0
- metaflow/cli_components/run_cmds.py +362 -0
- metaflow/cli_components/step_cmd.py +176 -0
- metaflow/cli_components/utils.py +140 -0
- metaflow/cmd/develop/stub_generator.py +9 -2
- metaflow/datastore/flow_datastore.py +2 -2
- metaflow/decorators.py +63 -2
- metaflow/exception.py +8 -2
- metaflow/extension_support/plugins.py +42 -27
- metaflow/flowspec.py +176 -23
- metaflow/graph.py +28 -27
- metaflow/includefile.py +50 -22
- metaflow/lint.py +35 -20
- metaflow/metadata_provider/heartbeat.py +23 -8
- metaflow/metaflow_config.py +10 -1
- metaflow/multicore_utils.py +31 -14
- metaflow/package.py +17 -3
- metaflow/parameters.py +97 -25
- metaflow/plugins/__init__.py +22 -0
- metaflow/plugins/airflow/airflow.py +18 -17
- metaflow/plugins/airflow/airflow_cli.py +1 -0
- metaflow/plugins/argo/argo_client.py +0 -2
- metaflow/plugins/argo/argo_workflows.py +195 -132
- metaflow/plugins/argo/argo_workflows_cli.py +1 -1
- metaflow/plugins/argo/argo_workflows_decorator.py +2 -4
- metaflow/plugins/argo/argo_workflows_deployer_objects.py +51 -9
- metaflow/plugins/argo/jobset_input_paths.py +0 -1
- metaflow/plugins/aws/aws_utils.py +6 -1
- metaflow/plugins/aws/batch/batch_client.py +1 -3
- metaflow/plugins/aws/batch/batch_decorator.py +13 -13
- metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +33 -1
- metaflow/plugins/aws/step_functions/step_functions_cli.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +0 -1
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +7 -9
- metaflow/plugins/cards/card_cli.py +7 -2
- metaflow/plugins/cards/card_creator.py +1 -0
- metaflow/plugins/cards/card_decorator.py +79 -8
- metaflow/plugins/cards/card_modules/basic.py +56 -5
- metaflow/plugins/cards/card_modules/card.py +16 -1
- metaflow/plugins/cards/card_modules/components.py +64 -16
- metaflow/plugins/cards/card_modules/main.js +27 -25
- metaflow/plugins/cards/card_modules/test_cards.py +4 -4
- metaflow/plugins/cards/component_serializer.py +1 -1
- metaflow/plugins/datatools/s3/s3.py +12 -4
- metaflow/plugins/datatools/s3/s3op.py +3 -3
- metaflow/plugins/events_decorator.py +338 -186
- metaflow/plugins/kubernetes/kube_utils.py +84 -1
- metaflow/plugins/kubernetes/kubernetes.py +40 -92
- metaflow/plugins/kubernetes/kubernetes_cli.py +32 -7
- metaflow/plugins/kubernetes/kubernetes_decorator.py +76 -4
- metaflow/plugins/kubernetes/kubernetes_job.py +23 -20
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +41 -20
- metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
- metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
- metaflow/plugins/parallel_decorator.py +4 -1
- metaflow/plugins/project_decorator.py +33 -5
- metaflow/plugins/pypi/bootstrap.py +249 -81
- metaflow/plugins/pypi/conda_decorator.py +20 -10
- metaflow/plugins/pypi/conda_environment.py +83 -27
- metaflow/plugins/pypi/micromamba.py +82 -37
- metaflow/plugins/pypi/pip.py +9 -6
- metaflow/plugins/pypi/pypi_decorator.py +11 -9
- metaflow/plugins/pypi/utils.py +4 -2
- metaflow/plugins/timeout_decorator.py +2 -2
- metaflow/runner/click_api.py +240 -50
- metaflow/runner/deployer.py +1 -1
- metaflow/runner/deployer_impl.py +12 -11
- metaflow/runner/metaflow_runner.py +68 -34
- metaflow/runner/nbdeploy.py +2 -0
- metaflow/runner/nbrun.py +1 -1
- metaflow/runner/subprocess_manager.py +61 -10
- metaflow/runner/utils.py +208 -44
- metaflow/runtime.py +216 -112
- metaflow/sidecar/sidecar_worker.py +1 -1
- metaflow/tracing/tracing_modules.py +4 -1
- metaflow/user_configs/__init__.py +0 -0
- metaflow/user_configs/config_decorators.py +563 -0
- metaflow/user_configs/config_options.py +548 -0
- metaflow/user_configs/config_parameters.py +436 -0
- metaflow/util.py +22 -0
- metaflow/version.py +1 -1
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/METADATA +12 -3
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/RECORD +96 -84
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,14 @@
|
|
|
1
|
-
|
|
1
|
+
import re
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
from metaflow.exception import CommandException, MetaflowException
|
|
2
4
|
from metaflow.util import get_username, get_latest_run_id
|
|
3
5
|
|
|
4
6
|
|
|
7
|
+
# avoid circular import by having the exception class contained here
class KubernetesException(MetaflowException):
    """Error raised by the Kubernetes helpers in this module.

    The label validation and key/value parsing helpers raise it for
    invalid label values, duplicate keys and unparsable entries.
    """

    # Short label for this error category.
    # NOTE(review): presumably consumed by MetaflowException when the error
    # is reported to the user -- confirm against metaflow.exception.
    headline = "Kubernetes error"
|
|
10
|
+
|
|
11
|
+
|
|
5
12
|
def parse_cli_options(flow_name, run_id, user, my_runs, echo):
|
|
6
13
|
if user and my_runs:
|
|
7
14
|
raise CommandException("--user and --my-runs are mutually exclusive.")
|
|
@@ -23,3 +30,79 @@ def parse_cli_options(flow_name, run_id, user, my_runs, echo):
|
|
|
23
30
|
raise CommandException("A previous run id was not found. Specify --run-id.")
|
|
24
31
|
|
|
25
32
|
return flow_name, run_id, user
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def qos_requests_and_limits(qos: Optional[str], cpu: int, memory: int, storage: int):
    """Return resource requests and limits for a pod, based on its QoS class.

    Parameters
    ----------
    qos : str, optional
        Quality of Service class, matched case-insensitively. "guaranteed"
        produces limits equal to the requests. Any other value, including
        None or an empty string, is treated as Burstable (requests only).
    cpu : int
        CPU amount; rendered with ``str()``.
    memory : int
        Memory amount; rendered with an ``M`` suffix.
    storage : int
        Ephemeral storage amount; rendered with an ``M`` suffix.

    Returns
    -------
    tuple of (dict, dict)
        ``(requests, limits)``. The two dicts are always distinct objects so
        that callers can modify one without affecting the other.
    """
    qos_requests = {
        "cpu": str(cpu),
        "memory": "%sM" % str(memory),
        "ephemeral-storage": "%sM" % str(storage),
    }

    # A missing QoS class falls back to Burstable instead of failing on
    # ``None.lower()``.
    if (qos or "").lower() == "guaranteed":
        # Guaranteed - has both cpu/memory limits.
        # NOTE: Even though Kubernetes will produce matching requests for the
        # specified limits, this happens late in the lifecycle. Requests are
        # specified explicitly to make some K8S tooling happy, in case it
        # relies on .resources.requests being present when the job is
        # submitted.
        return qos_requests, dict(qos_requests)

    # Burstable - not Guaranteed, and has a memory/cpu limit or request.
    # TODO: Add support for BestEffort once there is a use case for it.
    # BestEffort - no limit or requests for cpu/memory
    return qos_requests, {}
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def validate_kube_labels(
    labels: Optional[Dict[str, Optional[str]]],
) -> bool:
    """Validate label values.

    This validates the kubernetes label values. It does not validate the keys.
    Ideally, keys should be static and also the validation rules for keys are
    more complex than those for values. For full validation rules, see:

    https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set

    Parameters
    ----------
    labels : Dict[str, Optional[str]], optional
        Mapping of label keys to values. None or an empty mapping is valid.

    Returns
    -------
    bool
        Always True when every value is valid.

    Raises
    ------
    KubernetesException
        If any non-empty value breaks the label value rules.
    """
    if not labels:
        return True

    regex_match = r"^(([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9])?$"
    for value in labels.values():
        if not value:
            # allow empty label
            continue
        # fullmatch must consume the whole string. With re.search, "$" also
        # matches just before a trailing newline, so "abc\n" was accepted.
        if not re.fullmatch(regex_match, value):
            raise KubernetesException(
                'Invalid value: "%s"\n'
                "A valid label must be an empty string or one that\n"
                "  - Consist of alphanumeric, '-', '_' or '.' characters\n"
                "  - Begins and ends with an alphanumeric character\n"
                "  - Is at most 63 characters" % value
            )
    return True
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def parse_kube_keyvalue_list(items: List[str], requires_both: bool = True):
    """Parse ``key=value`` strings into a dict.

    Each entry is split on the first ``=`` only, so values may contain ``=``.

    Parameters
    ----------
    items : List[str]
        Entries of the form ``key=value`` or, when `requires_both` is False,
        a bare ``key``.
    requires_both : bool, default True
        If True, every entry must contain ``=``. If False, a bare key maps
        to None.

    Returns
    -------
    dict
        Keys mapped to their string value, or to None for bare keys.

    Raises
    ------
    KubernetesException
        If an entry has no ``split`` method, lacks a required value, or
        repeats a key that was already seen.
    """

    def unparsable():
        # Wrap items in a 1-tuple: "%s" % items would unpack a tuple argument
        # and raise TypeError for anything but a single-element tuple.
        return KubernetesException(
            "Unable to parse kubernetes list: %s" % (items,)
        )

    ret = {}
    for item_str in items:
        try:
            item = item_str.split("=", 1)
        except AttributeError as err:
            raise unparsable() from err
        if requires_both and len(item) < 2:
            raise unparsable()
        key = str(item[0])
        if key in ret:
            raise KubernetesException("Duplicate key found: %s" % key)
        ret[key] = str(item[1]) if len(item) > 1 else None
    return ret
|
|
@@ -1,11 +1,8 @@
|
|
|
1
|
-
import copy
|
|
2
1
|
import json
|
|
3
2
|
import math
|
|
4
3
|
import os
|
|
5
|
-
import re
|
|
6
4
|
import shlex
|
|
7
5
|
import time
|
|
8
|
-
from typing import Dict, List, Optional
|
|
9
6
|
from uuid import uuid4
|
|
10
7
|
|
|
11
8
|
from metaflow import current, util
|
|
@@ -35,7 +32,6 @@ from metaflow.metaflow_config import (
|
|
|
35
32
|
DEFAULT_SECRETS_BACKEND_TYPE,
|
|
36
33
|
GCP_SECRET_MANAGER_PREFIX,
|
|
37
34
|
KUBERNETES_FETCH_EC2_METADATA,
|
|
38
|
-
KUBERNETES_LABELS,
|
|
39
35
|
KUBERNETES_SANDBOX_INIT_SCRIPT,
|
|
40
36
|
OTEL_ENDPOINT,
|
|
41
37
|
S3_ENDPOINT_URL,
|
|
@@ -193,9 +189,11 @@ class Kubernetes(object):
|
|
|
193
189
|
persistent_volume_claims=None,
|
|
194
190
|
tolerations=None,
|
|
195
191
|
labels=None,
|
|
192
|
+
annotations=None,
|
|
196
193
|
shared_memory=None,
|
|
197
194
|
port=None,
|
|
198
195
|
num_parallel=None,
|
|
196
|
+
qos=None,
|
|
199
197
|
):
|
|
200
198
|
name = "js-%s" % str(uuid4())[:6]
|
|
201
199
|
jobset = (
|
|
@@ -228,6 +226,7 @@ class Kubernetes(object):
|
|
|
228
226
|
shared_memory=shared_memory,
|
|
229
227
|
port=port,
|
|
230
228
|
num_parallel=num_parallel,
|
|
229
|
+
qos=qos,
|
|
231
230
|
)
|
|
232
231
|
.environment_variable("METAFLOW_CODE_SHA", code_package_sha)
|
|
233
232
|
.environment_variable("METAFLOW_CODE_URL", code_package_url)
|
|
@@ -302,17 +301,13 @@ class Kubernetes(object):
|
|
|
302
301
|
# see get_datastore_root_from_config in datastore/local.py).
|
|
303
302
|
)
|
|
304
303
|
|
|
305
|
-
_labels = self._get_labels(labels)
|
|
306
|
-
for k, v in _labels.items():
|
|
307
|
-
jobset.label(k, v)
|
|
308
|
-
|
|
309
304
|
for k in list(
|
|
310
305
|
[] if not secrets else [secrets] if isinstance(secrets, str) else secrets
|
|
311
306
|
) + KUBERNETES_SECRETS.split(","):
|
|
312
307
|
jobset.secret(k)
|
|
313
308
|
|
|
314
309
|
initial_configs = init_config()
|
|
315
|
-
for entry in ["OBP_PERIMETER", "
|
|
310
|
+
for entry in ["OBP_PERIMETER", "OBP_INTEGRATIONS_URL"]:
|
|
316
311
|
if entry not in initial_configs:
|
|
317
312
|
raise KubernetesException(
|
|
318
313
|
f"{entry} was not found in metaflow config. Please make sure to run `outerbounds configure <...>` command which can be found on the Ourebounds UI or reach out to your Outerbounds support team."
|
|
@@ -320,8 +315,8 @@ class Kubernetes(object):
|
|
|
320
315
|
|
|
321
316
|
additional_obp_configs = {
|
|
322
317
|
"OBP_PERIMETER": initial_configs["OBP_PERIMETER"],
|
|
323
|
-
"
|
|
324
|
-
"
|
|
318
|
+
"OBP_INTEGRATIONS_URL": initial_configs[
|
|
319
|
+
"OBP_INTEGRATIONS_URL"
|
|
325
320
|
],
|
|
326
321
|
}
|
|
327
322
|
for k, v in additional_obp_configs.items():
|
|
@@ -393,13 +388,16 @@ class Kubernetes(object):
|
|
|
393
388
|
for name, value in env.items():
|
|
394
389
|
jobset.environment_variable(name, value)
|
|
395
390
|
|
|
396
|
-
|
|
391
|
+
system_annotations = {
|
|
397
392
|
"metaflow/user": user,
|
|
398
393
|
"metaflow/flow_name": flow_name,
|
|
399
394
|
"metaflow/control-task-id": task_id,
|
|
395
|
+
"metaflow/run_id": run_id,
|
|
396
|
+
"metaflow/step_name": step_name,
|
|
397
|
+
"metaflow/attempt": attempt,
|
|
400
398
|
}
|
|
401
399
|
if current.get("project_name"):
|
|
402
|
-
|
|
400
|
+
system_annotations.update(
|
|
403
401
|
{
|
|
404
402
|
"metaflow/project_name": current.project_name,
|
|
405
403
|
"metaflow/branch_name": current.branch_name,
|
|
@@ -407,15 +405,15 @@ class Kubernetes(object):
|
|
|
407
405
|
}
|
|
408
406
|
)
|
|
409
407
|
|
|
410
|
-
|
|
411
|
-
|
|
408
|
+
system_labels = {
|
|
409
|
+
"app.kubernetes.io/name": "metaflow-task",
|
|
410
|
+
"app.kubernetes.io/part-of": "metaflow",
|
|
411
|
+
}
|
|
412
|
+
|
|
413
|
+
jobset.labels({**({} if not labels else labels), **system_labels})
|
|
412
414
|
|
|
413
|
-
(
|
|
414
|
-
|
|
415
|
-
.annotation("metaflow/step_name", step_name)
|
|
416
|
-
.annotation("metaflow/attempt", attempt)
|
|
417
|
-
.label("app.kubernetes.io/name", "metaflow-task")
|
|
418
|
-
.label("app.kubernetes.io/part-of", "metaflow")
|
|
415
|
+
jobset.annotations(
|
|
416
|
+
{**({} if not annotations else annotations), **system_annotations}
|
|
419
417
|
)
|
|
420
418
|
# We need this task-id set so that all the nodes are aware of the control
|
|
421
419
|
# task's task-id. These "MF_" variables populate the `current.parallel` namedtuple
|
|
@@ -504,6 +502,8 @@ class Kubernetes(object):
|
|
|
504
502
|
shared_memory=None,
|
|
505
503
|
port=None,
|
|
506
504
|
name_pattern=None,
|
|
505
|
+
qos=None,
|
|
506
|
+
annotations=None,
|
|
507
507
|
):
|
|
508
508
|
if env is None:
|
|
509
509
|
env = {}
|
|
@@ -536,7 +536,8 @@ class Kubernetes(object):
|
|
|
536
536
|
retries=0,
|
|
537
537
|
step_name=step_name,
|
|
538
538
|
tolerations=tolerations,
|
|
539
|
-
labels=
|
|
539
|
+
labels=labels,
|
|
540
|
+
annotations=annotations,
|
|
540
541
|
use_tmpfs=use_tmpfs,
|
|
541
542
|
tmpfs_tempdir=tmpfs_tempdir,
|
|
542
543
|
tmpfs_size=tmpfs_size,
|
|
@@ -544,6 +545,7 @@ class Kubernetes(object):
|
|
|
544
545
|
persistent_volume_claims=persistent_volume_claims,
|
|
545
546
|
shared_memory=shared_memory,
|
|
546
547
|
port=port,
|
|
548
|
+
qos=qos,
|
|
547
549
|
)
|
|
548
550
|
.environment_variable("METAFLOW_CODE_SHA", code_package_sha)
|
|
549
551
|
.environment_variable("METAFLOW_CODE_URL", code_package_url)
|
|
@@ -654,13 +656,25 @@ class Kubernetes(object):
|
|
|
654
656
|
|
|
655
657
|
for name, value in env.items():
|
|
656
658
|
job.environment_variable(name, value)
|
|
659
|
+
# Add job specific labels
|
|
660
|
+
system_labels = {
|
|
661
|
+
"app.kubernetes.io/name": "metaflow-task",
|
|
662
|
+
"app.kubernetes.io/part-of": "metaflow",
|
|
663
|
+
}
|
|
664
|
+
for name, value in system_labels.items():
|
|
665
|
+
job.label(name, value)
|
|
657
666
|
|
|
658
|
-
annotations
|
|
659
|
-
|
|
667
|
+
# Add job specific annotations not set in the decorator.
|
|
668
|
+
system_annotations = {
|
|
660
669
|
"metaflow/flow_name": flow_name,
|
|
670
|
+
"metaflow/run_id": run_id,
|
|
671
|
+
"metaflow/step_name": step_name,
|
|
672
|
+
"metaflow/task_id": task_id,
|
|
673
|
+
"metaflow/attempt": attempt,
|
|
674
|
+
"metaflow/user": user,
|
|
661
675
|
}
|
|
662
676
|
if current.get("project_name"):
|
|
663
|
-
|
|
677
|
+
system_annotations.update(
|
|
664
678
|
{
|
|
665
679
|
"metaflow/project_name": current.project_name,
|
|
666
680
|
"metaflow/branch_name": current.branch_name,
|
|
@@ -668,18 +682,9 @@ class Kubernetes(object):
|
|
|
668
682
|
}
|
|
669
683
|
)
|
|
670
684
|
|
|
671
|
-
for name, value in
|
|
685
|
+
for name, value in system_annotations.items():
|
|
672
686
|
job.annotation(name, value)
|
|
673
687
|
|
|
674
|
-
(
|
|
675
|
-
job.annotation("metaflow/run_id", run_id)
|
|
676
|
-
.annotation("metaflow/step_name", step_name)
|
|
677
|
-
.annotation("metaflow/task_id", task_id)
|
|
678
|
-
.annotation("metaflow/attempt", attempt)
|
|
679
|
-
.label("app.kubernetes.io/name", "metaflow-task")
|
|
680
|
-
.label("app.kubernetes.io/part-of", "metaflow")
|
|
681
|
-
)
|
|
682
|
-
|
|
683
688
|
return job
|
|
684
689
|
|
|
685
690
|
def create_k8sjob(self, job):
|
|
@@ -787,60 +792,3 @@ class Kubernetes(object):
|
|
|
787
792
|
"stderr",
|
|
788
793
|
job_id=self._job.id,
|
|
789
794
|
)
|
|
790
|
-
|
|
791
|
-
@staticmethod
|
|
792
|
-
def _get_labels(extra_labels=None):
|
|
793
|
-
if extra_labels is None:
|
|
794
|
-
extra_labels = {}
|
|
795
|
-
env_labels = KUBERNETES_LABELS.split(",") if KUBERNETES_LABELS else []
|
|
796
|
-
env_labels = parse_kube_keyvalue_list(env_labels, False)
|
|
797
|
-
labels = {**env_labels, **extra_labels}
|
|
798
|
-
validate_kube_labels(labels)
|
|
799
|
-
return labels
|
|
800
|
-
|
|
801
|
-
|
|
802
|
-
def validate_kube_labels(
|
|
803
|
-
labels: Optional[Dict[str, Optional[str]]],
|
|
804
|
-
) -> bool:
|
|
805
|
-
"""Validate label values.
|
|
806
|
-
|
|
807
|
-
This validates the kubernetes label values. It does not validate the keys.
|
|
808
|
-
Ideally, keys should be static and also the validation rules for keys are
|
|
809
|
-
more complex than those for values. For full validation rules, see:
|
|
810
|
-
|
|
811
|
-
https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
|
|
812
|
-
"""
|
|
813
|
-
|
|
814
|
-
def validate_label(s: Optional[str]):
|
|
815
|
-
regex_match = r"^(([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9])?$"
|
|
816
|
-
if not s:
|
|
817
|
-
# allow empty label
|
|
818
|
-
return True
|
|
819
|
-
if not re.search(regex_match, s):
|
|
820
|
-
raise KubernetesException(
|
|
821
|
-
'Invalid value: "%s"\n'
|
|
822
|
-
"A valid label must be an empty string or one that\n"
|
|
823
|
-
" - Consist of alphanumeric, '-', '_' or '.' characters\n"
|
|
824
|
-
" - Begins and ends with an alphanumeric character\n"
|
|
825
|
-
" - Is at most 63 characters" % s
|
|
826
|
-
)
|
|
827
|
-
return True
|
|
828
|
-
|
|
829
|
-
return all([validate_label(v) for v in labels.values()]) if labels else True
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
def parse_kube_keyvalue_list(items: List[str], requires_both: bool = True):
|
|
833
|
-
try:
|
|
834
|
-
ret = {}
|
|
835
|
-
for item_str in items:
|
|
836
|
-
item = item_str.split("=", 1)
|
|
837
|
-
if requires_both:
|
|
838
|
-
item[1] # raise IndexError
|
|
839
|
-
if str(item[0]) in ret:
|
|
840
|
-
raise KubernetesException("Duplicate key found: %s" % str(item[0]))
|
|
841
|
-
ret[str(item[0])] = str(item[1]) if len(item) > 1 else None
|
|
842
|
-
return ret
|
|
843
|
-
except KubernetesException as e:
|
|
844
|
-
raise e
|
|
845
|
-
except (AttributeError, IndexError):
|
|
846
|
-
raise KubernetesException("Unable to parse kubernetes list: %s" % items)
|
|
@@ -3,14 +3,17 @@ import sys
|
|
|
3
3
|
import time
|
|
4
4
|
import traceback
|
|
5
5
|
|
|
6
|
-
from metaflow.plugins.kubernetes.kube_utils import
|
|
6
|
+
from metaflow.plugins.kubernetes.kube_utils import (
|
|
7
|
+
parse_cli_options,
|
|
8
|
+
parse_kube_keyvalue_list,
|
|
9
|
+
)
|
|
7
10
|
from metaflow.plugins.kubernetes.kubernetes_client import KubernetesClient
|
|
8
11
|
import metaflow.tracing as tracing
|
|
9
12
|
from metaflow import JSONTypeClass, util
|
|
10
13
|
from metaflow._vendor import click
|
|
11
14
|
from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY, MetaflowException
|
|
12
15
|
from metaflow.metadata_provider.util import sync_local_metadata_from_datastore
|
|
13
|
-
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
|
|
16
|
+
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
|
|
14
17
|
from metaflow.mflog import TASK_LOG_SOURCE
|
|
15
18
|
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
|
16
19
|
|
|
@@ -18,9 +21,7 @@ from .kubernetes import (
|
|
|
18
21
|
Kubernetes,
|
|
19
22
|
KubernetesException,
|
|
20
23
|
KubernetesKilledException,
|
|
21
|
-
parse_kube_keyvalue_list,
|
|
22
24
|
)
|
|
23
|
-
from .kubernetes_decorator import KubernetesDecorator
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
@click.group()
|
|
@@ -33,12 +34,12 @@ def kubernetes():
|
|
|
33
34
|
pass
|
|
34
35
|
|
|
35
36
|
|
|
36
|
-
@tracing.cli_entrypoint("kubernetes/step")
|
|
37
37
|
@kubernetes.command(
|
|
38
38
|
help="Execute a single task on Kubernetes. This command calls the top-level step "
|
|
39
39
|
"command inside a Kubernetes pod with the given options. Typically you do not call "
|
|
40
40
|
"this command directly; it is used internally by Metaflow."
|
|
41
41
|
)
|
|
42
|
+
@tracing.cli_entrypoint("kubernetes/step")
|
|
42
43
|
@click.argument("step-name")
|
|
43
44
|
@click.argument("code-package-sha")
|
|
44
45
|
@click.argument("code-package-url")
|
|
@@ -126,6 +127,24 @@ def kubernetes():
|
|
|
126
127
|
type=int,
|
|
127
128
|
help="Number of parallel nodes to run as a multi-node job.",
|
|
128
129
|
)
|
|
130
|
+
@click.option(
|
|
131
|
+
"--qos",
|
|
132
|
+
default=None,
|
|
133
|
+
type=str,
|
|
134
|
+
help="Quality of Service class for the Kubernetes pod",
|
|
135
|
+
)
|
|
136
|
+
@click.option(
|
|
137
|
+
"--labels",
|
|
138
|
+
default=None,
|
|
139
|
+
type=JSONTypeClass(),
|
|
140
|
+
multiple=False,
|
|
141
|
+
)
|
|
142
|
+
@click.option(
|
|
143
|
+
"--annotations",
|
|
144
|
+
default=None,
|
|
145
|
+
type=JSONTypeClass(),
|
|
146
|
+
multiple=False,
|
|
147
|
+
)
|
|
129
148
|
@click.pass_context
|
|
130
149
|
def step(
|
|
131
150
|
ctx,
|
|
@@ -154,6 +173,9 @@ def step(
|
|
|
154
173
|
shared_memory=None,
|
|
155
174
|
port=None,
|
|
156
175
|
num_parallel=None,
|
|
176
|
+
qos=None,
|
|
177
|
+
labels=None,
|
|
178
|
+
annotations=None,
|
|
157
179
|
**kwargs
|
|
158
180
|
):
|
|
159
181
|
def echo(msg, stream="stderr", job_id=None, **kwargs):
|
|
@@ -168,7 +190,7 @@ def step(
|
|
|
168
190
|
executable = ctx.obj.environment.executable(step_name, executable)
|
|
169
191
|
|
|
170
192
|
# Set environment
|
|
171
|
-
env = {}
|
|
193
|
+
env = {"METAFLOW_FLOW_FILENAME": os.path.basename(sys.argv[0])}
|
|
172
194
|
env_deco = [deco for deco in node.decorators if deco.name == "environment"]
|
|
173
195
|
if env_deco:
|
|
174
196
|
env = env_deco[0].attributes["vars"]
|
|
@@ -294,8 +316,11 @@ def step(
|
|
|
294
316
|
shared_memory=shared_memory,
|
|
295
317
|
port=port,
|
|
296
318
|
num_parallel=num_parallel,
|
|
319
|
+
qos=qos,
|
|
320
|
+
labels=labels,
|
|
321
|
+
annotations=annotations,
|
|
297
322
|
)
|
|
298
|
-
except Exception
|
|
323
|
+
except Exception:
|
|
299
324
|
traceback.print_exc(chain=False)
|
|
300
325
|
_sync_metadata()
|
|
301
326
|
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
|
|
@@ -19,6 +19,8 @@ from metaflow.metaflow_config import (
|
|
|
19
19
|
KUBERNETES_GPU_VENDOR,
|
|
20
20
|
KUBERNETES_IMAGE_PULL_POLICY,
|
|
21
21
|
KUBERNETES_MEMORY,
|
|
22
|
+
KUBERNETES_LABELS,
|
|
23
|
+
KUBERNETES_ANNOTATIONS,
|
|
22
24
|
KUBERNETES_NAMESPACE,
|
|
23
25
|
KUBERNETES_NODE_SELECTOR,
|
|
24
26
|
KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
|
|
@@ -26,6 +28,7 @@ from metaflow.metaflow_config import (
|
|
|
26
28
|
KUBERNETES_SERVICE_ACCOUNT,
|
|
27
29
|
KUBERNETES_SHARED_MEMORY,
|
|
28
30
|
KUBERNETES_TOLERATIONS,
|
|
31
|
+
KUBERNETES_QOS,
|
|
29
32
|
)
|
|
30
33
|
from metaflow.plugins.resources_decorator import ResourcesDecorator
|
|
31
34
|
from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
|
|
@@ -33,7 +36,8 @@ from metaflow.sidecar import Sidecar
|
|
|
33
36
|
from metaflow.unbounded_foreach import UBF_CONTROL
|
|
34
37
|
|
|
35
38
|
from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
|
|
36
|
-
from .kubernetes import KubernetesException
|
|
39
|
+
from .kubernetes import KubernetesException
|
|
40
|
+
from .kube_utils import validate_kube_labels, parse_kube_keyvalue_list
|
|
37
41
|
|
|
38
42
|
from metaflow.metaflow_config import MAX_MEMORY_PER_TASK, MAX_CPU_PER_TASK
|
|
39
43
|
|
|
@@ -43,6 +47,8 @@ except NameError:
|
|
|
43
47
|
unicode = str
|
|
44
48
|
basestring = str
|
|
45
49
|
|
|
50
|
+
SUPPORTED_KUBERNETES_QOS_CLASSES = ["Guaranteed", "Burstable"]
|
|
51
|
+
|
|
46
52
|
|
|
47
53
|
class KubernetesDecorator(StepDecorator):
|
|
48
54
|
"""
|
|
@@ -88,6 +94,10 @@ class KubernetesDecorator(StepDecorator):
|
|
|
88
94
|
tolerations : List[str], default []
|
|
89
95
|
The default is extracted from METAFLOW_KUBERNETES_TOLERATIONS.
|
|
90
96
|
Kubernetes tolerations to use when launching pod in Kubernetes.
|
|
97
|
+
labels: Dict[str, str], default: METAFLOW_KUBERNETES_LABELS
|
|
98
|
+
Kubernetes labels to use when launching pod in Kubernetes.
|
|
99
|
+
annotations: Dict[str, str], default: METAFLOW_KUBERNETES_ANNOTATIONS
|
|
100
|
+
Kubernetes annotations to use when launching pod in Kubernetes.
|
|
91
101
|
use_tmpfs : bool, default False
|
|
92
102
|
This enables an explicit tmpfs mount for this step.
|
|
93
103
|
tmpfs_tempdir : bool, default True
|
|
@@ -111,6 +121,8 @@ class KubernetesDecorator(StepDecorator):
|
|
|
111
121
|
hostname_resolution_timeout: int, default 10 * 60
|
|
112
122
|
Timeout in seconds for the workers tasks in the gang scheduled cluster to resolve the hostname of control task.
|
|
113
123
|
Only applicable when @parallel is used.
|
|
124
|
+
qos: str, default: Burstable
|
|
125
|
+
Quality of Service class to assign to the pod. Supported values are: Guaranteed, Burstable
|
|
114
126
|
"""
|
|
115
127
|
|
|
116
128
|
name = "kubernetes"
|
|
@@ -128,6 +140,8 @@ class KubernetesDecorator(StepDecorator):
|
|
|
128
140
|
"gpu_vendor": None,
|
|
129
141
|
"tolerations": None, # e.g., [{"key": "arch", "operator": "Equal", "value": "amd"},
|
|
130
142
|
# {"key": "foo", "operator": "Equal", "value": "bar"}]
|
|
143
|
+
"labels": None, # e.g. {"test-label": "value", "another-label":"value2"}
|
|
144
|
+
"annotations": None, # e.g. {"note": "value", "another-note": "value2"}
|
|
131
145
|
"use_tmpfs": None,
|
|
132
146
|
"tmpfs_tempdir": True,
|
|
133
147
|
"tmpfs_size": None,
|
|
@@ -138,6 +152,7 @@ class KubernetesDecorator(StepDecorator):
|
|
|
138
152
|
"compute_pool": None,
|
|
139
153
|
"executable": None,
|
|
140
154
|
"hostname_resolution_timeout": 10 * 60,
|
|
155
|
+
"qos": KUBERNETES_QOS,
|
|
141
156
|
}
|
|
142
157
|
package_url = None
|
|
143
158
|
package_sha = None
|
|
@@ -147,8 +162,8 @@ class KubernetesDecorator(StepDecorator):
|
|
|
147
162
|
supports_conda_environment = True
|
|
148
163
|
target_platform = "linux-64"
|
|
149
164
|
|
|
150
|
-
def
|
|
151
|
-
super(KubernetesDecorator, self).
|
|
165
|
+
def init(self):
|
|
166
|
+
super(KubernetesDecorator, self).init()
|
|
152
167
|
|
|
153
168
|
if not self.attributes["namespace"]:
|
|
154
169
|
self.attributes["namespace"] = KUBERNETES_NAMESPACE
|
|
@@ -213,6 +228,36 @@ class KubernetesDecorator(StepDecorator):
|
|
|
213
228
|
self.attributes["memory"] = KUBERNETES_MEMORY
|
|
214
229
|
if self.attributes["disk"] == self.defaults["disk"] and KUBERNETES_DISK:
|
|
215
230
|
self.attributes["disk"] = KUBERNETES_DISK
|
|
231
|
+
# Label source precedence (decreasing):
|
|
232
|
+
# - System labels (set outside of decorator)
|
|
233
|
+
# - Decorator labels: @kubernetes(labels={})
|
|
234
|
+
# - Environment variable labels: METAFLOW_KUBERNETES_LABELS=
|
|
235
|
+
deco_labels = {}
|
|
236
|
+
if self.attributes["labels"] is not None:
|
|
237
|
+
deco_labels = self.attributes["labels"]
|
|
238
|
+
|
|
239
|
+
env_labels = {}
|
|
240
|
+
if KUBERNETES_LABELS:
|
|
241
|
+
env_labels = parse_kube_keyvalue_list(KUBERNETES_LABELS.split(","), False)
|
|
242
|
+
|
|
243
|
+
self.attributes["labels"] = {**env_labels, **deco_labels}
|
|
244
|
+
|
|
245
|
+
# Annotations
|
|
246
|
+
# annotation precedence (decreasing):
|
|
247
|
+
# - System annotations (set outside of decorator)
|
|
248
|
+
# - Decorator annotations: @kubernetes(annotations={})
|
|
249
|
+
# - Environment annotations: METAFLOW_KUBERNETES_ANNOTATIONS=
|
|
250
|
+
deco_annotations = {}
|
|
251
|
+
if self.attributes["annotations"] is not None:
|
|
252
|
+
deco_annotations = self.attributes["annotations"]
|
|
253
|
+
|
|
254
|
+
env_annotations = {}
|
|
255
|
+
if KUBERNETES_ANNOTATIONS:
|
|
256
|
+
env_annotations = parse_kube_keyvalue_list(
|
|
257
|
+
KUBERNETES_ANNOTATIONS.split(","), False
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
self.attributes["annotations"] = {**env_annotations, **deco_annotations}
|
|
216
261
|
|
|
217
262
|
# If no docker image is explicitly specified, impute a default image.
|
|
218
263
|
if not self.attributes["image"]:
|
|
@@ -261,6 +306,17 @@ class KubernetesDecorator(StepDecorator):
|
|
|
261
306
|
self.step = step
|
|
262
307
|
self.flow_datastore = flow_datastore
|
|
263
308
|
|
|
309
|
+
if (
|
|
310
|
+
self.attributes["qos"] is not None
|
|
311
|
+
# case insensitive matching.
|
|
312
|
+
and self.attributes["qos"].lower()
|
|
313
|
+
not in [c.lower() for c in SUPPORTED_KUBERNETES_QOS_CLASSES]
|
|
314
|
+
):
|
|
315
|
+
raise MetaflowException(
|
|
316
|
+
"*%s* is not a valid Kubernetes QoS class. Choose one of the following: %s"
|
|
317
|
+
% (self.attributes["qos"], ", ".join(SUPPORTED_KUBERNETES_QOS_CLASSES))
|
|
318
|
+
)
|
|
319
|
+
|
|
264
320
|
if any([deco.name == "batch" for deco in decos]):
|
|
265
321
|
raise MetaflowException(
|
|
266
322
|
"Step *{step}* is marked for execution both on AWS Batch and "
|
|
@@ -369,6 +425,9 @@ class KubernetesDecorator(StepDecorator):
|
|
|
369
425
|
)
|
|
370
426
|
)
|
|
371
427
|
|
|
428
|
+
validate_kube_labels(self.attributes["labels"])
|
|
429
|
+
# TODO: add validation to annotations as well?
|
|
430
|
+
|
|
372
431
|
def package_init(self, flow, step_name, environment):
|
|
373
432
|
try:
|
|
374
433
|
# Kubernetes is a soft dependency.
|
|
@@ -424,7 +483,12 @@ class KubernetesDecorator(StepDecorator):
|
|
|
424
483
|
"=".join([key, str(val)]) if val else key
|
|
425
484
|
for key, val in v.items()
|
|
426
485
|
]
|
|
427
|
-
elif k in [
|
|
486
|
+
elif k in [
|
|
487
|
+
"tolerations",
|
|
488
|
+
"persistent_volume_claims",
|
|
489
|
+
"labels",
|
|
490
|
+
"annotations",
|
|
491
|
+
]:
|
|
428
492
|
cli_args.command_options[k] = json.dumps(v)
|
|
429
493
|
else:
|
|
430
494
|
cli_args.command_options[k] = v
|
|
@@ -498,6 +562,13 @@ class KubernetesDecorator(StepDecorator):
|
|
|
498
562
|
self._save_logs_sidecar = Sidecar("save_logs_periodically")
|
|
499
563
|
self._save_logs_sidecar.start()
|
|
500
564
|
|
|
565
|
+
# Start spot termination monitor sidecar.
|
|
566
|
+
current._update_env(
|
|
567
|
+
{"spot_termination_notice": "/tmp/spot_termination_notice"}
|
|
568
|
+
)
|
|
569
|
+
self._spot_monitor_sidecar = Sidecar("spot_termination_monitor")
|
|
570
|
+
self._spot_monitor_sidecar.start()
|
|
571
|
+
|
|
501
572
|
num_parallel = None
|
|
502
573
|
if hasattr(flow, "_parallel_ubf_iter"):
|
|
503
574
|
num_parallel = flow._parallel_ubf_iter.num_parallel
|
|
@@ -556,6 +627,7 @@ class KubernetesDecorator(StepDecorator):
|
|
|
556
627
|
|
|
557
628
|
try:
|
|
558
629
|
self._save_logs_sidecar.terminate()
|
|
630
|
+
self._spot_monitor_sidecar.terminate()
|
|
559
631
|
except:
|
|
560
632
|
# Best effort kill
|
|
561
633
|
pass
|