ob-metaflow 2.13.0.1__py2.py3-none-any.whl → 2.13.1.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow might be problematic. Click here for more details.
- metaflow/metadata_provider/heartbeat.py +23 -8
- metaflow/metaflow_config.py +2 -0
- metaflow/plugins/argo/argo_client.py +0 -2
- metaflow/plugins/argo/argo_workflows.py +86 -104
- metaflow/plugins/argo/argo_workflows_cli.py +0 -1
- metaflow/plugins/argo/argo_workflows_decorator.py +2 -4
- metaflow/plugins/argo/jobset_input_paths.py +0 -1
- metaflow/plugins/aws/aws_utils.py +6 -1
- metaflow/plugins/aws/batch/batch_client.py +1 -3
- metaflow/plugins/aws/batch/batch_decorator.py +11 -11
- metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
- metaflow/plugins/aws/step_functions/production_token.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions.py +1 -1
- metaflow/plugins/aws/step_functions/step_functions_cli.py +0 -1
- metaflow/plugins/aws/step_functions/step_functions_decorator.py +0 -1
- metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +0 -1
- metaflow/plugins/kubernetes/kube_utils.py +55 -1
- metaflow/plugins/kubernetes/kubernetes.py +33 -80
- metaflow/plugins/kubernetes/kubernetes_cli.py +22 -5
- metaflow/plugins/kubernetes/kubernetes_decorator.py +49 -2
- metaflow/plugins/kubernetes/kubernetes_job.py +3 -6
- metaflow/plugins/kubernetes/kubernetes_jobsets.py +22 -5
- metaflow/plugins/pypi/bootstrap.py +87 -54
- metaflow/plugins/pypi/conda_environment.py +7 -6
- metaflow/plugins/pypi/micromamba.py +35 -21
- metaflow/plugins/pypi/pip.py +2 -4
- metaflow/plugins/pypi/utils.py +4 -2
- metaflow/version.py +1 -1
- {ob_metaflow-2.13.0.1.dist-info → ob_metaflow-2.13.1.1.dist-info}/METADATA +2 -2
- {ob_metaflow-2.13.0.1.dist-info → ob_metaflow-2.13.1.1.dist-info}/RECORD +34 -34
- {ob_metaflow-2.13.0.1.dist-info → ob_metaflow-2.13.1.1.dist-info}/WHEEL +1 -1
- {ob_metaflow-2.13.0.1.dist-info → ob_metaflow-2.13.1.1.dist-info}/LICENSE +0 -0
- {ob_metaflow-2.13.0.1.dist-info → ob_metaflow-2.13.1.1.dist-info}/entry_points.txt +0 -0
- {ob_metaflow-2.13.0.1.dist-info → ob_metaflow-2.13.1.1.dist-info}/top_level.txt +0 -0
|
@@ -1,7 +1,14 @@
|
|
|
1
|
-
|
|
1
|
+
import re
|
|
2
|
+
from typing import Dict, List, Optional
|
|
3
|
+
from metaflow.exception import CommandException, MetaflowException
|
|
2
4
|
from metaflow.util import get_username, get_latest_run_id
|
|
3
5
|
|
|
4
6
|
|
|
7
|
+
# avoid circular import by having the exception class contained here
|
|
8
|
+
class KubernetesException(MetaflowException):
|
|
9
|
+
headline = "Kubernetes error"
|
|
10
|
+
|
|
11
|
+
|
|
5
12
|
def parse_cli_options(flow_name, run_id, user, my_runs, echo):
|
|
6
13
|
if user and my_runs:
|
|
7
14
|
raise CommandException("--user and --my-runs are mutually exclusive.")
|
|
@@ -52,3 +59,50 @@ def qos_requests_and_limits(qos: str, cpu: int, memory: int, storage: int):
|
|
|
52
59
|
# TODO: Add support for BestEffort once there is a use case for it.
|
|
53
60
|
# BestEffort - no limit or requests for cpu/memory
|
|
54
61
|
return qos_requests, qos_limits
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def validate_kube_labels(
|
|
65
|
+
labels: Optional[Dict[str, Optional[str]]],
|
|
66
|
+
) -> bool:
|
|
67
|
+
"""Validate label values.
|
|
68
|
+
|
|
69
|
+
This validates the kubernetes label values. It does not validate the keys.
|
|
70
|
+
Ideally, keys should be static and also the validation rules for keys are
|
|
71
|
+
more complex than those for values. For full validation rules, see:
|
|
72
|
+
|
|
73
|
+
https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
|
|
74
|
+
"""
|
|
75
|
+
|
|
76
|
+
def validate_label(s: Optional[str]):
|
|
77
|
+
regex_match = r"^(([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9])?$"
|
|
78
|
+
if not s:
|
|
79
|
+
# allow empty label
|
|
80
|
+
return True
|
|
81
|
+
if not re.search(regex_match, s):
|
|
82
|
+
raise KubernetesException(
|
|
83
|
+
'Invalid value: "%s"\n'
|
|
84
|
+
"A valid label must be an empty string or one that\n"
|
|
85
|
+
" - Consist of alphanumeric, '-', '_' or '.' characters\n"
|
|
86
|
+
" - Begins and ends with an alphanumeric character\n"
|
|
87
|
+
" - Is at most 63 characters" % s
|
|
88
|
+
)
|
|
89
|
+
return True
|
|
90
|
+
|
|
91
|
+
return all([validate_label(v) for v in labels.values()]) if labels else True
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
def parse_kube_keyvalue_list(items: List[str], requires_both: bool = True):
|
|
95
|
+
try:
|
|
96
|
+
ret = {}
|
|
97
|
+
for item_str in items:
|
|
98
|
+
item = item_str.split("=", 1)
|
|
99
|
+
if requires_both:
|
|
100
|
+
item[1] # raise IndexError
|
|
101
|
+
if str(item[0]) in ret:
|
|
102
|
+
raise KubernetesException("Duplicate key found: %s" % str(item[0]))
|
|
103
|
+
ret[str(item[0])] = str(item[1]) if len(item) > 1 else None
|
|
104
|
+
return ret
|
|
105
|
+
except KubernetesException as e:
|
|
106
|
+
raise e
|
|
107
|
+
except (AttributeError, IndexError):
|
|
108
|
+
raise KubernetesException("Unable to parse kubernetes list: %s" % items)
|
|
@@ -1,11 +1,8 @@
|
|
|
1
|
-
import copy
|
|
2
1
|
import json
|
|
3
2
|
import math
|
|
4
3
|
import os
|
|
5
|
-
import re
|
|
6
4
|
import shlex
|
|
7
5
|
import time
|
|
8
|
-
from typing import Dict, List, Optional
|
|
9
6
|
from uuid import uuid4
|
|
10
7
|
|
|
11
8
|
from metaflow import current, util
|
|
@@ -35,7 +32,6 @@ from metaflow.metaflow_config import (
|
|
|
35
32
|
DEFAULT_SECRETS_BACKEND_TYPE,
|
|
36
33
|
GCP_SECRET_MANAGER_PREFIX,
|
|
37
34
|
KUBERNETES_FETCH_EC2_METADATA,
|
|
38
|
-
KUBERNETES_LABELS,
|
|
39
35
|
KUBERNETES_SANDBOX_INIT_SCRIPT,
|
|
40
36
|
OTEL_ENDPOINT,
|
|
41
37
|
S3_ENDPOINT_URL,
|
|
@@ -193,6 +189,7 @@ class Kubernetes(object):
|
|
|
193
189
|
persistent_volume_claims=None,
|
|
194
190
|
tolerations=None,
|
|
195
191
|
labels=None,
|
|
192
|
+
annotations=None,
|
|
196
193
|
shared_memory=None,
|
|
197
194
|
port=None,
|
|
198
195
|
num_parallel=None,
|
|
@@ -304,10 +301,6 @@ class Kubernetes(object):
|
|
|
304
301
|
# see get_datastore_root_from_config in datastore/local.py).
|
|
305
302
|
)
|
|
306
303
|
|
|
307
|
-
_labels = self._get_labels(labels)
|
|
308
|
-
for k, v in _labels.items():
|
|
309
|
-
jobset.label(k, v)
|
|
310
|
-
|
|
311
304
|
for k in list(
|
|
312
305
|
[] if not secrets else [secrets] if isinstance(secrets, str) else secrets
|
|
313
306
|
) + KUBERNETES_SECRETS.split(","):
|
|
@@ -395,13 +388,16 @@ class Kubernetes(object):
|
|
|
395
388
|
for name, value in env.items():
|
|
396
389
|
jobset.environment_variable(name, value)
|
|
397
390
|
|
|
398
|
-
|
|
391
|
+
system_annotations = {
|
|
399
392
|
"metaflow/user": user,
|
|
400
393
|
"metaflow/flow_name": flow_name,
|
|
401
394
|
"metaflow/control-task-id": task_id,
|
|
395
|
+
"metaflow/run_id": run_id,
|
|
396
|
+
"metaflow/step_name": step_name,
|
|
397
|
+
"metaflow/attempt": attempt,
|
|
402
398
|
}
|
|
403
399
|
if current.get("project_name"):
|
|
404
|
-
|
|
400
|
+
system_annotations.update(
|
|
405
401
|
{
|
|
406
402
|
"metaflow/project_name": current.project_name,
|
|
407
403
|
"metaflow/branch_name": current.branch_name,
|
|
@@ -409,15 +405,15 @@ class Kubernetes(object):
|
|
|
409
405
|
}
|
|
410
406
|
)
|
|
411
407
|
|
|
412
|
-
|
|
413
|
-
|
|
408
|
+
system_labels = {
|
|
409
|
+
"app.kubernetes.io/name": "metaflow-task",
|
|
410
|
+
"app.kubernetes.io/part-of": "metaflow",
|
|
411
|
+
}
|
|
414
412
|
|
|
415
|
-
(
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
.label("app.kubernetes.io/name", "metaflow-task")
|
|
420
|
-
.label("app.kubernetes.io/part-of", "metaflow")
|
|
413
|
+
jobset.labels({**({} if not labels else labels), **system_labels})
|
|
414
|
+
|
|
415
|
+
jobset.annotations(
|
|
416
|
+
{**({} if not annotations else annotations), **system_annotations}
|
|
421
417
|
)
|
|
422
418
|
# We need this task-id set so that all the nodes are aware of the control
|
|
423
419
|
# task's task-id. These "MF_" variables populate the `current.parallel` namedtuple
|
|
@@ -507,6 +503,7 @@ class Kubernetes(object):
|
|
|
507
503
|
port=None,
|
|
508
504
|
name_pattern=None,
|
|
509
505
|
qos=None,
|
|
506
|
+
annotations=None,
|
|
510
507
|
):
|
|
511
508
|
if env is None:
|
|
512
509
|
env = {}
|
|
@@ -539,7 +536,8 @@ class Kubernetes(object):
|
|
|
539
536
|
retries=0,
|
|
540
537
|
step_name=step_name,
|
|
541
538
|
tolerations=tolerations,
|
|
542
|
-
labels=
|
|
539
|
+
labels=labels,
|
|
540
|
+
annotations=annotations,
|
|
543
541
|
use_tmpfs=use_tmpfs,
|
|
544
542
|
tmpfs_tempdir=tmpfs_tempdir,
|
|
545
543
|
tmpfs_size=tmpfs_size,
|
|
@@ -658,13 +656,25 @@ class Kubernetes(object):
|
|
|
658
656
|
|
|
659
657
|
for name, value in env.items():
|
|
660
658
|
job.environment_variable(name, value)
|
|
659
|
+
# Add job specific labels
|
|
660
|
+
system_labels = {
|
|
661
|
+
"app.kubernetes.io/name": "metaflow-task",
|
|
662
|
+
"app.kubernetes.io/part-of": "metaflow",
|
|
663
|
+
}
|
|
664
|
+
for name, value in system_labels.items():
|
|
665
|
+
job.label(name, value)
|
|
661
666
|
|
|
662
|
-
annotations
|
|
663
|
-
|
|
667
|
+
# Add job specific annotations not set in the decorator.
|
|
668
|
+
system_annotations = {
|
|
664
669
|
"metaflow/flow_name": flow_name,
|
|
670
|
+
"metaflow/run_id": run_id,
|
|
671
|
+
"metaflow/step_name": step_name,
|
|
672
|
+
"metaflow/task_id": task_id,
|
|
673
|
+
"metaflow/attempt": attempt,
|
|
674
|
+
"metaflow/user": user,
|
|
665
675
|
}
|
|
666
676
|
if current.get("project_name"):
|
|
667
|
-
|
|
677
|
+
system_annotations.update(
|
|
668
678
|
{
|
|
669
679
|
"metaflow/project_name": current.project_name,
|
|
670
680
|
"metaflow/branch_name": current.branch_name,
|
|
@@ -672,7 +682,7 @@ class Kubernetes(object):
|
|
|
672
682
|
}
|
|
673
683
|
)
|
|
674
684
|
|
|
675
|
-
for name, value in
|
|
685
|
+
for name, value in system_annotations.items():
|
|
676
686
|
job.annotation(name, value)
|
|
677
687
|
|
|
678
688
|
(
|
|
@@ -791,60 +801,3 @@ class Kubernetes(object):
|
|
|
791
801
|
"stderr",
|
|
792
802
|
job_id=self._job.id,
|
|
793
803
|
)
|
|
794
|
-
|
|
795
|
-
@staticmethod
|
|
796
|
-
def _get_labels(extra_labels=None):
|
|
797
|
-
if extra_labels is None:
|
|
798
|
-
extra_labels = {}
|
|
799
|
-
env_labels = KUBERNETES_LABELS.split(",") if KUBERNETES_LABELS else []
|
|
800
|
-
env_labels = parse_kube_keyvalue_list(env_labels, False)
|
|
801
|
-
labels = {**env_labels, **extra_labels}
|
|
802
|
-
validate_kube_labels(labels)
|
|
803
|
-
return labels
|
|
804
|
-
|
|
805
|
-
|
|
806
|
-
def validate_kube_labels(
|
|
807
|
-
labels: Optional[Dict[str, Optional[str]]],
|
|
808
|
-
) -> bool:
|
|
809
|
-
"""Validate label values.
|
|
810
|
-
|
|
811
|
-
This validates the kubernetes label values. It does not validate the keys.
|
|
812
|
-
Ideally, keys should be static and also the validation rules for keys are
|
|
813
|
-
more complex than those for values. For full validation rules, see:
|
|
814
|
-
|
|
815
|
-
https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
|
|
816
|
-
"""
|
|
817
|
-
|
|
818
|
-
def validate_label(s: Optional[str]):
|
|
819
|
-
regex_match = r"^(([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9])?$"
|
|
820
|
-
if not s:
|
|
821
|
-
# allow empty label
|
|
822
|
-
return True
|
|
823
|
-
if not re.search(regex_match, s):
|
|
824
|
-
raise KubernetesException(
|
|
825
|
-
'Invalid value: "%s"\n'
|
|
826
|
-
"A valid label must be an empty string or one that\n"
|
|
827
|
-
" - Consist of alphanumeric, '-', '_' or '.' characters\n"
|
|
828
|
-
" - Begins and ends with an alphanumeric character\n"
|
|
829
|
-
" - Is at most 63 characters" % s
|
|
830
|
-
)
|
|
831
|
-
return True
|
|
832
|
-
|
|
833
|
-
return all([validate_label(v) for v in labels.values()]) if labels else True
|
|
834
|
-
|
|
835
|
-
|
|
836
|
-
def parse_kube_keyvalue_list(items: List[str], requires_both: bool = True):
|
|
837
|
-
try:
|
|
838
|
-
ret = {}
|
|
839
|
-
for item_str in items:
|
|
840
|
-
item = item_str.split("=", 1)
|
|
841
|
-
if requires_both:
|
|
842
|
-
item[1] # raise IndexError
|
|
843
|
-
if str(item[0]) in ret:
|
|
844
|
-
raise KubernetesException("Duplicate key found: %s" % str(item[0]))
|
|
845
|
-
ret[str(item[0])] = str(item[1]) if len(item) > 1 else None
|
|
846
|
-
return ret
|
|
847
|
-
except KubernetesException as e:
|
|
848
|
-
raise e
|
|
849
|
-
except (AttributeError, IndexError):
|
|
850
|
-
raise KubernetesException("Unable to parse kubernetes list: %s" % items)
|
|
@@ -3,14 +3,17 @@ import sys
|
|
|
3
3
|
import time
|
|
4
4
|
import traceback
|
|
5
5
|
|
|
6
|
-
from metaflow.plugins.kubernetes.kube_utils import
|
|
6
|
+
from metaflow.plugins.kubernetes.kube_utils import (
|
|
7
|
+
parse_cli_options,
|
|
8
|
+
parse_kube_keyvalue_list,
|
|
9
|
+
)
|
|
7
10
|
from metaflow.plugins.kubernetes.kubernetes_client import KubernetesClient
|
|
8
11
|
import metaflow.tracing as tracing
|
|
9
12
|
from metaflow import JSONTypeClass, util
|
|
10
13
|
from metaflow._vendor import click
|
|
11
14
|
from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY, MetaflowException
|
|
12
15
|
from metaflow.metadata_provider.util import sync_local_metadata_from_datastore
|
|
13
|
-
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
|
|
16
|
+
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
|
|
14
17
|
from metaflow.mflog import TASK_LOG_SOURCE
|
|
15
18
|
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
|
16
19
|
|
|
@@ -18,9 +21,7 @@ from .kubernetes import (
|
|
|
18
21
|
Kubernetes,
|
|
19
22
|
KubernetesException,
|
|
20
23
|
KubernetesKilledException,
|
|
21
|
-
parse_kube_keyvalue_list,
|
|
22
24
|
)
|
|
23
|
-
from .kubernetes_decorator import KubernetesDecorator
|
|
24
25
|
|
|
25
26
|
|
|
26
27
|
@click.group()
|
|
@@ -132,6 +133,18 @@ def kubernetes():
|
|
|
132
133
|
type=str,
|
|
133
134
|
help="Quality of Service class for the Kubernetes pod",
|
|
134
135
|
)
|
|
136
|
+
@click.option(
|
|
137
|
+
"--labels",
|
|
138
|
+
default=None,
|
|
139
|
+
type=JSONTypeClass(),
|
|
140
|
+
multiple=False,
|
|
141
|
+
)
|
|
142
|
+
@click.option(
|
|
143
|
+
"--annotations",
|
|
144
|
+
default=None,
|
|
145
|
+
type=JSONTypeClass(),
|
|
146
|
+
multiple=False,
|
|
147
|
+
)
|
|
135
148
|
@click.pass_context
|
|
136
149
|
def step(
|
|
137
150
|
ctx,
|
|
@@ -161,6 +174,8 @@ def step(
|
|
|
161
174
|
port=None,
|
|
162
175
|
num_parallel=None,
|
|
163
176
|
qos=None,
|
|
177
|
+
labels=None,
|
|
178
|
+
annotations=None,
|
|
164
179
|
**kwargs
|
|
165
180
|
):
|
|
166
181
|
def echo(msg, stream="stderr", job_id=None, **kwargs):
|
|
@@ -302,8 +317,10 @@ def step(
|
|
|
302
317
|
port=port,
|
|
303
318
|
num_parallel=num_parallel,
|
|
304
319
|
qos=qos,
|
|
320
|
+
labels=labels,
|
|
321
|
+
annotations=annotations,
|
|
305
322
|
)
|
|
306
|
-
except Exception
|
|
323
|
+
except Exception:
|
|
307
324
|
traceback.print_exc(chain=False)
|
|
308
325
|
_sync_metadata()
|
|
309
326
|
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
|
|
@@ -19,6 +19,8 @@ from metaflow.metaflow_config import (
|
|
|
19
19
|
KUBERNETES_GPU_VENDOR,
|
|
20
20
|
KUBERNETES_IMAGE_PULL_POLICY,
|
|
21
21
|
KUBERNETES_MEMORY,
|
|
22
|
+
KUBERNETES_LABELS,
|
|
23
|
+
KUBERNETES_ANNOTATIONS,
|
|
22
24
|
KUBERNETES_NAMESPACE,
|
|
23
25
|
KUBERNETES_NODE_SELECTOR,
|
|
24
26
|
KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
|
|
@@ -34,7 +36,8 @@ from metaflow.sidecar import Sidecar
|
|
|
34
36
|
from metaflow.unbounded_foreach import UBF_CONTROL
|
|
35
37
|
|
|
36
38
|
from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
|
|
37
|
-
from .kubernetes import KubernetesException
|
|
39
|
+
from .kubernetes import KubernetesException
|
|
40
|
+
from .kube_utils import validate_kube_labels, parse_kube_keyvalue_list
|
|
38
41
|
|
|
39
42
|
from metaflow.metaflow_config import MAX_MEMORY_PER_TASK, MAX_CPU_PER_TASK
|
|
40
43
|
|
|
@@ -91,6 +94,10 @@ class KubernetesDecorator(StepDecorator):
|
|
|
91
94
|
tolerations : List[str], default []
|
|
92
95
|
The default is extracted from METAFLOW_KUBERNETES_TOLERATIONS.
|
|
93
96
|
Kubernetes tolerations to use when launching pod in Kubernetes.
|
|
97
|
+
labels: Dict[str, str], default: METAFLOW_KUBERNETES_LABELS
|
|
98
|
+
Kubernetes labels to use when launching pod in Kubernetes.
|
|
99
|
+
annotations: Dict[str, str], default: METAFLOW_KUBERNETES_ANNOTATIONS
|
|
100
|
+
Kubernetes annotations to use when launching pod in Kubernetes.
|
|
94
101
|
use_tmpfs : bool, default False
|
|
95
102
|
This enables an explicit tmpfs mount for this step.
|
|
96
103
|
tmpfs_tempdir : bool, default True
|
|
@@ -133,6 +140,8 @@ class KubernetesDecorator(StepDecorator):
|
|
|
133
140
|
"gpu_vendor": None,
|
|
134
141
|
"tolerations": None, # e.g., [{"key": "arch", "operator": "Equal", "value": "amd"},
|
|
135
142
|
# {"key": "foo", "operator": "Equal", "value": "bar"}]
|
|
143
|
+
"labels": None, # e.g. {"test-label": "value", "another-label":"value2"}
|
|
144
|
+
"annotations": None, # e.g. {"note": "value", "another-note": "value2"}
|
|
136
145
|
"use_tmpfs": None,
|
|
137
146
|
"tmpfs_tempdir": True,
|
|
138
147
|
"tmpfs_size": None,
|
|
@@ -219,6 +228,36 @@ class KubernetesDecorator(StepDecorator):
|
|
|
219
228
|
self.attributes["memory"] = KUBERNETES_MEMORY
|
|
220
229
|
if self.attributes["disk"] == self.defaults["disk"] and KUBERNETES_DISK:
|
|
221
230
|
self.attributes["disk"] = KUBERNETES_DISK
|
|
231
|
+
# Label source precedence (decreasing):
|
|
232
|
+
# - System labels (set outside of decorator)
|
|
233
|
+
# - Decorator labels: @kubernetes(labels={})
|
|
234
|
+
# - Environment variable labels: METAFLOW_KUBERNETES_LABELS=
|
|
235
|
+
deco_labels = {}
|
|
236
|
+
if self.attributes["labels"] is not None:
|
|
237
|
+
deco_labels = self.attributes["labels"]
|
|
238
|
+
|
|
239
|
+
env_labels = {}
|
|
240
|
+
if KUBERNETES_LABELS:
|
|
241
|
+
env_labels = parse_kube_keyvalue_list(KUBERNETES_LABELS.split(","), False)
|
|
242
|
+
|
|
243
|
+
self.attributes["labels"] = {**env_labels, **deco_labels}
|
|
244
|
+
|
|
245
|
+
# Annotations
|
|
246
|
+
# annotation precedence (decreasing):
|
|
247
|
+
# - System annotations (set outside of decorator)
|
|
248
|
+
# - Decorator annotations: @kubernetes(annotations={})
|
|
249
|
+
# - Environment annotations: METAFLOW_KUBERNETES_ANNOTATIONS=
|
|
250
|
+
deco_annotations = {}
|
|
251
|
+
if self.attributes["annotations"] is not None:
|
|
252
|
+
deco_annotations = self.attributes["annotations"]
|
|
253
|
+
|
|
254
|
+
env_annotations = {}
|
|
255
|
+
if KUBERNETES_ANNOTATIONS:
|
|
256
|
+
env_annotations = parse_kube_keyvalue_list(
|
|
257
|
+
KUBERNETES_ANNOTATIONS.split(","), False
|
|
258
|
+
)
|
|
259
|
+
|
|
260
|
+
self.attributes["annotations"] = {**env_annotations, **deco_annotations}
|
|
222
261
|
|
|
223
262
|
# If no docker image is explicitly specified, impute a default image.
|
|
224
263
|
if not self.attributes["image"]:
|
|
@@ -386,6 +425,9 @@ class KubernetesDecorator(StepDecorator):
|
|
|
386
425
|
)
|
|
387
426
|
)
|
|
388
427
|
|
|
428
|
+
validate_kube_labels(self.attributes["labels"])
|
|
429
|
+
# TODO: add validation to annotations as well?
|
|
430
|
+
|
|
389
431
|
def package_init(self, flow, step_name, environment):
|
|
390
432
|
try:
|
|
391
433
|
# Kubernetes is a soft dependency.
|
|
@@ -441,7 +483,12 @@ class KubernetesDecorator(StepDecorator):
|
|
|
441
483
|
"=".join([key, str(val)]) if val else key
|
|
442
484
|
for key, val in v.items()
|
|
443
485
|
]
|
|
444
|
-
elif k in [
|
|
486
|
+
elif k in [
|
|
487
|
+
"tolerations",
|
|
488
|
+
"persistent_volume_claims",
|
|
489
|
+
"labels",
|
|
490
|
+
"annotations",
|
|
491
|
+
]:
|
|
445
492
|
cli_args.command_options[k] = json.dumps(v)
|
|
446
493
|
else:
|
|
447
494
|
cli_args.command_options[k] = v
|
|
@@ -1,23 +1,20 @@
|
|
|
1
|
-
import copy
|
|
2
1
|
import json
|
|
3
2
|
import math
|
|
4
3
|
import random
|
|
5
|
-
import sys
|
|
6
4
|
import time
|
|
7
5
|
|
|
8
6
|
from metaflow.exception import MetaflowException
|
|
9
7
|
from metaflow.metaflow_config import KUBERNETES_SECRETS
|
|
10
8
|
from metaflow.tracing import inject_tracing_vars
|
|
11
|
-
from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
|
|
12
9
|
from metaflow.metaflow_config_funcs import init_config
|
|
13
10
|
|
|
14
11
|
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
|
12
|
+
|
|
13
|
+
from .kube_utils import qos_requests_and_limits
|
|
15
14
|
from .kubernetes_jobsets import (
|
|
16
15
|
KubernetesJobSet,
|
|
17
16
|
) # We need this import for Kubernetes Client.
|
|
18
17
|
|
|
19
|
-
from .kube_utils import qos_requests_and_limits
|
|
20
|
-
|
|
21
18
|
|
|
22
19
|
class KubernetesJobException(MetaflowException):
|
|
23
20
|
headline = "Kubernetes job error"
|
|
@@ -451,7 +448,7 @@ class RunningJob(object):
|
|
|
451
448
|
def best_effort_kill():
|
|
452
449
|
try:
|
|
453
450
|
self.kill()
|
|
454
|
-
except Exception
|
|
451
|
+
except Exception:
|
|
455
452
|
pass
|
|
456
453
|
|
|
457
454
|
atexit.register(best_effort_kill)
|
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import copy
|
|
2
1
|
import json
|
|
3
2
|
import math
|
|
4
3
|
import random
|
|
@@ -7,7 +6,6 @@ from collections import namedtuple
|
|
|
7
6
|
from metaflow.exception import MetaflowException
|
|
8
7
|
from metaflow.metaflow_config import KUBERNETES_JOBSET_GROUP, KUBERNETES_JOBSET_VERSION
|
|
9
8
|
from metaflow.tracing import inject_tracing_vars
|
|
10
|
-
from metaflow.metaflow_config import KUBERNETES_SECRETS
|
|
11
9
|
|
|
12
10
|
from .kube_utils import qos_requests_and_limits
|
|
13
11
|
|
|
@@ -257,7 +255,7 @@ class RunningJobSet(object):
|
|
|
257
255
|
def best_effort_kill():
|
|
258
256
|
try:
|
|
259
257
|
self.kill()
|
|
260
|
-
except Exception
|
|
258
|
+
except Exception:
|
|
261
259
|
pass
|
|
262
260
|
|
|
263
261
|
atexit.register(best_effort_kill)
|
|
@@ -342,7 +340,7 @@ class RunningJobSet(object):
|
|
|
342
340
|
stdout=True,
|
|
343
341
|
tty=False,
|
|
344
342
|
)
|
|
345
|
-
except Exception
|
|
343
|
+
except Exception:
|
|
346
344
|
with client.ApiClient() as api_client:
|
|
347
345
|
# If we are unable to kill the control pod then
|
|
348
346
|
# Delete the jobset to kill the subsequent pods.
|
|
@@ -862,6 +860,16 @@ class KubernetesJobSet(object):
|
|
|
862
860
|
self._annotations = dict(self._annotations, **{name: value})
|
|
863
861
|
return self
|
|
864
862
|
|
|
863
|
+
def labels(self, labels):
|
|
864
|
+
for k, v in labels.items():
|
|
865
|
+
self.label(k, v)
|
|
866
|
+
return self
|
|
867
|
+
|
|
868
|
+
def annotations(self, annotations):
|
|
869
|
+
for k, v in annotations.items():
|
|
870
|
+
self.annotation(k, v)
|
|
871
|
+
return self
|
|
872
|
+
|
|
865
873
|
def secret(self, name):
|
|
866
874
|
self.worker.secret(name)
|
|
867
875
|
self.control.secret(name)
|
|
@@ -987,15 +995,24 @@ class KubernetesArgoJobSet(object):
|
|
|
987
995
|
self._labels = dict(self._labels, **{name: value})
|
|
988
996
|
return self
|
|
989
997
|
|
|
998
|
+
def labels(self, labels):
|
|
999
|
+
for k, v in labels.items():
|
|
1000
|
+
self.label(k, v)
|
|
1001
|
+
return self
|
|
1002
|
+
|
|
990
1003
|
def annotation(self, name, value):
|
|
991
1004
|
self.worker.annotation(name, value)
|
|
992
1005
|
self.control.annotation(name, value)
|
|
993
1006
|
self._annotations = dict(self._annotations, **{name: value})
|
|
994
1007
|
return self
|
|
995
1008
|
|
|
1009
|
+
def annotations(self, annotations):
|
|
1010
|
+
for k, v in annotations.items():
|
|
1011
|
+
self.annotation(k, v)
|
|
1012
|
+
return self
|
|
1013
|
+
|
|
996
1014
|
def dump(self):
|
|
997
1015
|
client = self._kubernetes_sdk
|
|
998
|
-
import json
|
|
999
1016
|
|
|
1000
1017
|
data = json.dumps(
|
|
1001
1018
|
client.ApiClient().sanitize_for_serialization(
|