ob-metaflow 2.12.30.2__py2.py3-none-any.whl → 2.13.6.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow might be problematic. Click here for more details.

Files changed (96) hide show
  1. metaflow/__init__.py +3 -0
  2. metaflow/cards.py +1 -0
  3. metaflow/cli.py +185 -717
  4. metaflow/cli_args.py +17 -0
  5. metaflow/cli_components/__init__.py +0 -0
  6. metaflow/cli_components/dump_cmd.py +96 -0
  7. metaflow/cli_components/init_cmd.py +51 -0
  8. metaflow/cli_components/run_cmds.py +362 -0
  9. metaflow/cli_components/step_cmd.py +176 -0
  10. metaflow/cli_components/utils.py +140 -0
  11. metaflow/cmd/develop/stub_generator.py +9 -2
  12. metaflow/datastore/flow_datastore.py +2 -2
  13. metaflow/decorators.py +63 -2
  14. metaflow/exception.py +8 -2
  15. metaflow/extension_support/plugins.py +42 -27
  16. metaflow/flowspec.py +176 -23
  17. metaflow/graph.py +28 -27
  18. metaflow/includefile.py +50 -22
  19. metaflow/lint.py +35 -20
  20. metaflow/metadata_provider/heartbeat.py +23 -8
  21. metaflow/metaflow_config.py +10 -1
  22. metaflow/multicore_utils.py +31 -14
  23. metaflow/package.py +17 -3
  24. metaflow/parameters.py +97 -25
  25. metaflow/plugins/__init__.py +22 -0
  26. metaflow/plugins/airflow/airflow.py +18 -17
  27. metaflow/plugins/airflow/airflow_cli.py +1 -0
  28. metaflow/plugins/argo/argo_client.py +0 -2
  29. metaflow/plugins/argo/argo_workflows.py +195 -132
  30. metaflow/plugins/argo/argo_workflows_cli.py +1 -1
  31. metaflow/plugins/argo/argo_workflows_decorator.py +2 -4
  32. metaflow/plugins/argo/argo_workflows_deployer_objects.py +51 -9
  33. metaflow/plugins/argo/jobset_input_paths.py +0 -1
  34. metaflow/plugins/aws/aws_utils.py +6 -1
  35. metaflow/plugins/aws/batch/batch_client.py +1 -3
  36. metaflow/plugins/aws/batch/batch_decorator.py +13 -13
  37. metaflow/plugins/aws/secrets_manager/aws_secrets_manager_secrets_provider.py +13 -10
  38. metaflow/plugins/aws/step_functions/dynamo_db_client.py +0 -3
  39. metaflow/plugins/aws/step_functions/production_token.py +1 -1
  40. metaflow/plugins/aws/step_functions/step_functions.py +33 -1
  41. metaflow/plugins/aws/step_functions/step_functions_cli.py +1 -1
  42. metaflow/plugins/aws/step_functions/step_functions_decorator.py +0 -1
  43. metaflow/plugins/aws/step_functions/step_functions_deployer_objects.py +7 -9
  44. metaflow/plugins/cards/card_cli.py +7 -2
  45. metaflow/plugins/cards/card_creator.py +1 -0
  46. metaflow/plugins/cards/card_decorator.py +79 -8
  47. metaflow/plugins/cards/card_modules/basic.py +56 -5
  48. metaflow/plugins/cards/card_modules/card.py +16 -1
  49. metaflow/plugins/cards/card_modules/components.py +64 -16
  50. metaflow/plugins/cards/card_modules/main.js +27 -25
  51. metaflow/plugins/cards/card_modules/test_cards.py +4 -4
  52. metaflow/plugins/cards/component_serializer.py +1 -1
  53. metaflow/plugins/datatools/s3/s3.py +12 -4
  54. metaflow/plugins/datatools/s3/s3op.py +3 -3
  55. metaflow/plugins/events_decorator.py +338 -186
  56. metaflow/plugins/kubernetes/kube_utils.py +84 -1
  57. metaflow/plugins/kubernetes/kubernetes.py +40 -92
  58. metaflow/plugins/kubernetes/kubernetes_cli.py +32 -7
  59. metaflow/plugins/kubernetes/kubernetes_decorator.py +76 -4
  60. metaflow/plugins/kubernetes/kubernetes_job.py +23 -20
  61. metaflow/plugins/kubernetes/kubernetes_jobsets.py +41 -20
  62. metaflow/plugins/kubernetes/spot_metadata_cli.py +69 -0
  63. metaflow/plugins/kubernetes/spot_monitor_sidecar.py +109 -0
  64. metaflow/plugins/parallel_decorator.py +4 -1
  65. metaflow/plugins/project_decorator.py +33 -5
  66. metaflow/plugins/pypi/bootstrap.py +249 -81
  67. metaflow/plugins/pypi/conda_decorator.py +20 -10
  68. metaflow/plugins/pypi/conda_environment.py +83 -27
  69. metaflow/plugins/pypi/micromamba.py +82 -37
  70. metaflow/plugins/pypi/pip.py +9 -6
  71. metaflow/plugins/pypi/pypi_decorator.py +11 -9
  72. metaflow/plugins/pypi/utils.py +4 -2
  73. metaflow/plugins/timeout_decorator.py +2 -2
  74. metaflow/runner/click_api.py +240 -50
  75. metaflow/runner/deployer.py +1 -1
  76. metaflow/runner/deployer_impl.py +12 -11
  77. metaflow/runner/metaflow_runner.py +68 -34
  78. metaflow/runner/nbdeploy.py +2 -0
  79. metaflow/runner/nbrun.py +1 -1
  80. metaflow/runner/subprocess_manager.py +61 -10
  81. metaflow/runner/utils.py +208 -44
  82. metaflow/runtime.py +216 -112
  83. metaflow/sidecar/sidecar_worker.py +1 -1
  84. metaflow/tracing/tracing_modules.py +4 -1
  85. metaflow/user_configs/__init__.py +0 -0
  86. metaflow/user_configs/config_decorators.py +563 -0
  87. metaflow/user_configs/config_options.py +548 -0
  88. metaflow/user_configs/config_parameters.py +436 -0
  89. metaflow/util.py +22 -0
  90. metaflow/version.py +1 -1
  91. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/METADATA +12 -3
  92. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/RECORD +96 -84
  93. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/WHEEL +1 -1
  94. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/LICENSE +0 -0
  95. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/entry_points.txt +0 -0
  96. {ob_metaflow-2.12.30.2.dist-info → ob_metaflow-2.13.6.1.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,14 @@
1
- from metaflow.exception import CommandException
1
+ import re
2
+ from typing import Dict, List, Optional
3
+ from metaflow.exception import CommandException, MetaflowException
2
4
  from metaflow.util import get_username, get_latest_run_id
3
5
 
4
6
 
7
# avoid circular import by having the exception class contained here
class KubernetesException(MetaflowException):
    """Metaflow exception reported under the "Kubernetes error" headline.

    Raised by the label-validation and key/value-list parsing helpers in
    this module when their input is malformed.
    """

    headline = "Kubernetes error"
10
+
11
+
5
12
  def parse_cli_options(flow_name, run_id, user, my_runs, echo):
6
13
  if user and my_runs:
7
14
  raise CommandException("--user and --my-runs are mutually exclusive.")
@@ -23,3 +30,79 @@ def parse_cli_options(flow_name, run_id, user, my_runs, echo):
23
30
  raise CommandException("A previous run id was not found. Specify --run-id.")
24
31
 
25
32
  return flow_name, run_id, user
33
+
34
+
35
def qos_requests_and_limits(qos: str, cpu: int, memory: int, storage: int):
    """Return resource requests and limits for a Kubernetes pod for a QoS class.

    Parameters
    ----------
    qos : str
        Quality of Service class name, matched case-insensitively.
        "guaranteed" selects the Guaranteed class; any other value,
        including None or an empty string, is treated as Burstable.
    cpu : int
        CPU amount, rendered with ``str()``.
    memory : int
        Memory amount, rendered as ``"<memory>M"``.
    storage : int
        Ephemeral storage amount, rendered as ``"<storage>M"``.

    Returns
    -------
    tuple of (dict, dict)
        ``(requests, limits)``. For Guaranteed both dicts hold the same
        values but are independent objects; for Burstable ``limits`` is empty.
    """
    resources = {
        "cpu": str(cpu),
        "memory": "%sM" % str(memory),
        "ephemeral-storage": "%sM" % str(storage),
    }

    # Callers default `qos` to None; treat a missing class like Burstable
    # instead of failing on None.lower().
    if (qos or "").lower() == "guaranteed":
        # Guaranteed - has both cpu/memory limits. requests not required, as these will be inferred.
        # NOTE: Even though Kubernetes will produce matching requests for the specified limits,
        # this happens late in the lifecycle. We specify them explicitly here to make some K8S
        # tooling happy, in case they rely on .resources.requests being present at time of
        # submitting the job.
        # Separate copies are returned so mutating one dict cannot silently
        # change the other.
        return dict(resources), dict(resources)

    # Burstable - not Guaranteed, and has a memory/cpu limit or request
    # TODO: Add support for BestEffort once there is a use case for it.
    # BestEffort - no limit or requests for cpu/memory
    return resources, {}
62
+
63
+
64
def validate_kube_labels(
    labels: Optional[Dict[str, Optional[str]]],
) -> bool:
    """Validate label values.

    This validates the kubernetes label values. It does not validate the keys.
    Ideally, keys should be static and also the validation rules for keys are
    more complex than those for values. For full validation rules, see:

    https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set

    Parameters
    ----------
    labels : Dict[str, Optional[str]], optional
        Mapping of label keys to values. None or an empty mapping is valid,
        as are None or empty-string values.

    Returns
    -------
    bool
        True when every value is valid.

    Raises
    ------
    KubernetesException
        On the first value that is not a valid label value.
    """
    if not labels:
        return True

    # fullmatch() anchors both ends of the value. A trailing "$" with
    # re.search would also accept a value that ends in a newline.
    valid_value = re.compile(r"(([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9])?")

    for value in labels.values():
        if not value:
            # allow empty label
            continue
        if not valid_value.fullmatch(value):
            raise KubernetesException(
                'Invalid value: "%s"\n'
                "A valid label must be an empty string or one that\n"
                "  - Consist of alphanumeric, '-', '_' or '.' characters\n"
                "  - Begins and ends with an alphanumeric character\n"
                "  - Is at most 63 characters" % value
            )
    return True
92
+
93
+
94
def parse_kube_keyvalue_list(items: List[str], requires_both: bool = True):
    """Parse a list of ``key=value`` strings into a dict.

    Each entry is split on the first "=" only, so values may contain "=".
    An entry with no "=" maps its key to None when ``requires_both`` is
    False and is rejected when ``requires_both`` is True.

    Raises
    ------
    KubernetesException
        If a key appears more than once, if an entry lacks a value while
        ``requires_both`` is set, or if an entry is not string-like.
    """
    parsed = {}
    try:
        for entry in items:
            name, separator, value = entry.partition("=")
            if requires_both and not separator:
                # No "=" present, so there is no value part at all.
                raise IndexError(entry)
            name = str(name)
            if name in parsed:
                raise KubernetesException("Duplicate key found: %s" % name)
            parsed[name] = str(value) if separator else None
    except (AttributeError, IndexError):
        raise KubernetesException("Unable to parse kubernetes list: %s" % items)
    return parsed
@@ -1,11 +1,8 @@
1
- import copy
2
1
  import json
3
2
  import math
4
3
  import os
5
- import re
6
4
  import shlex
7
5
  import time
8
- from typing import Dict, List, Optional
9
6
  from uuid import uuid4
10
7
 
11
8
  from metaflow import current, util
@@ -35,7 +32,6 @@ from metaflow.metaflow_config import (
35
32
  DEFAULT_SECRETS_BACKEND_TYPE,
36
33
  GCP_SECRET_MANAGER_PREFIX,
37
34
  KUBERNETES_FETCH_EC2_METADATA,
38
- KUBERNETES_LABELS,
39
35
  KUBERNETES_SANDBOX_INIT_SCRIPT,
40
36
  OTEL_ENDPOINT,
41
37
  S3_ENDPOINT_URL,
@@ -193,9 +189,11 @@ class Kubernetes(object):
193
189
  persistent_volume_claims=None,
194
190
  tolerations=None,
195
191
  labels=None,
192
+ annotations=None,
196
193
  shared_memory=None,
197
194
  port=None,
198
195
  num_parallel=None,
196
+ qos=None,
199
197
  ):
200
198
  name = "js-%s" % str(uuid4())[:6]
201
199
  jobset = (
@@ -228,6 +226,7 @@ class Kubernetes(object):
228
226
  shared_memory=shared_memory,
229
227
  port=port,
230
228
  num_parallel=num_parallel,
229
+ qos=qos,
231
230
  )
232
231
  .environment_variable("METAFLOW_CODE_SHA", code_package_sha)
233
232
  .environment_variable("METAFLOW_CODE_URL", code_package_url)
@@ -302,17 +301,13 @@ class Kubernetes(object):
302
301
  # see get_datastore_root_from_config in datastore/local.py).
303
302
  )
304
303
 
305
- _labels = self._get_labels(labels)
306
- for k, v in _labels.items():
307
- jobset.label(k, v)
308
-
309
304
  for k in list(
310
305
  [] if not secrets else [secrets] if isinstance(secrets, str) else secrets
311
306
  ) + KUBERNETES_SECRETS.split(","):
312
307
  jobset.secret(k)
313
308
 
314
309
  initial_configs = init_config()
315
- for entry in ["OBP_PERIMETER", "OBP_INTEGRATIONS_SECRETS_METADATA_URL"]:
310
+ for entry in ["OBP_PERIMETER", "OBP_INTEGRATIONS_URL"]:
316
311
  if entry not in initial_configs:
317
312
  raise KubernetesException(
318
313
  f"{entry} was not found in metaflow config. Please make sure to run `outerbounds configure <...>` command which can be found on the Ourebounds UI or reach out to your Outerbounds support team."
@@ -320,8 +315,8 @@ class Kubernetes(object):
320
315
 
321
316
  additional_obp_configs = {
322
317
  "OBP_PERIMETER": initial_configs["OBP_PERIMETER"],
323
- "OBP_INTEGRATIONS_SECRETS_METADATA_URL": initial_configs[
324
- "OBP_INTEGRATIONS_SECRETS_METADATA_URL"
318
+ "OBP_INTEGRATIONS_URL": initial_configs[
319
+ "OBP_INTEGRATIONS_URL"
325
320
  ],
326
321
  }
327
322
  for k, v in additional_obp_configs.items():
@@ -393,13 +388,16 @@ class Kubernetes(object):
393
388
  for name, value in env.items():
394
389
  jobset.environment_variable(name, value)
395
390
 
396
- annotations = {
391
+ system_annotations = {
397
392
  "metaflow/user": user,
398
393
  "metaflow/flow_name": flow_name,
399
394
  "metaflow/control-task-id": task_id,
395
+ "metaflow/run_id": run_id,
396
+ "metaflow/step_name": step_name,
397
+ "metaflow/attempt": attempt,
400
398
  }
401
399
  if current.get("project_name"):
402
- annotations.update(
400
+ system_annotations.update(
403
401
  {
404
402
  "metaflow/project_name": current.project_name,
405
403
  "metaflow/branch_name": current.branch_name,
@@ -407,15 +405,15 @@ class Kubernetes(object):
407
405
  }
408
406
  )
409
407
 
410
- for name, value in annotations.items():
411
- jobset.annotation(name, value)
408
+ system_labels = {
409
+ "app.kubernetes.io/name": "metaflow-task",
410
+ "app.kubernetes.io/part-of": "metaflow",
411
+ }
412
+
413
+ jobset.labels({**({} if not labels else labels), **system_labels})
412
414
 
413
- (
414
- jobset.annotation("metaflow/run_id", run_id)
415
- .annotation("metaflow/step_name", step_name)
416
- .annotation("metaflow/attempt", attempt)
417
- .label("app.kubernetes.io/name", "metaflow-task")
418
- .label("app.kubernetes.io/part-of", "metaflow")
415
+ jobset.annotations(
416
+ {**({} if not annotations else annotations), **system_annotations}
419
417
  )
420
418
  # We need this task-id set so that all the nodes are aware of the control
421
419
  # task's task-id. These "MF_" variables populate the `current.parallel` namedtuple
@@ -504,6 +502,8 @@ class Kubernetes(object):
504
502
  shared_memory=None,
505
503
  port=None,
506
504
  name_pattern=None,
505
+ qos=None,
506
+ annotations=None,
507
507
  ):
508
508
  if env is None:
509
509
  env = {}
@@ -536,7 +536,8 @@ class Kubernetes(object):
536
536
  retries=0,
537
537
  step_name=step_name,
538
538
  tolerations=tolerations,
539
- labels=self._get_labels(labels),
539
+ labels=labels,
540
+ annotations=annotations,
540
541
  use_tmpfs=use_tmpfs,
541
542
  tmpfs_tempdir=tmpfs_tempdir,
542
543
  tmpfs_size=tmpfs_size,
@@ -544,6 +545,7 @@ class Kubernetes(object):
544
545
  persistent_volume_claims=persistent_volume_claims,
545
546
  shared_memory=shared_memory,
546
547
  port=port,
548
+ qos=qos,
547
549
  )
548
550
  .environment_variable("METAFLOW_CODE_SHA", code_package_sha)
549
551
  .environment_variable("METAFLOW_CODE_URL", code_package_url)
@@ -654,13 +656,25 @@ class Kubernetes(object):
654
656
 
655
657
  for name, value in env.items():
656
658
  job.environment_variable(name, value)
659
+ # Add job specific labels
660
+ system_labels = {
661
+ "app.kubernetes.io/name": "metaflow-task",
662
+ "app.kubernetes.io/part-of": "metaflow",
663
+ }
664
+ for name, value in system_labels.items():
665
+ job.label(name, value)
657
666
 
658
- annotations = {
659
- "metaflow/user": user,
667
+ # Add job specific annotations not set in the decorator.
668
+ system_annotations = {
660
669
  "metaflow/flow_name": flow_name,
670
+ "metaflow/run_id": run_id,
671
+ "metaflow/step_name": step_name,
672
+ "metaflow/task_id": task_id,
673
+ "metaflow/attempt": attempt,
674
+ "metaflow/user": user,
661
675
  }
662
676
  if current.get("project_name"):
663
- annotations.update(
677
+ system_annotations.update(
664
678
  {
665
679
  "metaflow/project_name": current.project_name,
666
680
  "metaflow/branch_name": current.branch_name,
@@ -668,18 +682,9 @@ class Kubernetes(object):
668
682
  }
669
683
  )
670
684
 
671
- for name, value in annotations.items():
685
+ for name, value in system_annotations.items():
672
686
  job.annotation(name, value)
673
687
 
674
- (
675
- job.annotation("metaflow/run_id", run_id)
676
- .annotation("metaflow/step_name", step_name)
677
- .annotation("metaflow/task_id", task_id)
678
- .annotation("metaflow/attempt", attempt)
679
- .label("app.kubernetes.io/name", "metaflow-task")
680
- .label("app.kubernetes.io/part-of", "metaflow")
681
- )
682
-
683
688
  return job
684
689
 
685
690
  def create_k8sjob(self, job):
@@ -787,60 +792,3 @@ class Kubernetes(object):
787
792
  "stderr",
788
793
  job_id=self._job.id,
789
794
  )
790
-
791
- @staticmethod
792
- def _get_labels(extra_labels=None):
793
- if extra_labels is None:
794
- extra_labels = {}
795
- env_labels = KUBERNETES_LABELS.split(",") if KUBERNETES_LABELS else []
796
- env_labels = parse_kube_keyvalue_list(env_labels, False)
797
- labels = {**env_labels, **extra_labels}
798
- validate_kube_labels(labels)
799
- return labels
800
-
801
-
802
- def validate_kube_labels(
803
- labels: Optional[Dict[str, Optional[str]]],
804
- ) -> bool:
805
- """Validate label values.
806
-
807
- This validates the kubernetes label values. It does not validate the keys.
808
- Ideally, keys should be static and also the validation rules for keys are
809
- more complex than those for values. For full validation rules, see:
810
-
811
- https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/#syntax-and-character-set
812
- """
813
-
814
- def validate_label(s: Optional[str]):
815
- regex_match = r"^(([A-Za-z0-9][-A-Za-z0-9_.]{0,61})?[A-Za-z0-9])?$"
816
- if not s:
817
- # allow empty label
818
- return True
819
- if not re.search(regex_match, s):
820
- raise KubernetesException(
821
- 'Invalid value: "%s"\n'
822
- "A valid label must be an empty string or one that\n"
823
- " - Consist of alphanumeric, '-', '_' or '.' characters\n"
824
- " - Begins and ends with an alphanumeric character\n"
825
- " - Is at most 63 characters" % s
826
- )
827
- return True
828
-
829
- return all([validate_label(v) for v in labels.values()]) if labels else True
830
-
831
-
832
- def parse_kube_keyvalue_list(items: List[str], requires_both: bool = True):
833
- try:
834
- ret = {}
835
- for item_str in items:
836
- item = item_str.split("=", 1)
837
- if requires_both:
838
- item[1] # raise IndexError
839
- if str(item[0]) in ret:
840
- raise KubernetesException("Duplicate key found: %s" % str(item[0]))
841
- ret[str(item[0])] = str(item[1]) if len(item) > 1 else None
842
- return ret
843
- except KubernetesException as e:
844
- raise e
845
- except (AttributeError, IndexError):
846
- raise KubernetesException("Unable to parse kubernetes list: %s" % items)
@@ -3,14 +3,17 @@ import sys
3
3
  import time
4
4
  import traceback
5
5
 
6
- from metaflow.plugins.kubernetes.kube_utils import parse_cli_options
6
+ from metaflow.plugins.kubernetes.kube_utils import (
7
+ parse_cli_options,
8
+ parse_kube_keyvalue_list,
9
+ )
7
10
  from metaflow.plugins.kubernetes.kubernetes_client import KubernetesClient
8
11
  import metaflow.tracing as tracing
9
12
  from metaflow import JSONTypeClass, util
10
13
  from metaflow._vendor import click
11
14
  from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY, MetaflowException
12
15
  from metaflow.metadata_provider.util import sync_local_metadata_from_datastore
13
- from metaflow.metaflow_config import DATASTORE_LOCAL_DIR, KUBERNETES_LABELS
16
+ from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
14
17
  from metaflow.mflog import TASK_LOG_SOURCE
15
18
  from metaflow.unbounded_foreach import UBF_CONTROL, UBF_TASK
16
19
 
@@ -18,9 +21,7 @@ from .kubernetes import (
18
21
  Kubernetes,
19
22
  KubernetesException,
20
23
  KubernetesKilledException,
21
- parse_kube_keyvalue_list,
22
24
  )
23
- from .kubernetes_decorator import KubernetesDecorator
24
25
 
25
26
 
26
27
  @click.group()
@@ -33,12 +34,12 @@ def kubernetes():
33
34
  pass
34
35
 
35
36
 
36
- @tracing.cli_entrypoint("kubernetes/step")
37
37
  @kubernetes.command(
38
38
  help="Execute a single task on Kubernetes. This command calls the top-level step "
39
39
  "command inside a Kubernetes pod with the given options. Typically you do not call "
40
40
  "this command directly; it is used internally by Metaflow."
41
41
  )
42
+ @tracing.cli_entrypoint("kubernetes/step")
42
43
  @click.argument("step-name")
43
44
  @click.argument("code-package-sha")
44
45
  @click.argument("code-package-url")
@@ -126,6 +127,24 @@ def kubernetes():
126
127
  type=int,
127
128
  help="Number of parallel nodes to run as a multi-node job.",
128
129
  )
130
+ @click.option(
131
+ "--qos",
132
+ default=None,
133
+ type=str,
134
+ help="Quality of Service class for the Kubernetes pod",
135
+ )
136
+ @click.option(
137
+ "--labels",
138
+ default=None,
139
+ type=JSONTypeClass(),
140
+ multiple=False,
141
+ )
142
+ @click.option(
143
+ "--annotations",
144
+ default=None,
145
+ type=JSONTypeClass(),
146
+ multiple=False,
147
+ )
129
148
  @click.pass_context
130
149
  def step(
131
150
  ctx,
@@ -154,6 +173,9 @@ def step(
154
173
  shared_memory=None,
155
174
  port=None,
156
175
  num_parallel=None,
176
+ qos=None,
177
+ labels=None,
178
+ annotations=None,
157
179
  **kwargs
158
180
  ):
159
181
  def echo(msg, stream="stderr", job_id=None, **kwargs):
@@ -168,7 +190,7 @@ def step(
168
190
  executable = ctx.obj.environment.executable(step_name, executable)
169
191
 
170
192
  # Set environment
171
- env = {}
193
+ env = {"METAFLOW_FLOW_FILENAME": os.path.basename(sys.argv[0])}
172
194
  env_deco = [deco for deco in node.decorators if deco.name == "environment"]
173
195
  if env_deco:
174
196
  env = env_deco[0].attributes["vars"]
@@ -294,8 +316,11 @@ def step(
294
316
  shared_memory=shared_memory,
295
317
  port=port,
296
318
  num_parallel=num_parallel,
319
+ qos=qos,
320
+ labels=labels,
321
+ annotations=annotations,
297
322
  )
298
- except Exception as e:
323
+ except Exception:
299
324
  traceback.print_exc(chain=False)
300
325
  _sync_metadata()
301
326
  sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
@@ -19,6 +19,8 @@ from metaflow.metaflow_config import (
19
19
  KUBERNETES_GPU_VENDOR,
20
20
  KUBERNETES_IMAGE_PULL_POLICY,
21
21
  KUBERNETES_MEMORY,
22
+ KUBERNETES_LABELS,
23
+ KUBERNETES_ANNOTATIONS,
22
24
  KUBERNETES_NAMESPACE,
23
25
  KUBERNETES_NODE_SELECTOR,
24
26
  KUBERNETES_PERSISTENT_VOLUME_CLAIMS,
@@ -26,6 +28,7 @@ from metaflow.metaflow_config import (
26
28
  KUBERNETES_SERVICE_ACCOUNT,
27
29
  KUBERNETES_SHARED_MEMORY,
28
30
  KUBERNETES_TOLERATIONS,
31
+ KUBERNETES_QOS,
29
32
  )
30
33
  from metaflow.plugins.resources_decorator import ResourcesDecorator
31
34
  from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
@@ -33,7 +36,8 @@ from metaflow.sidecar import Sidecar
33
36
  from metaflow.unbounded_foreach import UBF_CONTROL
34
37
 
35
38
  from ..aws.aws_utils import get_docker_registry, get_ec2_instance_metadata
36
- from .kubernetes import KubernetesException, parse_kube_keyvalue_list
39
+ from .kubernetes import KubernetesException
40
+ from .kube_utils import validate_kube_labels, parse_kube_keyvalue_list
37
41
 
38
42
  from metaflow.metaflow_config import MAX_MEMORY_PER_TASK, MAX_CPU_PER_TASK
39
43
 
@@ -43,6 +47,8 @@ except NameError:
43
47
  unicode = str
44
48
  basestring = str
45
49
 
50
+ SUPPORTED_KUBERNETES_QOS_CLASSES = ["Guaranteed", "Burstable"]
51
+
46
52
 
47
53
  class KubernetesDecorator(StepDecorator):
48
54
  """
@@ -88,6 +94,10 @@ class KubernetesDecorator(StepDecorator):
88
94
  tolerations : List[str], default []
89
95
  The default is extracted from METAFLOW_KUBERNETES_TOLERATIONS.
90
96
  Kubernetes tolerations to use when launching pod in Kubernetes.
97
+ labels: Dict[str, str], default: METAFLOW_KUBERNETES_LABELS
98
+ Kubernetes labels to use when launching pod in Kubernetes.
99
+ annotations: Dict[str, str], default: METAFLOW_KUBERNETES_ANNOTATIONS
100
+ Kubernetes annotations to use when launching pod in Kubernetes.
91
101
  use_tmpfs : bool, default False
92
102
  This enables an explicit tmpfs mount for this step.
93
103
  tmpfs_tempdir : bool, default True
@@ -111,6 +121,8 @@ class KubernetesDecorator(StepDecorator):
111
121
  hostname_resolution_timeout: int, default 10 * 60
112
122
  Timeout in seconds for the workers tasks in the gang scheduled cluster to resolve the hostname of control task.
113
123
  Only applicable when @parallel is used.
124
+ qos: str, default: Burstable
125
+ Quality of Service class to assign to the pod. Supported values are: Guaranteed, Burstable, BestEffort
114
126
  """
115
127
 
116
128
  name = "kubernetes"
@@ -128,6 +140,8 @@ class KubernetesDecorator(StepDecorator):
128
140
  "gpu_vendor": None,
129
141
  "tolerations": None, # e.g., [{"key": "arch", "operator": "Equal", "value": "amd"},
130
142
  # {"key": "foo", "operator": "Equal", "value": "bar"}]
143
+ "labels": None, # e.g. {"test-label": "value", "another-label":"value2"}
144
+ "annotations": None, # e.g. {"note": "value", "another-note": "value2"}
131
145
  "use_tmpfs": None,
132
146
  "tmpfs_tempdir": True,
133
147
  "tmpfs_size": None,
@@ -138,6 +152,7 @@ class KubernetesDecorator(StepDecorator):
138
152
  "compute_pool": None,
139
153
  "executable": None,
140
154
  "hostname_resolution_timeout": 10 * 60,
155
+ "qos": KUBERNETES_QOS,
141
156
  }
142
157
  package_url = None
143
158
  package_sha = None
@@ -147,8 +162,8 @@ class KubernetesDecorator(StepDecorator):
147
162
  supports_conda_environment = True
148
163
  target_platform = "linux-64"
149
164
 
150
- def __init__(self, attributes=None, statically_defined=False):
151
- super(KubernetesDecorator, self).__init__(attributes, statically_defined)
165
+ def init(self):
166
+ super(KubernetesDecorator, self).init()
152
167
 
153
168
  if not self.attributes["namespace"]:
154
169
  self.attributes["namespace"] = KUBERNETES_NAMESPACE
@@ -213,6 +228,36 @@ class KubernetesDecorator(StepDecorator):
213
228
  self.attributes["memory"] = KUBERNETES_MEMORY
214
229
  if self.attributes["disk"] == self.defaults["disk"] and KUBERNETES_DISK:
215
230
  self.attributes["disk"] = KUBERNETES_DISK
231
+ # Label source precedence (decreasing):
232
+ # - System labels (set outside of decorator)
233
+ # - Decorator labels: @kubernetes(labels={})
234
+ # - Environment variable labels: METAFLOW_KUBERNETES_LABELS=
235
+ deco_labels = {}
236
+ if self.attributes["labels"] is not None:
237
+ deco_labels = self.attributes["labels"]
238
+
239
+ env_labels = {}
240
+ if KUBERNETES_LABELS:
241
+ env_labels = parse_kube_keyvalue_list(KUBERNETES_LABELS.split(","), False)
242
+
243
+ self.attributes["labels"] = {**env_labels, **deco_labels}
244
+
245
+ # Annotations
246
+ # annotation precedence (decreasing):
247
+ # - System annotations (set outside of decorator)
248
+ # - Decorator annotations: @kubernetes(annotations={})
249
+ # - Environment annotations: METAFLOW_KUBERNETES_ANNOTATIONS=
250
+ deco_annotations = {}
251
+ if self.attributes["annotations"] is not None:
252
+ deco_annotations = self.attributes["annotations"]
253
+
254
+ env_annotations = {}
255
+ if KUBERNETES_ANNOTATIONS:
256
+ env_annotations = parse_kube_keyvalue_list(
257
+ KUBERNETES_ANNOTATIONS.split(","), False
258
+ )
259
+
260
+ self.attributes["annotations"] = {**env_annotations, **deco_annotations}
216
261
 
217
262
  # If no docker image is explicitly specified, impute a default image.
218
263
  if not self.attributes["image"]:
@@ -261,6 +306,17 @@ class KubernetesDecorator(StepDecorator):
261
306
  self.step = step
262
307
  self.flow_datastore = flow_datastore
263
308
 
309
+ if (
310
+ self.attributes["qos"] is not None
311
+ # case insensitive matching.
312
+ and self.attributes["qos"].lower()
313
+ not in [c.lower() for c in SUPPORTED_KUBERNETES_QOS_CLASSES]
314
+ ):
315
+ raise MetaflowException(
316
+ "*%s* is not a valid Kubernetes QoS class. Choose one of the following: %s"
317
+ % (self.attributes["qos"], ", ".join(SUPPORTED_KUBERNETES_QOS_CLASSES))
318
+ )
319
+
264
320
  if any([deco.name == "batch" for deco in decos]):
265
321
  raise MetaflowException(
266
322
  "Step *{step}* is marked for execution both on AWS Batch and "
@@ -369,6 +425,9 @@ class KubernetesDecorator(StepDecorator):
369
425
  )
370
426
  )
371
427
 
428
+ validate_kube_labels(self.attributes["labels"])
429
+ # TODO: add validation to annotations as well?
430
+
372
431
  def package_init(self, flow, step_name, environment):
373
432
  try:
374
433
  # Kubernetes is a soft dependency.
@@ -424,7 +483,12 @@ class KubernetesDecorator(StepDecorator):
424
483
  "=".join([key, str(val)]) if val else key
425
484
  for key, val in v.items()
426
485
  ]
427
- elif k in ["tolerations", "persistent_volume_claims"]:
486
+ elif k in [
487
+ "tolerations",
488
+ "persistent_volume_claims",
489
+ "labels",
490
+ "annotations",
491
+ ]:
428
492
  cli_args.command_options[k] = json.dumps(v)
429
493
  else:
430
494
  cli_args.command_options[k] = v
@@ -498,6 +562,13 @@ class KubernetesDecorator(StepDecorator):
498
562
  self._save_logs_sidecar = Sidecar("save_logs_periodically")
499
563
  self._save_logs_sidecar.start()
500
564
 
565
+ # Start spot termination monitor sidecar.
566
+ current._update_env(
567
+ {"spot_termination_notice": "/tmp/spot_termination_notice"}
568
+ )
569
+ self._spot_monitor_sidecar = Sidecar("spot_termination_monitor")
570
+ self._spot_monitor_sidecar.start()
571
+
501
572
  num_parallel = None
502
573
  if hasattr(flow, "_parallel_ubf_iter"):
503
574
  num_parallel = flow._parallel_ubf_iter.num_parallel
@@ -556,6 +627,7 @@ class KubernetesDecorator(StepDecorator):
556
627
 
557
628
  try:
558
629
  self._save_logs_sidecar.terminate()
630
+ self._spot_monitor_sidecar.terminate()
559
631
  except:
560
632
  # Best effort kill
561
633
  pass