ob-metaflow-extensions 1.1.130__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (105) hide show
  1. metaflow_extensions/outerbounds/__init__.py +1 -1
  2. metaflow_extensions/outerbounds/plugins/__init__.py +34 -4
  3. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  4. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  6. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  35. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  36. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  37. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  38. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  39. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  40. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +1 -1
  41. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  42. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  43. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  44. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  45. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  46. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +43 -9
  47. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +12 -0
  48. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
  49. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  50. metaflow_extensions/outerbounds/plugins/nim/card.py +2 -16
  51. metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
  52. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
  53. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  54. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
  55. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +100 -19
  56. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +6 -1
  57. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  58. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  59. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  60. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  61. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  62. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  63. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  64. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  65. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  66. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  67. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  68. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  69. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  70. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  71. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  72. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  73. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  74. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  75. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  76. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  77. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  78. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  79. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +38 -2
  80. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +81 -11
  81. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
  82. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
  83. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
  84. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
  85. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
  86. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  87. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  88. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  89. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  90. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  91. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  92. metaflow_extensions/outerbounds/remote_config.py +46 -9
  93. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +94 -2
  94. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  95. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  96. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  97. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  98. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  99. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  100. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  101. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  102. metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
  103. ob_metaflow_extensions-1.1.130.dist-info/RECORD +0 -56
  104. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  105. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,85 @@
1
+ from metaflow.user_decorators.user_flow_decorator import FlowMutator
2
+ from metaflow.user_decorators.mutable_flow import MutableFlow
3
+ from metaflow.user_decorators.mutable_step import MutableStep
4
+ import os
5
+
6
+
7
class _ExternalCheckpointFlowDeco(FlowMutator):
    """
    Base flow mutator for decorators that point checkpoints/models at an
    external S3-compatible object store.

    It validates the common keyword arguments (`bucket_path`, `secrets`) and
    offers `_swap_secrets`, which makes sure every step of the flow carries a
    `@secrets` decorator containing the configured secret sources.
    """

    def init(self, *args, **kwargs):
        """
        Record and validate the keyword arguments given to the decorator.

        Parameters
        ----------
        bucket_path : str
            Root location of the external store. Must start with `s3://`.
        secrets : list, default []
            Secret sources that should be attached to every step.

        Raises
        ------
        ValueError
            If `bucket_path` is missing or does not start with `s3://`, or if
            `secrets` is explicitly passed as None.
        """
        # Name the concrete decorator in error messages; this base class is
        # shared by several user-facing decorators.
        deco_name = type(self).__name__

        self.bucket_path = kwargs.get("bucket_path", None)

        self.secrets = kwargs.get("secrets", [])
        if self.bucket_path is None:
            raise ValueError(
                "`bucket_path` keyword argument is required for the %s" % deco_name
            )
        if not self.bucket_path.startswith("s3://"):
            raise ValueError(
                "`bucket_path` must start with `s3://` for the %s" % deco_name
            )
        if self.secrets is None:
            raise ValueError(
                "`secrets` keyword argument is required for the %s" % deco_name
            )

    def _swap_secrets(self, mutable_flow: MutableFlow) -> None:
        """
        Attach the configured secrets to every step of `mutable_flow`.

        For each step, any existing `@secrets` sources are merged with
        `self.secrets`. The `@huggingface_hub`, `@model` and `@checkpoint`
        decorators found on the step are removed and re-added after the
        merged `@secrets` decorator, keeping their original keyword arguments.
        (Presumably this ordering lets those decorators see the secrets --
        confirm against the decorator lifecycle.)
        """
        from metaflow import (
            checkpoint,
            model,
            huggingface_hub,
            secrets,
        )

        swapping_decos = {
            "huggingface_hub": huggingface_hub,
            "model": model,
            "checkpoint": checkpoint,
        }

        def _add_secrets(step: MutableStep) -> None:
            decos_to_add = []
            already_has_secrets = False
            secrets_present_in_deco = []
            for name, _, _, deco_kwargs in step.decorator_specs:
                if name in swapping_decos:
                    decos_to_add.append((name, deco_kwargs))
                elif name == "secrets":
                    already_has_secrets = True
                    secrets_present_in_deco.extend(deco_kwargs["sources"])

            # If the step already has secrets, merge its sources with the
            # additional secrets. Build a NEW list: extending `self.secrets`
            # in place would leak one step's secrets into every later step.
            # `dict.fromkeys` de-duplicates while keeping a stable order.
            secrets_to_add = list(
                dict.fromkeys([*self.secrets, *secrets_present_in_deco])
            )

            # Drop the existing `@secrets` so the merged one replaces it,
            # whether or not other decorators are being swapped.
            if already_has_secrets:
                step.remove_decorator("secrets")

            for name, _ in decos_to_add:
                step.remove_decorator(name)

            step.add_decorator(
                secrets,
                deco_kwargs=dict(
                    sources=secrets_to_add,
                ),
            )
            # Re-attach the swapped decorators with their original kwargs.
            for name, attrs in decos_to_add:
                step.add_decorator(swapping_decos[name], deco_kwargs=attrs)

        for step_name, step in mutable_flow.steps:
            _add_secrets(step)
@@ -0,0 +1,73 @@
1
+ from metaflow.user_decorators.mutable_flow import MutableFlow
2
+ from .external_chckpt import _ExternalCheckpointFlowDeco
3
+ import os
4
+
5
+ NEBIUS_ENDPOINT_URL = "https://storage.eu-north1.nebius.cloud:443"
6
+
7
+
8
class nebius_checkpoints(_ExternalCheckpointFlowDeco):

    """
    This decorator is used for setting the nebius's S3 compatible object store as the artifact store for
    checkpoints/models created by the flow.

    Parameters
    ----------
    secrets: list
        A list of secrets to be added to the step. These secrets should contain any secrets that are required globally and the secret
        for the nebius object store. The secret should contain the following keys:
        - NEBIUS_ACCESS_KEY
        - NEBIUS_SECRET_KEY

    bucket_path: str
        The path to the bucket to store the checkpoints/models. Required; must start with `s3://`.

    endpoint_url: str
        The endpoint url for the nebius object store. Defaults to `https://storage.eu-north1.nebius.cloud:443`

    Usage
    -----
    ```python
    from metaflow import checkpoint, current, step, FlowSpec, nebius_checkpoints

    @nebius_checkpoints(
        secrets=["my-nebius-secret"],
        bucket_path="s3://my-bucket/checkpoints",
    )
    class MyFlow(FlowSpec):
        @checkpoint
        @step
        def start(self):
            # Saves the checkpoint in the nebius object store
            current.checkpoint.save("./foo.txt")
            self.next(self.end)

        @step
        def end(self):
            pass
    ```
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def init(self, *args, **kwargs):
        """Validate the common arguments, then record the endpoint URL."""
        super().init(*args, **kwargs)
        self.nebius_endpoint_url = kwargs.get("endpoint_url", NEBIUS_ENDPOINT_URL)

    def pre_mutate(self, mutable_flow: MutableFlow) -> None:
        """
        Add `with_artifact_store` (type "s3") to the flow and attach the
        configured secrets to every step.
        """
        from metaflow import (
            with_artifact_store,
        )

        def _nebius_config():
            # Credentials are read from the environment each time this
            # callable is invoked, not when the decorator is applied.
            # NOTE(review): presumably invoked after `@secrets` has populated
            # the environment -- confirm.
            return {
                "root": self.bucket_path,
                "client_params": {
                    "aws_access_key_id": os.environ.get("NEBIUS_ACCESS_KEY"),
                    "aws_secret_access_key": os.environ.get("NEBIUS_SECRET_KEY"),
                    "endpoint_url": self.nebius_endpoint_url,
                },
            }

        mutable_flow.add_decorator(
            with_artifact_store, deco_kwargs=dict(type="s3", config=_nebius_config)
        )
        self._swap_secrets(mutable_flow)
@@ -0,0 +1,110 @@
1
+ import threading
2
+ import time
3
+ import sys
4
+ from typing import Dict, Optional, Any, Callable
5
+ from functools import partial
6
+ from metaflow.exception import MetaflowException
7
+ from metaflow.metaflow_config import FAST_BAKERY_URL
8
+
9
+ from .fast_bakery import FastBakery, FastBakeryApiResponse, FastBakeryException
10
+ from .docker_environment import cache_request
11
+
12
+ BAKERY_METAFILE = ".imagebakery-cache"
13
+
14
+
15
class BakerException(MetaflowException):
    """Error type used when an image bake fails; carries a fixed headline."""

    headline = "Ran into an error while baking image"

    def __init__(self, msg):
        # Zero-argument super() resolves to the same parent initializer.
        super().__init__(msg)
20
+
21
+
22
# Shared by every bake_image call so that progress output from concurrent
# bakes is not interleaved; a lock created per call cannot serialise across calls.
_BAKE_LOG_LOCK = threading.Lock()


def bake_image(
    cache_file_path: str,
    ref: Optional[str] = None,
    python: Optional[str] = None,
    pypi_packages: Optional[Dict[str, str]] = None,
    conda_packages: Optional[Dict[str, str]] = None,
    base_image: Optional[str] = None,
    logger: Optional[Callable[[str], Any]] = None,
) -> FastBakeryApiResponse:
    """
    Bakes a Docker image with the specified dependencies.

    The request goes through `cache_request(cache_file_path)`, so the bakery
    is only contacted when that decorator decides to invoke the wrapped call.

    Args:
        cache_file_path: Path to the cache file
        ref: Reference identifier for this bake (for logging purposes)
        python: Python version to use
        pypi_packages: Dictionary of PyPI packages and versions
        conda_packages: Dictionary of Conda packages and versions
        base_image: Base Docker image to use
        logger: Optional logger function to output progress; defaults to
            printing on stderr

    Returns:
        FastBakeryApiResponse: The response from the bakery service

    Raises:
        BakerException: If the baking process fails
    """
    # Default logger if none provided
    if logger is None:
        logger = partial(print, file=sys.stderr)

    @cache_request(cache_file_path)
    def _cached_bake(
        ref=None,
        python=None,
        pypi_packages=None,
        conda_packages=None,
        base_image=None,
    ):
        try:
            bakery = FastBakery(url=FAST_BAKERY_URL)
            bakery._reset_payload()
            bakery.python_version(python)
            bakery.pypi_packages(pypi_packages)
            bakery.conda_packages(conda_packages)
            bakery.base_image(base_image)

            # Emit the whole request summary under the lock so it stays
            # contiguous in the output.
            with _BAKE_LOG_LOCK:
                logger(f"🍳 Baking [{ref}] ...")
                logger(f" 🐍 Python: {python}")

                if pypi_packages:
                    logger(" 📦 PyPI packages:")
                    for package, version in pypi_packages.items():
                        logger(f" 🔧 {package}: {version}")

                if conda_packages:
                    logger(" 📦 Conda packages:")
                    for package, version in conda_packages.items():
                        logger(f" 🔧 {package}: {version}")

                logger(f" 🏗️ Base image: {base_image}")

            start_time = time.time()
            res = bakery.bake()
            # TODO: Get actual bake time from bakery
            bake_time = time.time() - start_time

            with _BAKE_LOG_LOCK:
                logger(f"🏁 Baked [{ref}] in {bake_time:.2f} seconds!")
            return res
        except FastBakeryException as ex:
            # Chain explicitly so the bakery error is kept as the cause.
            raise BakerException(f"Bake [{ref}] failed: {str(ex)}") from ex

    # Call the cached bake function with the provided parameters
    return _cached_bake(
        ref=ref,
        python=python,
        pypi_packages=pypi_packages,
        conda_packages=conda_packages,
        base_image=base_image,
    )
@@ -90,6 +90,7 @@ class DockerEnvironmentException(MetaflowException):
90
90
  class DockerEnvironment(MetaflowEnvironment):
91
91
  TYPE = "fast-bakery"
92
92
  _filecache = None
93
+ _force_rebuild = False
93
94
 
94
95
  def __init__(self, flow):
95
96
  self.skipped_steps = set()
@@ -178,12 +179,20 @@ class DockerEnvironment(MetaflowEnvironment):
178
179
 
179
180
  if self.skipped_steps:
180
181
  self.delegate = CondaEnvironment(self.flow)
182
+ self.delegate._force_rebuild = self._force_rebuild
181
183
  self.delegate.set_local_root(self.local_root)
182
184
  self.delegate.validate_environment(echo, self.datastore_type)
183
185
  self.delegate.init_environment(echo, self.skipped_steps)
184
186
 
185
187
  def _bake(self, steps) -> Dict[str, FastBakeryApiResponse]:
186
188
  metafile_path = get_fastbakery_metafile_path(self.local_root, self.flow.name)
189
+ if self._force_rebuild:
190
+ # clear the metafile if force rebuilding, effectively skipping the cache.
191
+ try:
192
+ os.remove(metafile_path)
193
+ except Exception:
194
+ pass
195
+
187
196
  logger_lock = threading.Lock()
188
197
 
189
198
  @cache_request(metafile_path)
@@ -201,7 +210,8 @@ class DockerEnvironment(MetaflowEnvironment):
201
210
  bakery.pypi_packages(pypi_packages)
202
211
  bakery.conda_packages(conda_packages)
203
212
  bakery.base_image(base_image)
204
- # bakery.ignore_cache()
213
+ if self._force_rebuild:
214
+ bakery.ignore_cache()
205
215
 
206
216
  with logger_lock:
207
217
  self.logger(f"🍳 Baking [{ref}] ...")
@@ -267,7 +277,7 @@ class DockerEnvironment(MetaflowEnvironment):
267
277
  packages = get_pinned_conda_libs(python, self.datastore_type)
268
278
  packages.update(dependencies.attributes["packages"] if dependencies else {})
269
279
 
270
- return {
280
+ requested = {
271
281
  "python": python,
272
282
  "pypi_packages": (
273
283
  packages if isinstance(dependencies, PyPIStepDecorator) else None
@@ -277,15 +287,35 @@ class DockerEnvironment(MetaflowEnvironment):
277
287
  ),
278
288
  "base_image": base_image,
279
289
  }
290
+ dedup_key = hashlib.sha256(
291
+ json.dumps(requested).encode("utf-8")
292
+ ).hexdigest()
293
+
294
+ return step.name, dedup_key, requested
280
295
 
281
296
  with ThreadPoolExecutor() as executor:
282
297
  prepared_args = list(executor.map(prepare_step, steps))
283
- for i, args in enumerate(prepared_args, 1):
284
- args["ref"] = f"#{i:02d}"
285
- futures = [executor.submit(_cached_bake, **args) for args in prepared_args]
298
+ # Deduplicate the requests for baking images of steps.
299
+ # We do not want to bake the same image twice.
300
+ dedup_requests = {}
301
+ for step_name, key, args in prepared_args:
302
+ if key not in dedup_requests:
303
+ dedup_requests[key] = {"step_names": set(), "args": args}
304
+ dedup_requests[key]["step_names"].add(step_name)
305
+
306
+ # unique futures
307
+ futures = []
308
+ for i, kv in enumerate(dedup_requests.items(), 1):
309
+ key, value = kv
310
+ future = executor.submit(
311
+ _cached_bake, **{**value["args"], "ref": f"#{i:02d}"}
312
+ )
313
+ futures.append({"step_names": value["step_names"], "future": future})
314
+
286
315
  results = {}
287
- for step, future in zip(steps, futures):
288
- results[step.name] = future.result()
316
+ for item in futures:
317
+ for step_name in item["step_names"]:
318
+ results[step_name] = item["future"].result()
289
319
 
290
320
  return results
291
321
 
@@ -321,12 +351,16 @@ class DockerEnvironment(MetaflowEnvironment):
321
351
  config.append("--disable=F0401")
322
352
  return config
323
353
 
324
- def get_package_commands(self, codepackage_url, datastore_type):
354
+ def get_package_commands(
355
+ self, codepackage_url, datastore_type, code_package_metadata=None
356
+ ):
325
357
  # we must set the skip install flag at this stage in order to skip package downloads,
326
358
  # doing so in bootstrap_commands is too late in the lifecycle.
327
359
  return [
328
360
  "export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
329
- ] + super().get_package_commands(codepackage_url, datastore_type)
361
+ ] + super().get_package_commands(
362
+ codepackage_url, datastore_type, code_package_metadata=code_package_metadata
363
+ )
330
364
 
331
365
  def bootstrap_commands(self, step_name, datastore_type):
332
366
  if step_name in self.skipped_steps:
@@ -81,6 +81,17 @@ class FastBakery:
81
81
 
82
82
  def _reset_payload(self):
83
83
  self._payload = {}
84
+ from metaflow_extensions.outerbounds.remote_config import init_config
85
+ from os import environ
86
+
87
+ conf = init_config()
88
+ if "OBP_PERIMETER" in conf:
89
+ perimeter = conf["OBP_PERIMETER"]
90
+ else:
91
+ # if the perimeter is not in metaflow config, try to get it from the environment
92
+ perimeter = environ.get("OBP_PERIMETER", "")
93
+
94
+ self._payload["perimeterName"] = perimeter
84
95
 
85
96
  def python_version(self, version: str):
86
97
  self._payload["pythonVersion"] = version
@@ -111,6 +122,7 @@ class FastBakery:
111
122
  "responseMaxAgeSeconds": 0,
112
123
  "layerMaxAgeSeconds": 0,
113
124
  "baseImageMaxAgeSeconds": 0,
125
+ "overwriteExistingLayers": True, # Used primarily to rewrite possibly corrupted layers.
114
126
  }
115
127
  return self
116
128
 
@@ -5,6 +5,7 @@ import time
5
5
 
6
6
  from metaflow.exception import MetaflowException
7
7
  from metaflow.metaflow_config import KUBERNETES_NAMESPACE
8
+ from .pod_killer import PodKiller
8
9
 
9
10
 
10
11
  CLIENT_REFRESH_INTERVAL_SECONDS = 300
@@ -105,50 +106,23 @@ class KubernetesClient(object):
105
106
  return list(results)
106
107
 
107
108
  def kill_pods(self, flow_name, run_id, user, echo):
108
- from kubernetes.stream import stream
109
-
110
- api_instance = self._client.CoreV1Api()
111
- job_api = self._client.BatchV1Api()
112
- pods = self._find_active_pods(flow_name, run_id, user)
113
-
114
- def _kill_pod(pod):
115
- echo("Killing Kubernetes pod %s\n" % pod.metadata.name)
116
- try:
117
- stream(
118
- api_instance.connect_get_namespaced_pod_exec,
119
- name=pod.metadata.name,
120
- namespace=pod.metadata.namespace,
121
- command=[
122
- "/bin/sh",
123
- "-c",
124
- "/sbin/killall5",
125
- ],
126
- stderr=True,
127
- stdin=False,
128
- stdout=True,
129
- tty=False,
130
- )
131
- except Exception:
132
- # best effort kill for pod can fail.
133
- try:
134
- job_name = pod.metadata.labels.get("job-name", None)
135
- if job_name is None:
136
- raise Exception("Could not determine job name")
137
-
138
- job_api.patch_namespaced_job(
139
- name=job_name,
140
- namespace=pod.metadata.namespace,
141
- field_manager="metaflow",
142
- body={"spec": {"parallelism": 0}},
143
- )
144
- except Exception as e:
145
- echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
146
-
147
- with ThreadPoolExecutor() as executor:
148
- operated_pods = list(executor.map(_kill_pod, pods))
149
-
150
- if not operated_pods:
151
- echo("No active Kubernetes pods found for run *%s*" % run_id)
109
+ # Create PodKiller instance
110
+ killer = PodKiller(self._client, echo, self._namespace)
111
+
112
+ # Process all matching jobs and jobsets based on their outcomes
113
+ (
114
+ job_jobset_results,
115
+ num_jobs,
116
+ num_jobsets,
117
+ ) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
118
+
119
+ if job_jobset_results:
120
+ successful_operations = sum(1 for result in job_jobset_results if result)
121
+ echo(
122
+ f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
123
+ )
124
+ else:
125
+ echo("No matching jobs or jobsets found for run *%s*" % run_id)
152
126
 
153
127
  def job(self, **kwargs):
154
128
  from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob