ob-metaflow-extensions 1.1.151__py2.py3-none-any.whl → 1.6.2__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow_extensions/outerbounds/__init__.py +1 -1
- metaflow_extensions/outerbounds/plugins/__init__.py +24 -3
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +16 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +333 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +1029 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +1300 -0
- metaflow_extensions/outerbounds/plugins/apps/core/exceptions.py +341 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +123 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +9 -77
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +7 -78
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +119 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +17 -3
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
- metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +32 -8
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +1 -1
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +171 -16
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1710 -114
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +49 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +37 -7
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/remote_config.py +46 -9
- metaflow_extensions/outerbounds/toplevel/apps/__init__.py +9 -0
- metaflow_extensions/outerbounds/toplevel/apps/exceptions.py +11 -0
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +86 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.6.2.dist-info/RECORD +136 -0
- metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
- ob_metaflow_extensions-1.1.151.dist-info/RECORD +0 -74
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.6.2.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from metaflow.user_decorators.user_flow_decorator import FlowMutator
|
|
2
|
+
from metaflow.user_decorators.mutable_flow import MutableFlow
|
|
3
|
+
from metaflow.user_decorators.mutable_step import MutableStep
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class _ExternalCheckpointFlowDeco(FlowMutator):
    """
    Base flow mutator shared by the external checkpoint datastore decorators.

    It validates the `bucket_path` and `secrets` keyword arguments and offers
    `_swap_secrets`, which attaches the configured secret sources to every
    step of the flow.
    """

    def init(self, *args, **kwargs):
        """
        Read and validate the decorator's keyword arguments.

        Parameters
        ----------
        bucket_path : str
            Required; must start with `s3://`.
        secrets : list, default []
            Secret sources to attach to every step; must not be None.

        Raises
        ------
        ValueError
            If `bucket_path` is missing or does not start with `s3://`, or if
            `secrets` is explicitly None.
        """
        self.bucket_path = kwargs.get("bucket_path", None)
        self.secrets = kwargs.get("secrets", [])

        # This class is a shared base, so name the concrete decorator in the
        # error instead of hard-coding one particular datastore.
        deco_name = type(self).__name__
        if self.bucket_path is None:
            raise ValueError(
                "`bucket_path` keyword argument is required for %s" % deco_name
            )
        if not self.bucket_path.startswith("s3://"):
            raise ValueError(
                "`bucket_path` must start with `s3://` for %s" % deco_name
            )
        if self.secrets is None:
            raise ValueError(
                "`secrets` keyword argument is required for %s" % deco_name
            )

    def _swap_secrets(self, mutable_flow: MutableFlow) -> None:
        """
        Ensure every step carries a `secrets` decorator with `self.secrets`.

        For each step, sources of an existing `secrets` decorator are merged
        with `self.secrets` (duplicates dropped, first occurrence wins). Any
        `huggingface_hub`, `model` or `checkpoint` decorators are removed and
        re-added after the merged `secrets` decorator, with their original
        keyword arguments.
        """
        from metaflow import (
            checkpoint,
            model,
            huggingface_hub,
            secrets,
        )

        swapping_decos = {
            "huggingface_hub": huggingface_hub,
            "model": model,
            "checkpoint": checkpoint,
        }

        def _add_secrets(step: MutableStep) -> None:
            decos_to_readd = []
            existing_sources = []
            already_has_secrets = False
            for name, _, _, deco_kwargs in step.decorator_specs:
                if name in swapping_decos:
                    decos_to_readd.append((name, deco_kwargs))
                elif name == "secrets":
                    already_has_secrets = True
                    existing_sources.extend(
                        (deco_kwargs or {}).get("sources") or []
                    )

            # Build a fresh list for every step. Extending `self.secrets` in
            # place would leak one step's sources into all later steps and
            # into the list object the caller passed in.
            # `dict.fromkeys` de-duplicates while keeping a stable order.
            merged_sources = list(
                dict.fromkeys([*self.secrets, *existing_sources])
            )

            # NOTE(review): the swappable decorators are removed and re-added
            # presumably so that `secrets` ends up attached before them --
            # confirm against Metaflow's decorator ordering rules.
            for name, _ in decos_to_readd:
                step.remove_decorator(name)

            # Drop the old `secrets` decorator so the merged one replaces it
            # rather than competing with it as a duplicate.
            if already_has_secrets:
                step.remove_decorator("secrets")

            step.add_decorator(
                secrets,
                deco_kwargs=dict(
                    sources=merged_sources,
                ),
            )
            for name, attrs in decos_to_readd:
                step.add_decorator(swapping_decos[name], deco_kwargs=attrs)

        for _step_name, step in mutable_flow.steps:
            _add_secrets(step)
|
|
@@ -1,14 +1,11 @@
|
|
|
1
|
-
from metaflow.
|
|
2
|
-
|
|
3
|
-
MutableStep,
|
|
4
|
-
CustomFlowDecorator,
|
|
5
|
-
)
|
|
1
|
+
from metaflow.user_decorators.mutable_flow import MutableFlow
|
|
2
|
+
from .external_chckpt import _ExternalCheckpointFlowDeco
|
|
6
3
|
import os
|
|
7
4
|
|
|
8
5
|
NEBIUS_ENDPOINT_URL = "https://storage.eu-north1.nebius.cloud:443"
|
|
9
6
|
|
|
10
7
|
|
|
11
|
-
class nebius_checkpoints(
|
|
8
|
+
class nebius_checkpoints(_ExternalCheckpointFlowDeco):
|
|
12
9
|
|
|
13
10
|
"""
|
|
14
11
|
|
|
@@ -52,78 +49,14 @@ class nebius_checkpoints(CustomFlowDecorator):
|
|
|
52
49
|
super().__init__(*args, **kwargs)
|
|
53
50
|
|
|
54
51
|
def init(self, *args, **kwargs):
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
self.secrets = kwargs.get("secrets", [])
|
|
58
|
-
if self.bucket_path is None:
|
|
59
|
-
raise ValueError(
|
|
60
|
-
"`bucket_path` keyword argument is required for the coreweave_datastore"
|
|
61
|
-
)
|
|
62
|
-
if not self.bucket_path.startswith("s3://"):
|
|
63
|
-
raise ValueError(
|
|
64
|
-
"`bucket_path` must start with `s3://` for the coreweave_datastore"
|
|
65
|
-
)
|
|
66
|
-
|
|
52
|
+
super().init(*args, **kwargs)
|
|
67
53
|
self.nebius_endpoint_url = kwargs.get("endpoint_url", NEBIUS_ENDPOINT_URL)
|
|
68
|
-
if self.secrets is None:
|
|
69
|
-
raise ValueError(
|
|
70
|
-
"`secrets` keyword argument is required for the coreweave_datastore"
|
|
71
|
-
)
|
|
72
54
|
|
|
73
|
-
def
|
|
55
|
+
def pre_mutate(self, mutable_flow: MutableFlow) -> None:
|
|
74
56
|
from metaflow import (
|
|
75
|
-
checkpoint,
|
|
76
|
-
model,
|
|
77
|
-
huggingface_hub,
|
|
78
|
-
secrets,
|
|
79
57
|
with_artifact_store,
|
|
80
58
|
)
|
|
81
59
|
|
|
82
|
-
def _add_secrets(step: MutableStep) -> None:
|
|
83
|
-
decos_to_add = []
|
|
84
|
-
swapping_decos = {
|
|
85
|
-
"huggingface_hub": huggingface_hub,
|
|
86
|
-
"model": model,
|
|
87
|
-
"checkpoint": checkpoint,
|
|
88
|
-
}
|
|
89
|
-
already_has_secrets = False
|
|
90
|
-
secrets_present_in_deco = []
|
|
91
|
-
for d in step.decorators:
|
|
92
|
-
if d.name in swapping_decos:
|
|
93
|
-
decos_to_add.append((d.name, d.attributes))
|
|
94
|
-
elif d.name == "secrets":
|
|
95
|
-
already_has_secrets = True
|
|
96
|
-
secrets_present_in_deco.extend(d.attributes["sources"])
|
|
97
|
-
|
|
98
|
-
# If the step aleady has secrets then take all the sources in
|
|
99
|
-
# the secrets and add the addtional secrets to the existing secrets
|
|
100
|
-
secrets_to_add = self.secrets
|
|
101
|
-
if already_has_secrets:
|
|
102
|
-
secrets_to_add.extend(secrets_present_in_deco)
|
|
103
|
-
|
|
104
|
-
secrets_to_add = list(set(secrets_to_add))
|
|
105
|
-
|
|
106
|
-
if len(decos_to_add) == 0:
|
|
107
|
-
if already_has_secrets:
|
|
108
|
-
step.remove_decorator("secrets")
|
|
109
|
-
|
|
110
|
-
step.add_decorator(
|
|
111
|
-
secrets,
|
|
112
|
-
sources=secrets_to_add,
|
|
113
|
-
)
|
|
114
|
-
return
|
|
115
|
-
|
|
116
|
-
for d, _ in decos_to_add:
|
|
117
|
-
step.remove_decorator(d)
|
|
118
|
-
|
|
119
|
-
step.add_decorator(
|
|
120
|
-
secrets,
|
|
121
|
-
sources=secrets_to_add,
|
|
122
|
-
)
|
|
123
|
-
for d, attrs in decos_to_add:
|
|
124
|
-
_deco_to_add = swapping_decos[d]
|
|
125
|
-
step.add_decorator(_deco_to_add, **attrs)
|
|
126
|
-
|
|
127
60
|
def _nebius_config():
|
|
128
61
|
return {
|
|
129
62
|
"root": self.bucket_path,
|
|
@@ -135,10 +68,6 @@ class nebius_checkpoints(CustomFlowDecorator):
|
|
|
135
68
|
}
|
|
136
69
|
|
|
137
70
|
mutable_flow.add_decorator(
|
|
138
|
-
with_artifact_store,
|
|
139
|
-
type="s3",
|
|
140
|
-
config=_nebius_config,
|
|
71
|
+
with_artifact_store, deco_kwargs=dict(type="s3", config=_nebius_config)
|
|
141
72
|
)
|
|
142
|
-
|
|
143
|
-
for step_name, step in mutable_flow.steps:
|
|
144
|
-
_add_secrets(step)
|
|
73
|
+
self._swap_secrets(mutable_flow)
|
|
@@ -0,0 +1,119 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
import time
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Dict, Optional, Any, Callable
|
|
5
|
+
from functools import partial
|
|
6
|
+
from metaflow.exception import MetaflowException
|
|
7
|
+
from metaflow.metaflow_config import FAST_BAKERY_URL
|
|
8
|
+
|
|
9
|
+
from .fast_bakery import FastBakery, FastBakeryApiResponse, FastBakeryException
|
|
10
|
+
from .docker_environment import cache_request
|
|
11
|
+
|
|
12
|
+
BAKERY_METAFILE = ".imagebakery-cache"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BakerException(MetaflowException):
    """Error raised by `bake_image` when an image cannot be baked."""

    headline = "Ran into an error while baking image"

    def __init__(self, msg):
        # Delegate straight to MetaflowException with the message unchanged.
        super().__init__(msg)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def bake_image(
    cache_file_path: str,
    ref: Optional[str] = None,
    python: Optional[str] = None,
    pypi_packages: Optional[Dict[str, str]] = None,
    conda_packages: Optional[Dict[str, str]] = None,
    base_image: Optional[str] = None,
    logger: Optional[Callable[[str], Any]] = None,
    fast_bakery_url: Optional[str] = None,
) -> FastBakeryApiResponse:
    """
    Bakes a Docker image with the specified dependencies.

    Args:
        cache_file_path: Path to the cache file handed to `cache_request`
        ref: Reference identifier for this bake (for logging purposes)
        python: Python version to use
        pypi_packages: Dictionary of PyPI packages and versions
        conda_packages: Dictionary of Conda packages and versions
        base_image: Base Docker image to use
        logger: Optional callable taking one message string; defaults to
            printing on stderr
        fast_bakery_url: Optional bakery URL; falls back to FAST_BAKERY_URL

    Returns:
        FastBakeryApiResponse: The response from the bakery service

    Raises:
        BakerException: If no bakery endpoint is configured or the bake
            request fails (the FastBakeryException is chained as the cause)
    """
    if logger is None:
        logger = partial(print, file=sys.stderr)

    if fast_bakery_url is None and FAST_BAKERY_URL is None:
        raise BakerException(
            "Image Bakery endpoint missing. METAFLOW_FAST_BAKERY_URL environment/configuration variable not found."
        )

    fast_bakery_url = fast_bakery_url or FAST_BAKERY_URL

    # The lock is created per call, so it only keeps this call's log lines
    # grouped; it does not serialize output across concurrent bake_image calls.
    logger_lock = threading.Lock()

    @cache_request(cache_file_path)
    def _cached_bake(
        ref=None,
        python=None,
        pypi_packages=None,
        conda_packages=None,
        base_image=None,
    ):
        try:
            bakery = FastBakery(url=fast_bakery_url)
            bakery._reset_payload()
            bakery.python_version(python)
            bakery.pypi_packages(pypi_packages)
            bakery.conda_packages(conda_packages)
            bakery.base_image(base_image)

            with logger_lock:
                logger(f"🍳 Baking [{ref}] ...")
                logger(f" 🐍 Python: {python}")

                if pypi_packages:
                    logger(" 📦 PyPI packages:")
                    for package, version in pypi_packages.items():
                        logger(f" 🔧 {package}: {version}")

                if conda_packages:
                    logger(" 📦 Conda packages:")
                    for package, version in conda_packages.items():
                        logger(f" 🔧 {package}: {version}")

                logger(f" 🏗️ Base image: {base_image}")

            start_time = time.time()
            res = bakery.bake()
            # TODO: Get actual bake time from bakery
            bake_time = time.time() - start_time

            with logger_lock:
                logger(f"🏁 Baked [{ref}] in {bake_time:.2f} seconds!")
            return res
        except FastBakeryException as ex:
            # Chain the original error so its traceback is not lost.
            raise BakerException(f"Bake [{ref}] failed: {str(ex)}") from ex

    # Keyword arguments are passed through unchanged to the cached wrapper.
    return _cached_bake(
        ref=ref,
        python=python,
        pypi_packages=pypi_packages,
        conda_packages=conda_packages,
        base_image=base_image,
    )
|
|
@@ -90,6 +90,7 @@ class DockerEnvironmentException(MetaflowException):
|
|
|
90
90
|
class DockerEnvironment(MetaflowEnvironment):
|
|
91
91
|
TYPE = "fast-bakery"
|
|
92
92
|
_filecache = None
|
|
93
|
+
_force_rebuild = False
|
|
93
94
|
|
|
94
95
|
def __init__(self, flow):
|
|
95
96
|
self.skipped_steps = set()
|
|
@@ -178,12 +179,20 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
178
179
|
|
|
179
180
|
if self.skipped_steps:
|
|
180
181
|
self.delegate = CondaEnvironment(self.flow)
|
|
182
|
+
self.delegate._force_rebuild = self._force_rebuild
|
|
181
183
|
self.delegate.set_local_root(self.local_root)
|
|
182
184
|
self.delegate.validate_environment(echo, self.datastore_type)
|
|
183
185
|
self.delegate.init_environment(echo, self.skipped_steps)
|
|
184
186
|
|
|
185
187
|
def _bake(self, steps) -> Dict[str, FastBakeryApiResponse]:
|
|
186
188
|
metafile_path = get_fastbakery_metafile_path(self.local_root, self.flow.name)
|
|
189
|
+
if self._force_rebuild:
|
|
190
|
+
# clear the metafile if force rebuilding, effectively skipping the cache.
|
|
191
|
+
try:
|
|
192
|
+
os.remove(metafile_path)
|
|
193
|
+
except Exception:
|
|
194
|
+
pass
|
|
195
|
+
|
|
187
196
|
logger_lock = threading.Lock()
|
|
188
197
|
|
|
189
198
|
@cache_request(metafile_path)
|
|
@@ -201,7 +210,8 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
201
210
|
bakery.pypi_packages(pypi_packages)
|
|
202
211
|
bakery.conda_packages(conda_packages)
|
|
203
212
|
bakery.base_image(base_image)
|
|
204
|
-
|
|
213
|
+
if self._force_rebuild:
|
|
214
|
+
bakery.ignore_cache()
|
|
205
215
|
|
|
206
216
|
with logger_lock:
|
|
207
217
|
self.logger(f"🍳 Baking [{ref}] ...")
|
|
@@ -341,12 +351,16 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
341
351
|
config.append("--disable=F0401")
|
|
342
352
|
return config
|
|
343
353
|
|
|
344
|
-
def get_package_commands(
|
|
354
|
+
def get_package_commands(
|
|
355
|
+
self, codepackage_url, datastore_type, code_package_metadata=None
|
|
356
|
+
):
|
|
345
357
|
# we must set the skip install flag at this stage in order to skip package downloads,
|
|
346
358
|
# doing so in bootstrap_commands is too late in the lifecycle.
|
|
347
359
|
return [
|
|
348
360
|
"export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
|
|
349
|
-
] + super().get_package_commands(
|
|
361
|
+
] + super().get_package_commands(
|
|
362
|
+
codepackage_url, datastore_type, code_package_metadata=code_package_metadata
|
|
363
|
+
)
|
|
350
364
|
|
|
351
365
|
def bootstrap_commands(self, step_name, datastore_type):
|
|
352
366
|
if step_name in self.skipped_steps:
|
|
@@ -5,6 +5,7 @@ import time
|
|
|
5
5
|
|
|
6
6
|
from metaflow.exception import MetaflowException
|
|
7
7
|
from metaflow.metaflow_config import KUBERNETES_NAMESPACE
|
|
8
|
+
from .pod_killer import PodKiller
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
|
@@ -105,50 +106,23 @@ class KubernetesClient(object):
|
|
|
105
106
|
return list(results)
|
|
106
107
|
|
|
107
108
|
def kill_pods(self, flow_name, run_id, user, echo):
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
],
|
|
126
|
-
stderr=True,
|
|
127
|
-
stdin=False,
|
|
128
|
-
stdout=True,
|
|
129
|
-
tty=False,
|
|
130
|
-
)
|
|
131
|
-
except Exception:
|
|
132
|
-
# best effort kill for pod can fail.
|
|
133
|
-
try:
|
|
134
|
-
job_name = pod.metadata.labels.get("job-name", None)
|
|
135
|
-
if job_name is None:
|
|
136
|
-
raise Exception("Could not determine job name")
|
|
137
|
-
|
|
138
|
-
job_api.patch_namespaced_job(
|
|
139
|
-
name=job_name,
|
|
140
|
-
namespace=pod.metadata.namespace,
|
|
141
|
-
field_manager="metaflow",
|
|
142
|
-
body={"spec": {"parallelism": 0}},
|
|
143
|
-
)
|
|
144
|
-
except Exception as e:
|
|
145
|
-
echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
|
|
146
|
-
|
|
147
|
-
with ThreadPoolExecutor() as executor:
|
|
148
|
-
operated_pods = list(executor.map(_kill_pod, pods))
|
|
149
|
-
|
|
150
|
-
if not operated_pods:
|
|
151
|
-
echo("No active Kubernetes pods found for run *%s*" % run_id)
|
|
109
|
+
# Create PodKiller instance
|
|
110
|
+
killer = PodKiller(self._client, echo, self._namespace)
|
|
111
|
+
|
|
112
|
+
# Process all matching jobs and jobsets based on their outcomes
|
|
113
|
+
(
|
|
114
|
+
job_jobset_results,
|
|
115
|
+
num_jobs,
|
|
116
|
+
num_jobsets,
|
|
117
|
+
) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
|
|
118
|
+
|
|
119
|
+
if job_jobset_results:
|
|
120
|
+
successful_operations = sum(1 for result in job_jobset_results if result)
|
|
121
|
+
echo(
|
|
122
|
+
f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
|
|
123
|
+
)
|
|
124
|
+
else:
|
|
125
|
+
echo("No matching jobs or jobsets found for run *%s*" % run_id)
|
|
152
126
|
|
|
153
127
|
def job(self, **kwargs):
|
|
154
128
|
from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
|