ob-metaflow-extensions 1.1.130__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/__init__.py +1 -1
- metaflow_extensions/outerbounds/plugins/__init__.py +34 -4
- metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
- metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
- metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +1 -1
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +43 -9
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +12 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +2 -16
- metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +100 -19
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +6 -1
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +38 -2
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +81 -11
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/remote_config.py +46 -9
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +94 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
- metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
- ob_metaflow_extensions-1.1.130.dist-info/RECORD +0 -56
- {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,85 @@
|
|
|
1
|
+
from metaflow.user_decorators.user_flow_decorator import FlowMutator
|
|
2
|
+
from metaflow.user_decorators.mutable_flow import MutableFlow
|
|
3
|
+
from metaflow.user_decorators.mutable_step import MutableStep
|
|
4
|
+
import os
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class _ExternalCheckpointFlowDeco(FlowMutator):
    """Base flow mutator for decorators that store checkpoints/models in an
    external S3-compatible bucket.

    Subclasses are expected to register the artifact store themselves and then
    call `_swap_secrets` so every step carries the secrets needed to reach the
    bucket.
    """

    def init(self, *args, **kwargs):
        """Read and validate the decorator's keyword arguments.

        Parameters
        ----------
        bucket_path : str
            Destination bucket; must start with ``s3://``.
        secrets : list, optional
            Secret sources to attach to every step. Defaults to an empty list.

        Raises
        ------
        ValueError
            If `bucket_path` is missing or does not start with ``s3://``, or if
            `secrets` is explicitly passed as ``None``.
        """
        self.bucket_path = kwargs.get("bucket_path", None)
        self.secrets = kwargs.get("secrets", [])

        # This base class is shared by several datastore decorators, so name
        # the concrete decorator in the error instead of a hard-coded one.
        deco_name = type(self).__name__
        if self.bucket_path is None:
            raise ValueError(
                f"`bucket_path` keyword argument is required for `{deco_name}`"
            )
        if not self.bucket_path.startswith("s3://"):
            raise ValueError(
                f"`bucket_path` must start with `s3://` for `{deco_name}`"
            )
        if self.secrets is None:
            raise ValueError(
                f"`secrets` keyword argument is required for `{deco_name}`"
            )

    def _swap_secrets(self, mutable_flow: MutableFlow) -> None:
        """Attach `self.secrets` to every step of `mutable_flow`.

        For each step, any existing `secrets` decorator is replaced by one
        whose sources are `self.secrets` followed by the step's own sources
        (duplicates dropped, order preserved). `huggingface_hub`, `model` and
        `checkpoint` decorators found on the step are removed and re-added
        with their original keyword arguments after the `secrets` decorator.
        """
        from metaflow import (
            checkpoint,
            model,
            huggingface_hub,
            secrets,
        )

        swapping_decos = {
            "huggingface_hub": huggingface_hub,
            "model": model,
            "checkpoint": checkpoint,
        }

        def _add_secrets(step: MutableStep) -> None:
            decos_to_readd = []
            already_has_secrets = False
            step_sources = []
            for spec in step.decorator_specs:
                name, _, _, deco_kwargs = spec
                if name in swapping_decos:
                    decos_to_readd.append((name, deco_kwargs))
                elif name == "secrets":
                    already_has_secrets = True
                    # `sources` may be absent on the existing decorator.
                    step_sources.extend(deco_kwargs.get("sources") or [])

            # If the step already has secrets, merge its sources with the
            # additional secrets configured on this decorator. Build a fresh
            # list so `self.secrets` is never mutated (it is shared by every
            # step), and de-duplicate by equality so unhashable (dict-style)
            # sources are supported and the order stays deterministic.
            merged_sources = []
            for source in [*self.secrets, *step_sources]:
                if source not in merged_sources:
                    merged_sources.append(source)

            # Always drop the stale `secrets` decorator before adding the
            # merged one, regardless of which other decorators are present.
            # NOTE(review): keyword arguments of the existing `secrets`
            # decorator other than `sources` are not carried over — confirm
            # whether they should be.
            if already_has_secrets:
                step.remove_decorator("secrets")

            for name, _ in decos_to_readd:
                step.remove_decorator(name)

            step.add_decorator(secrets, deco_kwargs={"sources": merged_sources})

            # Re-add after `secrets`; presumably decorator order matters so the
            # secrets are available to these decorators — TODO confirm.
            for name, attrs in decos_to_readd:
                step.add_decorator(swapping_decos[name], deco_kwargs=attrs)

        for _, step in mutable_flow.steps:
            _add_secrets(step)
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
from metaflow.user_decorators.mutable_flow import MutableFlow
|
|
2
|
+
from .external_chckpt import _ExternalCheckpointFlowDeco
|
|
3
|
+
import os
|
|
4
|
+
|
|
5
|
+
NEBIUS_ENDPOINT_URL = "https://storage.eu-north1.nebius.cloud:443"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class nebius_checkpoints(_ExternalCheckpointFlowDeco):

    """
    Flow-level decorator that makes Nebius' S3-compatible object store the
    artifact store for checkpoints/models created by the flow.

    Parameters
    ----------
    secrets: list
        Secrets to attach to every step. They should include whatever secrets
        the flow needs globally plus the secret for the Nebius object store,
        which should provide the following keys:
        - NEBIUS_ACCESS_KEY
        - NEBIUS_SECRET_KEY

    bucket_path: str
        Path of the bucket where checkpoints/models are stored. Must start
        with `s3://`.

    endpoint_url: str
        Endpoint URL of the Nebius object store. Defaults to
        `https://storage.eu-north1.nebius.cloud:443`

    Usage
    -----
    ```python
    from metaflow import checkpoint, current, step, FlowSpec, nebius_checkpoints

    @nebius_checkpoints(secrets=["my-nebius-secret"], bucket_path="s3://my-bucket/checkpoints")
    class MyFlow(FlowSpec):
        @checkpoint
        @step
        def start(self):
            # Saves the checkpoint in the nebius object store
            current.checkpoint.save("./foo.txt")

        @step
        def end(self):
            pass
    ```
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    def init(self, *args, **kwargs):
        """Run the base validation, then record the endpoint URL to use."""
        super().init(*args, **kwargs)
        self.nebius_endpoint_url = kwargs.get("endpoint_url", NEBIUS_ENDPOINT_URL)

    def pre_mutate(self, mutable_flow: MutableFlow) -> None:
        """Register the Nebius-backed artifact store on the flow and attach
        the configured secrets to its steps."""
        from metaflow import with_artifact_store

        def _nebius_config():
            # Credentials are looked up when this callable runs, not when the
            # decorator is applied.
            client_params = {
                "aws_access_key_id": os.environ.get("NEBIUS_ACCESS_KEY"),
                "aws_secret_access_key": os.environ.get("NEBIUS_SECRET_KEY"),
                "endpoint_url": self.nebius_endpoint_url,
            }
            return {"root": self.bucket_path, "client_params": client_params}

        store_kwargs = {"type": "s3", "config": _nebius_config}
        mutable_flow.add_decorator(with_artifact_store, deco_kwargs=store_kwargs)
        self._swap_secrets(mutable_flow)
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
import threading
|
|
2
|
+
import time
|
|
3
|
+
import sys
|
|
4
|
+
from typing import Dict, Optional, Any, Callable
|
|
5
|
+
from functools import partial
|
|
6
|
+
from metaflow.exception import MetaflowException
|
|
7
|
+
from metaflow.metaflow_config import FAST_BAKERY_URL
|
|
8
|
+
|
|
9
|
+
from .fast_bakery import FastBakery, FastBakeryApiResponse, FastBakeryException
|
|
10
|
+
from .docker_environment import cache_request
|
|
11
|
+
|
|
12
|
+
BAKERY_METAFILE = ".imagebakery-cache"
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class BakerException(MetaflowException):
    """Error raised when baking an image fails."""

    headline = "Ran into an error while baking image"

    def __init__(self, msg):
        # Forward the message unchanged to the MetaflowException base.
        super().__init__(msg)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def bake_image(
    cache_file_path: str,
    ref: Optional[str] = None,
    python: Optional[str] = None,
    pypi_packages: Optional[Dict[str, str]] = None,
    conda_packages: Optional[Dict[str, str]] = None,
    base_image: Optional[str] = None,
    logger: Optional[Callable[[str], Any]] = None,
) -> FastBakeryApiResponse:
    """
    Bakes a Docker image with the specified dependencies.

    The bake request is wrapped with `cache_request(cache_file_path)`.

    Args:
        cache_file_path: Path to the cache file
        ref: Reference identifier for this bake (for logging purposes)
        python: Python version to use
        pypi_packages: Dictionary of PyPI packages and versions
        conda_packages: Dictionary of Conda packages and versions
        base_image: Base Docker image to use
        logger: Optional logger function to output progress; defaults to
            printing to stderr

    Returns:
        FastBakeryApiResponse: The response from the bakery service

    Raises:
        BakerException: If the baking process fails; the underlying
            FastBakeryException is attached as the cause
    """
    # Default logger if none provided
    if logger is None:
        logger = partial(print, file=sys.stderr)

    # Lock so multi-line log output from one bake is emitted as a unit
    logger_lock = threading.Lock()

    @cache_request(cache_file_path)
    def _cached_bake(
        ref=None,
        python=None,
        pypi_packages=None,
        conda_packages=None,
        base_image=None,
    ):
        try:
            bakery = FastBakery(url=FAST_BAKERY_URL)
            bakery._reset_payload()
            bakery.python_version(python)
            bakery.pypi_packages(pypi_packages)
            bakery.conda_packages(conda_packages)
            bakery.base_image(base_image)

            with logger_lock:
                logger(f"🍳 Baking [{ref}] ...")
                logger(f" 🐍 Python: {python}")

                if pypi_packages:
                    logger(" 📦 PyPI packages:")
                    for package, version in pypi_packages.items():
                        logger(f" 🔧 {package}: {version}")

                if conda_packages:
                    logger(" 📦 Conda packages:")
                    for package, version in conda_packages.items():
                        logger(f" 🔧 {package}: {version}")

                logger(f" 🏗️ Base image: {base_image}")

            # Monotonic clock: the elapsed time must not be affected by
            # system clock adjustments.
            start_time = time.monotonic()
            res = bakery.bake()
            # TODO: Get actual bake time from bakery
            bake_time = time.monotonic() - start_time

            with logger_lock:
                logger(f"🏁 Baked [{ref}] in {bake_time:.2f} seconds!")
            return res
        except FastBakeryException as ex:
            # Chain the original error so its traceback is preserved.
            raise BakerException(f"Bake [{ref}] failed: {str(ex)}") from ex

    # Call the cached bake function with the provided parameters
    return _cached_bake(
        ref=ref,
        python=python,
        pypi_packages=pypi_packages,
        conda_packages=conda_packages,
        base_image=base_image,
    )
|
|
@@ -90,6 +90,7 @@ class DockerEnvironmentException(MetaflowException):
|
|
|
90
90
|
class DockerEnvironment(MetaflowEnvironment):
|
|
91
91
|
TYPE = "fast-bakery"
|
|
92
92
|
_filecache = None
|
|
93
|
+
_force_rebuild = False
|
|
93
94
|
|
|
94
95
|
def __init__(self, flow):
|
|
95
96
|
self.skipped_steps = set()
|
|
@@ -178,12 +179,20 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
178
179
|
|
|
179
180
|
if self.skipped_steps:
|
|
180
181
|
self.delegate = CondaEnvironment(self.flow)
|
|
182
|
+
self.delegate._force_rebuild = self._force_rebuild
|
|
181
183
|
self.delegate.set_local_root(self.local_root)
|
|
182
184
|
self.delegate.validate_environment(echo, self.datastore_type)
|
|
183
185
|
self.delegate.init_environment(echo, self.skipped_steps)
|
|
184
186
|
|
|
185
187
|
def _bake(self, steps) -> Dict[str, FastBakeryApiResponse]:
|
|
186
188
|
metafile_path = get_fastbakery_metafile_path(self.local_root, self.flow.name)
|
|
189
|
+
if self._force_rebuild:
|
|
190
|
+
# clear the metafile if force rebuilding, effectively skipping the cache.
|
|
191
|
+
try:
|
|
192
|
+
os.remove(metafile_path)
|
|
193
|
+
except Exception:
|
|
194
|
+
pass
|
|
195
|
+
|
|
187
196
|
logger_lock = threading.Lock()
|
|
188
197
|
|
|
189
198
|
@cache_request(metafile_path)
|
|
@@ -201,7 +210,8 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
201
210
|
bakery.pypi_packages(pypi_packages)
|
|
202
211
|
bakery.conda_packages(conda_packages)
|
|
203
212
|
bakery.base_image(base_image)
|
|
204
|
-
|
|
213
|
+
if self._force_rebuild:
|
|
214
|
+
bakery.ignore_cache()
|
|
205
215
|
|
|
206
216
|
with logger_lock:
|
|
207
217
|
self.logger(f"🍳 Baking [{ref}] ...")
|
|
@@ -267,7 +277,7 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
267
277
|
packages = get_pinned_conda_libs(python, self.datastore_type)
|
|
268
278
|
packages.update(dependencies.attributes["packages"] if dependencies else {})
|
|
269
279
|
|
|
270
|
-
|
|
280
|
+
requested = {
|
|
271
281
|
"python": python,
|
|
272
282
|
"pypi_packages": (
|
|
273
283
|
packages if isinstance(dependencies, PyPIStepDecorator) else None
|
|
@@ -277,15 +287,35 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
277
287
|
),
|
|
278
288
|
"base_image": base_image,
|
|
279
289
|
}
|
|
290
|
+
dedup_key = hashlib.sha256(
|
|
291
|
+
json.dumps(requested).encode("utf-8")
|
|
292
|
+
).hexdigest()
|
|
293
|
+
|
|
294
|
+
return step.name, dedup_key, requested
|
|
280
295
|
|
|
281
296
|
with ThreadPoolExecutor() as executor:
|
|
282
297
|
prepared_args = list(executor.map(prepare_step, steps))
|
|
283
|
-
for
|
|
284
|
-
|
|
285
|
-
|
|
298
|
+
# Deduplicate the requests for baking images of steps.
|
|
299
|
+
# We do not want to bake the same image twice.
|
|
300
|
+
dedup_requests = {}
|
|
301
|
+
for step_name, key, args in prepared_args:
|
|
302
|
+
if key not in dedup_requests:
|
|
303
|
+
dedup_requests[key] = {"step_names": set(), "args": args}
|
|
304
|
+
dedup_requests[key]["step_names"].add(step_name)
|
|
305
|
+
|
|
306
|
+
# unique futures
|
|
307
|
+
futures = []
|
|
308
|
+
for i, kv in enumerate(dedup_requests.items(), 1):
|
|
309
|
+
key, value = kv
|
|
310
|
+
future = executor.submit(
|
|
311
|
+
_cached_bake, **{**value["args"], "ref": f"#{i:02d}"}
|
|
312
|
+
)
|
|
313
|
+
futures.append({"step_names": value["step_names"], "future": future})
|
|
314
|
+
|
|
286
315
|
results = {}
|
|
287
|
-
for
|
|
288
|
-
|
|
316
|
+
for item in futures:
|
|
317
|
+
for step_name in item["step_names"]:
|
|
318
|
+
results[step_name] = item["future"].result()
|
|
289
319
|
|
|
290
320
|
return results
|
|
291
321
|
|
|
@@ -321,12 +351,16 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
321
351
|
config.append("--disable=F0401")
|
|
322
352
|
return config
|
|
323
353
|
|
|
324
|
-
def get_package_commands(
|
|
354
|
+
def get_package_commands(
|
|
355
|
+
self, codepackage_url, datastore_type, code_package_metadata=None
|
|
356
|
+
):
|
|
325
357
|
# we must set the skip install flag at this stage in order to skip package downloads,
|
|
326
358
|
# doing so in bootstrap_commands is too late in the lifecycle.
|
|
327
359
|
return [
|
|
328
360
|
"export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
|
|
329
|
-
] + super().get_package_commands(
|
|
361
|
+
] + super().get_package_commands(
|
|
362
|
+
codepackage_url, datastore_type, code_package_metadata=code_package_metadata
|
|
363
|
+
)
|
|
330
364
|
|
|
331
365
|
def bootstrap_commands(self, step_name, datastore_type):
|
|
332
366
|
if step_name in self.skipped_steps:
|
|
@@ -81,6 +81,17 @@ class FastBakery:
|
|
|
81
81
|
|
|
82
82
|
def _reset_payload(self):
|
|
83
83
|
self._payload = {}
|
|
84
|
+
from metaflow_extensions.outerbounds.remote_config import init_config
|
|
85
|
+
from os import environ
|
|
86
|
+
|
|
87
|
+
conf = init_config()
|
|
88
|
+
if "OBP_PERIMETER" in conf:
|
|
89
|
+
perimeter = conf["OBP_PERIMETER"]
|
|
90
|
+
else:
|
|
91
|
+
# if the perimeter is not in metaflow config, try to get it from the environment
|
|
92
|
+
perimeter = environ.get("OBP_PERIMETER", "")
|
|
93
|
+
|
|
94
|
+
self._payload["perimeterName"] = perimeter
|
|
84
95
|
|
|
85
96
|
def python_version(self, version: str):
|
|
86
97
|
self._payload["pythonVersion"] = version
|
|
@@ -111,6 +122,7 @@ class FastBakery:
|
|
|
111
122
|
"responseMaxAgeSeconds": 0,
|
|
112
123
|
"layerMaxAgeSeconds": 0,
|
|
113
124
|
"baseImageMaxAgeSeconds": 0,
|
|
125
|
+
"overwriteExistingLayers": True, # Used primarily to rewrite possibly corrupted layers.
|
|
114
126
|
}
|
|
115
127
|
return self
|
|
116
128
|
|
|
@@ -5,6 +5,7 @@ import time
|
|
|
5
5
|
|
|
6
6
|
from metaflow.exception import MetaflowException
|
|
7
7
|
from metaflow.metaflow_config import KUBERNETES_NAMESPACE
|
|
8
|
+
from .pod_killer import PodKiller
|
|
8
9
|
|
|
9
10
|
|
|
10
11
|
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
|
@@ -105,50 +106,23 @@ class KubernetesClient(object):
|
|
|
105
106
|
return list(results)
|
|
106
107
|
|
|
107
108
|
def kill_pods(self, flow_name, run_id, user, echo):
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
],
|
|
126
|
-
stderr=True,
|
|
127
|
-
stdin=False,
|
|
128
|
-
stdout=True,
|
|
129
|
-
tty=False,
|
|
130
|
-
)
|
|
131
|
-
except Exception:
|
|
132
|
-
# best effort kill for pod can fail.
|
|
133
|
-
try:
|
|
134
|
-
job_name = pod.metadata.labels.get("job-name", None)
|
|
135
|
-
if job_name is None:
|
|
136
|
-
raise Exception("Could not determine job name")
|
|
137
|
-
|
|
138
|
-
job_api.patch_namespaced_job(
|
|
139
|
-
name=job_name,
|
|
140
|
-
namespace=pod.metadata.namespace,
|
|
141
|
-
field_manager="metaflow",
|
|
142
|
-
body={"spec": {"parallelism": 0}},
|
|
143
|
-
)
|
|
144
|
-
except Exception as e:
|
|
145
|
-
echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
|
|
146
|
-
|
|
147
|
-
with ThreadPoolExecutor() as executor:
|
|
148
|
-
operated_pods = list(executor.map(_kill_pod, pods))
|
|
149
|
-
|
|
150
|
-
if not operated_pods:
|
|
151
|
-
echo("No active Kubernetes pods found for run *%s*" % run_id)
|
|
109
|
+
# Create PodKiller instance
|
|
110
|
+
killer = PodKiller(self._client, echo, self._namespace)
|
|
111
|
+
|
|
112
|
+
# Process all matching jobs and jobsets based on their outcomes
|
|
113
|
+
(
|
|
114
|
+
job_jobset_results,
|
|
115
|
+
num_jobs,
|
|
116
|
+
num_jobsets,
|
|
117
|
+
) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
|
|
118
|
+
|
|
119
|
+
if job_jobset_results:
|
|
120
|
+
successful_operations = sum(1 for result in job_jobset_results if result)
|
|
121
|
+
echo(
|
|
122
|
+
f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
|
|
123
|
+
)
|
|
124
|
+
else:
|
|
125
|
+
echo("No matching jobs or jobsets found for run *%s*" % run_id)
|
|
152
126
|
|
|
153
127
|
def job(self, **kwargs):
|
|
154
128
|
from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
|