ob-metaflow-extensions 1.1.151__py2.py3-none-any.whl → 1.4.33__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow_extensions/outerbounds/__init__.py +1 -1
- metaflow_extensions/outerbounds/plugins/__init__.py +17 -3
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_deploy_decorator.py +146 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +10 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py +1200 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +146 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +12 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +161 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +868 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +288 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +139 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +398 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1088 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +303 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +78 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +9 -77
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +7 -78
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +17 -3
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
- metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +32 -8
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +1 -1
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +171 -16
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1710 -114
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +6 -3
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +13 -7
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +8 -2
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/remote_config.py +27 -3
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +86 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.4.33.dist-info/RECORD +134 -0
- metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
- ob_metaflow_extensions-1.1.151.dist-info/RECORD +0 -74
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.151.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/top_level.txt +0 -0
--- /dev/null
+++ b/metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py
@@ -0,0 +1,110 @@
+import threading
+import time
+import sys
+from typing import Dict, Optional, Any, Callable
+from functools import partial
+from metaflow.exception import MetaflowException
+from metaflow.metaflow_config import FAST_BAKERY_URL
+
+from .fast_bakery import FastBakery, FastBakeryApiResponse, FastBakeryException
+from .docker_environment import cache_request
+
+BAKERY_METAFILE = ".imagebakery-cache"
+
+
+class BakerException(MetaflowException):
+    headline = "Ran into an error while baking image"
+
+    def __init__(self, msg):
+        super(BakerException, self).__init__(msg)
+
+
+def bake_image(
+    cache_file_path: str,
+    ref: Optional[str] = None,
+    python: Optional[str] = None,
+    pypi_packages: Optional[Dict[str, str]] = None,
+    conda_packages: Optional[Dict[str, str]] = None,
+    base_image: Optional[str] = None,
+    logger: Optional[Callable[[str], Any]] = None,
+) -> FastBakeryApiResponse:
+    """
+    Bakes a Docker image with the specified dependencies.
+
+    Args:
+        cache_file_path: Path to the cache file
+        ref: Reference identifier for this bake (for logging purposes)
+        python: Python version to use
+        pypi_packages: Dictionary of PyPI packages and versions
+        conda_packages: Dictionary of Conda packages and versions
+        base_image: Base Docker image to use
+        logger: Optional logger function to output progress
+
+    Returns:
+        FastBakeryApiResponse: The response from the bakery service
+
+    Raises:
+        BakerException: If the baking process fails
+    """
+    # Default logger if none provided
+    if logger is None:
+        logger = partial(print, file=sys.stderr)
+
+    # Thread lock for logging
+    logger_lock = threading.Lock()
+    images_baked = 0
+
+    @cache_request(cache_file_path)
+    def _cached_bake(
+        ref=None,
+        python=None,
+        pypi_packages=None,
+        conda_packages=None,
+        base_image=None,
+    ):
+        try:
+            bakery = FastBakery(url=FAST_BAKERY_URL)
+            bakery._reset_payload()
+            bakery.python_version(python)
+            bakery.pypi_packages(pypi_packages)
+            bakery.conda_packages(conda_packages)
+            bakery.base_image(base_image)
+            # bakery.ignore_cache()
+
+            with logger_lock:
+                logger(f"🍳 Baking [{ref}] ...")
+                logger(f"    🐍 Python: {python}")
+
+                if pypi_packages:
+                    logger(f"    📦 PyPI packages:")
+                    for package, version in pypi_packages.items():
+                        logger(f"        🔧 {package}: {version}")
+
+                if conda_packages:
+                    logger(f"    📦 Conda packages:")
+                    for package, version in conda_packages.items():
+                        logger(f"        🔧 {package}: {version}")
+
+                logger(f"    🏗️ Base image: {base_image}")
+
+            start_time = time.time()
+            res = bakery.bake()
+            # TODO: Get actual bake time from bakery
+            bake_time = time.time() - start_time
+
+            with logger_lock:
+                logger(f"🏁 Baked [{ref}] in {bake_time:.2f} seconds!")
+                nonlocal images_baked
+                images_baked += 1
+            return res
+        except FastBakeryException as ex:
+            raise BakerException(f"Bake [{ref}] failed: {str(ex)}")
+
+    # Call the cached bake function with the provided parameters
+    return _cached_bake(
+        ref=ref,
+        python=python,
+        pypi_packages=pypi_packages,
+        conda_packages=conda_packages,
+        base_image=base_image,
+    )
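The new baker.py above wraps FastBakery in a single bake_image helper. A minimal, hedged usage sketch, assuming the module is importable at its wheel path and that the Outerbounds configuration supplying FAST_BAKERY_URL is in place; the cache path, ref, and package pins below are made up for illustration:

from metaflow_extensions.outerbounds.plugins.fast_bakery.baker import bake_image

# Signature taken from the hunk above; all argument values here are hypothetical.
response = bake_image(
    cache_file_path=".imagebakery-cache",  # same value baker.py uses for BAKERY_METAFILE
    ref="train-step",                      # only used to label log lines
    python="3.11",
    pypi_packages={"pandas": "2.2.2"},
    base_image="python:3.11-slim",
)
print(response)  # a FastBakeryApiResponse describing the baked image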
--- a/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py
+++ b/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py
@@ -90,6 +90,7 @@ class DockerEnvironmentException(MetaflowException):
 class DockerEnvironment(MetaflowEnvironment):
     TYPE = "fast-bakery"
     _filecache = None
+    _force_rebuild = False
 
     def __init__(self, flow):
         self.skipped_steps = set()
@@ -178,12 +179,20 @@ class DockerEnvironment(MetaflowEnvironment):
 
         if self.skipped_steps:
             self.delegate = CondaEnvironment(self.flow)
+            self.delegate._force_rebuild = self._force_rebuild
             self.delegate.set_local_root(self.local_root)
             self.delegate.validate_environment(echo, self.datastore_type)
             self.delegate.init_environment(echo, self.skipped_steps)
 
     def _bake(self, steps) -> Dict[str, FastBakeryApiResponse]:
         metafile_path = get_fastbakery_metafile_path(self.local_root, self.flow.name)
+        if self._force_rebuild:
+            # clear the metafile if force rebuilding, effectively skipping the cache.
+            try:
+                os.remove(metafile_path)
+            except Exception:
+                pass
+
         logger_lock = threading.Lock()
 
         @cache_request(metafile_path)
@@ -201,7 +210,8 @@ class DockerEnvironment(MetaflowEnvironment):
             bakery.pypi_packages(pypi_packages)
             bakery.conda_packages(conda_packages)
             bakery.base_image(base_image)
-
+            if self._force_rebuild:
+                bakery.ignore_cache()
 
             with logger_lock:
                 self.logger(f"🍳 Baking [{ref}] ...")
@@ -341,12 +351,16 @@ class DockerEnvironment(MetaflowEnvironment):
         config.append("--disable=F0401")
         return config
 
-    def get_package_commands(self, codepackage_url, datastore_type):
+    def get_package_commands(
+        self, codepackage_url, datastore_type, code_package_metadata=None
+    ):
         # we must set the skip install flag at this stage in order to skip package downloads,
         # doing so in bootstrap_commands is too late in the lifecycle.
         return [
             "export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
-        ] + super().get_package_commands(codepackage_url, datastore_type)
+        ] + super().get_package_commands(
+            codepackage_url, datastore_type, code_package_metadata=code_package_metadata
+        )
 
     def bootstrap_commands(self, step_name, datastore_type):
         if step_name in self.skipped_steps:
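Both baker.py and docker_environment.py route their bake calls through the cache_request decorator and a metafile under the flow's local root, which is why force rebuilding simply deletes the metafile and also asks the bakery service to ignore its own cache. The decorator's real implementation is not part of this diff; the following is only an illustrative sketch of a file-backed memoizer in that spirit, and every name here other than cache_request is hypothetical:

import functools
import hashlib
import json
import os


def cache_request(cache_file_path):
    """Illustrative stand-in only: memoize a bake call on disk, keyed by its kwargs."""

    def decorator(func):
        @functools.wraps(func)
        def wrapper(**kwargs):
            # A bake request is fully described by its keyword arguments
            # (python, pypi_packages, conda_packages, base_image, ...),
            # so hash them to form the cache key.
            key = hashlib.sha256(
                json.dumps(kwargs, sort_keys=True, default=str).encode()
            ).hexdigest()
            cache = {}
            if os.path.exists(cache_file_path):
                with open(cache_file_path) as f:
                    cache = json.load(f)
            if key in cache:
                return cache[key]
            result = func(**kwargs)
            cache[key] = result
            with open(cache_file_path, "w") as f:
                # Deleting cache_file_path (what _force_rebuild does above) wipes all entries.
                json.dump(cache, f, default=str)
            return result

        return wrapper

    return decorator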
--- a/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py
+++ b/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py
@@ -5,6 +5,7 @@ import time
 
 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_NAMESPACE
+from .pod_killer import PodKiller
 
 
 CLIENT_REFRESH_INTERVAL_SECONDS = 300
@@ -105,50 +106,23 @@ class KubernetesClient(object):
         return list(results)
 
     def kill_pods(self, flow_name, run_id, user, echo):
-        [... 17 removed lines (the start of the old per-pod exec/kill helper) were not captured in this view ...]
-                    ],
-                    stderr=True,
-                    stdin=False,
-                    stdout=True,
-                    tty=False,
-                )
-            except Exception:
-                # best effort kill for pod can fail.
-                try:
-                    job_name = pod.metadata.labels.get("job-name", None)
-                    if job_name is None:
-                        raise Exception("Could not determine job name")
-
-                    job_api.patch_namespaced_job(
-                        name=job_name,
-                        namespace=pod.metadata.namespace,
-                        field_manager="metaflow",
-                        body={"spec": {"parallelism": 0}},
-                    )
-                except Exception as e:
-                    echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
-
-        with ThreadPoolExecutor() as executor:
-            operated_pods = list(executor.map(_kill_pod, pods))
-
-        if not operated_pods:
-            echo("No active Kubernetes pods found for run *%s*" % run_id)
+        # Create PodKiller instance
+        killer = PodKiller(self._client, echo, self._namespace)
+
+        # Process all matching jobs and jobsets based on their outcomes
+        (
+            job_jobset_results,
+            num_jobs,
+            num_jobsets,
+        ) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
+
+        if job_jobset_results:
+            successful_operations = sum(1 for result in job_jobset_results if result)
+            echo(
+                f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
+            )
+        else:
+            echo("No matching jobs or jobsets found for run *%s*" % run_id)
 
     def job(self, **kwargs):
         from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
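After this change, kill_pods no longer execs into pods itself; it hands matching and cleanup to the new PodKiller (added below) and only reports the resulting counts. A hedged invocation sketch, assuming the patched KubernetesClient is importable at Metaflow's usual location and still constructs without arguments; the flow name and run id are hypothetical:

from metaflow.plugins.kubernetes.kubernetes_client import KubernetesClient

client = KubernetesClient()  # assumption: zero-argument constructor, as in stock Metaflow
client.kill_pods(
    flow_name="TrainingFlow",           # hypothetical flow
    run_id="argo-trainingflow-abc123",  # "argo-" prefixes are handled by the matching logic
    user=None,                          # None matches any user
    echo=print,                         # any callable that accepts a message string
)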
--- /dev/null
+++ b/metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py
@@ -0,0 +1,374 @@
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from kubernetes.client.models.v1_job import V1Job
+    from kubernetes.client.models.v1_job_status import V1JobStatus
+
+
+def _is_jobset_child(job: "V1Job"):
+    if job.metadata.owner_references:
+        for owner_ref in job.metadata.owner_references:
+            if owner_ref.kind == "JobSet":
+                return owner_ref
+    return None
+
+
+class JobOutcomes:
+    KILL = "kill"
+    DELETE = "delete"
+    LEAVE_UNCHANGED = "leave_unchanged"
+
+
+def derive_jobset_outcome(jobset_status):
+    return (
+        JobOutcomes.LEAVE_UNCHANGED
+        if jobset_status.get("terminalState", None)
+        else JobOutcomes.DELETE
+    )
+
+
+def derive_job_outcome(job_status: "V1JobStatus"):
+    if job_status.start_time is None:
+        # If the job has not started even then just wipe it!
+        return JobOutcomes.DELETE
+    if job_status.succeeded or job_status.failed:
+        return JobOutcomes.LEAVE_UNCHANGED
+
+    if job_status.completion_time is not None:
+        return JobOutcomes.LEAVE_UNCHANGED
+
+    # This means that the job has neither finished or succedded.
+    if job_status.active:
+        return JobOutcomes.DELETE
+
+    # This means that the job is not active. Had started. There is not succedded/fail.
+    # This is a weird state. Better to just kill the job
+    return JobOutcomes.DELETE
+
+
+class PodKiller:
+    def __init__(self, kubernetes_client, echo_func, namespace, progress_bar=None):
+        self.client = kubernetes_client
+        self.echo = echo_func
+        self.api_instance = self.client.CoreV1Api()
+        self.job_api = self.client.BatchV1Api()
+        self._namespace = namespace
+        self.jobset_api = None
+        self.jobset_api = self.client.CustomObjectsApi()
+        self.progress_bar = progress_bar
+
+    def _delete_jobset(self, owner_ref, namespace):
+        """Delete a JobSet if it's the owner of a job."""
+        if not self.jobset_api:
+            self.echo("JobSet API not available, cannot delete JobSet\n")
+            return False
+
+        try:
+            jobset_name = owner_ref.name
+            self.echo(f"Deleting JobSet: {jobset_name}\n")
+
+            self.jobset_api.delete_namespaced_custom_object(
+                group="jobset.x-k8s.io",
+                version="v1alpha2",
+                namespace=namespace,
+                plural="jobsets",
+                name=jobset_name,
+            )
+            return True
+        except Exception as e:
+            self.echo(f"Failed to delete JobSet {owner_ref.name}: {str(e)}\n")
+            return False
+
+    def _delete_job(self, job_name, namespace):
+        """Delete a Batch Job and check for JobSet owner reference."""
+        try:
+            # First get the job to check for owner references
+            job = self.job_api.read_namespaced_job(name=job_name, namespace=namespace)
+            # Check for JobSet owner reference
+            jobset_ref = _is_jobset_child(job)
+            if jobset_ref:
+                if self._delete_jobset(jobset_ref, namespace):
+                    return True
+
+            # If no JobSet owner or JobSet deletion failed, delete the job
+            self.echo(f"Deleting Batch Job: {job_name}")
+            self.job_api.delete_namespaced_job(
+                name=job_name, namespace=namespace, propagation_policy="Background"
+            )
+            return True
+
+        except Exception as e:
+            self.echo(f"Failed to delete job {job_name}: {str(e)}")
+            return False
+
+    def _kill_pod_process(self, pod):
+        """Attempt to kill processes inside a pod."""
+        from kubernetes.stream import stream
+
+        try:
+            stream(
+                self.api_instance.connect_get_namespaced_pod_exec,
+                name=pod.metadata.name,
+                namespace=pod.metadata.namespace,
+                command=["/bin/sh", "-c", "/sbin/killall5"],
+                stderr=True,
+                stdin=False,
+                stdout=True,
+                tty=False,
+            )
+            return True
+        except Exception as e:
+            self.echo(
+                f"Failed to kill processes in pod {pod.metadata.name}: {str(e)}\n"
+            )
+            return False
+
+    @staticmethod
+    def _metaflow_matching_spec(run_id, user, flow_name, annotations, labels):
+        # Handle argo prefixes in run_id like in _find_active_pods
+        _argo_run_id = None
+        if run_id is not None:
+            _argo_run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
+        return (
+            annotations
+            and (
+                run_id is None
+                or (annotations.get("metaflow/run_id") == run_id)
+                # we want to also match jobsets launched by argo-workflows
+                # This line has no real value since the We already avoid any
+                # argo-workflows related terminations.
+                or (
+                    labels.get("workflows.argoproj.io/workflow") is not None
+                    and labels.get("workflows.argoproj.io/workflow") == _argo_run_id
+                )
+            )
+            and (user is None or annotations.get("metaflow/user") == user)
+            and (annotations.get("metaflow/flow_name") == flow_name)
+        )
+
+    def _find_matching_jobs(self, flow_name, run_id=None, user=None):
+        """Find jobs that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""
+
+        def paginated_job_finder(namespace):
+            continue_token = None
+            while True:
+                response = self.job_api.list_namespaced_job(
+                    namespace=namespace, limit=100, _continue=continue_token
+                )
+                yield response.items
+                continue_token = response.metadata._continue
+                if not continue_token:
+                    break
+
+        try:
+            matching_jobs = []
+            for _jobs in paginated_job_finder(self._namespace):
+                for job in _jobs:
+                    _match = self._metaflow_matching_spec(
+                        run_id=run_id,
+                        user=user,
+                        flow_name=flow_name,
+                        annotations=job.metadata.annotations,
+                        labels=job.metadata.labels,
+                    )
+                    if _match:
+                        matching_jobs.append(job)
+            return matching_jobs
+        except Exception as e:
+            self.echo(f"Error finding jobs: {str(e)}\n")
+            return []
+
+    def _find_matching_jobsets(self, flow_name, run_id=None, user=None):
+        """Find jobsets that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""
+        if not self.jobset_api:
+            return []
+
+        def paginated_jobset_finder(namespace):
+            continue_token = None
+            responses = []
+            while True:
+                response = self.jobset_api.list_namespaced_custom_object(
+                    group="jobset.x-k8s.io",
+                    version="v1alpha2",
+                    namespace=namespace,
+                    plural="jobsets",
+                    limit=100,
+                    **({"_continue": continue_token} if continue_token else {}),
+                )
+                continue_token = response.get("metadata", {}).get("continue", None)
+                responses.append(response)
+                if not continue_token:
+                    break
+            return responses
+
+        try:
+            matching_jobsets = []
+
+            for jobset_response in paginated_jobset_finder(self._namespace):
+                for jobset in jobset_response.get("items", []):
+                    _match = self._metaflow_matching_spec(
+                        run_id=run_id,
+                        user=user,
+                        flow_name=flow_name,
+                        annotations=jobset.get("metadata", {}).get("annotations", {}),
+                        labels=jobset.get("metadata", {}).get("labels", {}),
+                    )
+                    if _match:
+                        matching_jobsets.append(jobset)
+
+            return matching_jobsets
+        except Exception as e:
+            self.echo(f"Error finding jobsets: {str(e)}\n")
+            return []
+
+    def _kill_pods_for_job(self, job):
+        """Find and kill pods associated with a specific job"""
+        job_name = job.metadata.name
+        namespace = job.metadata.namespace
+
+        try:
+            # Find pods with the job-name label matching this job
+            pods = self.api_instance.list_namespaced_pod(
+                namespace=namespace, label_selector=f"job-name={job_name}"
+            )
+
+            killed_pods = 0
+            for pod in pods.items:
+                if pod.status.phase in ["Running"]:
+                    self.echo(
+                        f"Killing processes in pod {pod.metadata.name} for job {job_name}"
+                    )
+                    if self._kill_pod_process(pod):
+                        killed_pods += 1
+
+            return killed_pods > 0
+        except Exception as e:
+            self.echo(f"Failed to find/kill pods for job {job_name}: {str(e)}")
+            return False
+
+    def _handle_job_outcome(self, job, outcome):
+        """Handle a job based on the derived outcome"""
+        job_name = job.metadata.name
+        namespace = job.metadata.namespace
+
+        if outcome == JobOutcomes.LEAVE_UNCHANGED:
+            # self.echo(f"Job {job_name} is in terminal state, leaving unchanged")
+            return None
+        elif outcome == JobOutcomes.DELETE:
+            self.echo(f"Deleting Job {job_name}")
+            return self._delete_job(job_name, namespace)
+        elif outcome == JobOutcomes.KILL:
+            self.echo(f"Killing Job {job_name}")
+            # First try to kill the pod processes
+            pods_killed = self._kill_pods_for_job(job)
+            if pods_killed > 0:
+                return True
+            # Worst case if we are not able to delete any pod, then delete the Job.
+            return self._delete_job(job_name, namespace)
+        else:
+            self.echo(f"Unknown outcome {outcome} for job {job_name}\n")
+            return False
+
+    def _handle_jobset_outcome(self, jobset, outcome):
+        """Handle a jobset based on the derived outcome"""
+        jobset_name = jobset.get("metadata", {}).get("name", "unknown")
+        namespace = jobset.get("metadata", {}).get("namespace", self._namespace)
+
+        if outcome == JobOutcomes.LEAVE_UNCHANGED:
+            # self.echo(f"JobSet {jobset_name} is in terminal state, leaving unchanged")
+            return None
+        elif outcome == JobOutcomes.DELETE:
+            self.echo(f"Deleting JobSet {jobset_name}")
+            try:
+                self.jobset_api.delete_namespaced_custom_object(
+                    group="jobset.x-k8s.io",
+                    version="v1alpha2",
+                    namespace=namespace,
+                    plural="jobsets",
+                    name=jobset_name,
+                )
+                return True
+            except Exception as e:
+                self.echo(f"Failed to delete JobSet {jobset_name}: {str(e)}")
+                return False
+        else:
+            self.echo(f"Unknown outcome {outcome} for JobSet {jobset_name}")
+            return False
+
+    def extract_matching_jobs_and_jobsets(self, flow_name, run_id, user):
+        """Extract matching jobs and jobsets based on the flow_name, run_id, and user criteria"""
+        jobs = self._find_matching_jobs(flow_name, run_id, user)
+        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
+        return [(j, derive_job_outcome(j.status)) for j in jobs], [
+            (j, derive_jobset_outcome(j.get("status", {}))) for j in jobsets
+        ]
+
+    def process_matching_jobs_and_jobsets(self, flow_name, run_id, user):
+        """Process all matching jobs and jobsets based on their derived outcomes"""
+        results = []
+        progress_update = lambda x: x
+        if self.progress_bar:
+            progress_update = lambda x: self.progress_bar.update(1, x)
+
+        # Process matching jobs
+        _jobs, _jobsets = [], []
+        jobs = self._find_matching_jobs(flow_name, run_id, user)
+        for job in jobs:
+            outcome = derive_job_outcome(job.status)
+            result = self._handle_job_outcome(job, outcome)
+            # results.append(result)
+            if result is not None:
+                progress_update("💀 Killing Job %s" % job.metadata.name)
+                results.append(result)
+                _jobs.append(result)
+
+        # Process matching jobsets
+        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
+        for jobset in jobsets:
+            jobset_status = jobset.get("status", {})
+            outcome = derive_jobset_outcome(jobset_status)
+            result = self._handle_jobset_outcome(jobset, outcome)
+            if result is not None:
+                progress_update(
+                    "💀 Deleting JobSet %s"
+                    % jobset.get("metadata", {}).get("name", "unknown")
+                )
+                results.append(result)
+                _jobsets.append(result)
+
+        return results, len(_jobs), len(_jobsets)
+
+    def process_matching_jobs_and_jobsets_force_all(self, flow_name, run_id, user):
+        """Force process ALL matching jobs and jobsets regardless of their status/outcome"""
+        results = []
+        progress_update = lambda x: x
+        if self.progress_bar:
+            progress_update = lambda x: self.progress_bar.update(1, x)
+
+        # Process matching jobs - FORCE DELETE ALL
+        _jobs, _jobsets = [], []
+        jobs = self._find_matching_jobs(flow_name, run_id, user)
+        for job in jobs:
+            # Force DELETE outcome regardless of actual status
+            result = self._handle_job_outcome(job, JobOutcomes.DELETE)
+            progress_update("🔥 FORCE Deleting Job %s" % job.metadata.name)
+            results.append(
+                result if result is not None else True
+            )  # Treat None as success for force mode
+            _jobs.append(result if result is not None else True)
+
+        # Process matching jobsets - FORCE DELETE ALL
+        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
+        for jobset in jobsets:
+            # Force DELETE outcome regardless of actual status
+            result = self._handle_jobset_outcome(jobset, JobOutcomes.DELETE)
+            progress_update(
+                "🔥 FORCE Deleting JobSet %s"
+                % jobset.get("metadata", {}).get("name", "unknown")
+            )
+            results.append(
+                result if result is not None else True
+            )  # Treat None as success for force mode
+            _jobsets.append(result if result is not None else True)
+
+        return results, len(_jobs), len(_jobsets)
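PodKiller can also be driven directly, as the rewritten kill_pods now does. A hedged usage sketch, assuming the kubernetes Python client is installed and configured and that the module is importable at its wheel path; note that PodKiller expects the kubernetes.client module itself, since it calls CoreV1Api/BatchV1Api/CustomObjectsApi on what it is given. The namespace, flow name, and run id below are hypothetical:

from kubernetes import client, config

from metaflow_extensions.outerbounds.plugins.kubernetes.pod_killer import PodKiller

config.load_kube_config()  # or config.load_incluster_config() when running in-cluster

killer = PodKiller(client, print, "default")  # (kubernetes client module, echo, namespace)
results, num_jobs, num_jobsets = killer.process_matching_jobs_and_jobsets(
    flow_name="TrainingFlow",
    run_id="argo-trainingflow-abc123",
    user=None,
)
print(
    f"{sum(1 for r in results if r)} of {len(results)} operations succeeded "
    f"across {num_jobs} jobs and {num_jobsets} jobsets"
)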
--- a/metaflow_extensions/outerbounds/plugins/nim/card.py
+++ b/metaflow_extensions/outerbounds/plugins/nim/card.py
@@ -1,8 +1,7 @@
-import sqlite3
 from metaflow.cards import Markdown, Table
 from metaflow.metaflow_current import current
 
-from .
+from .utils import get_storage_path
 from ..card_utilities.async_cards import CardRefresher
 from ..card_utilities.extra_components import BarPlot, ViolinPlot
 
@@ -17,9 +16,7 @@ class NimMetricsRefresher(CardRefresher):
         self._file_name = get_storage_path(current.task_id)
 
     def sqlite_fetch_func(self, conn):
-        cursor = conn.cursor()
         try:
-            conn = sqlite3.connect(self._file_name)
             cursor = conn.cursor()
             cursor.execute(
                 "SELECT error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model FROM metrics"
@@ -85,7 +82,6 @@ class NimMetricsRefresher(CardRefresher):
         current_card.refresh()
 
     def on_error(self, current_card, error_message):
-
         if isinstance(error_message, FileNotFoundError):
             return
 
@@ -99,7 +95,6 @@ class NimMetricsRefresher(CardRefresher):
         current_card.refresh()
 
     def update_only_components(self, current_card, data_object):
-
         # update request success data
         self._metrics_charts["request_success"].spec["data"][0]["values"] = [
             {