ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/__init__.py +1 -7
- metaflow_extensions/outerbounds/config/__init__.py +35 -0
- metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
- metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
- metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
- metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
- metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
- metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
- metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
- metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
- metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
- metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
- metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
- metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
- metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
- metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
- metaflow_extensions/outerbounds/remote_config.py +53 -16
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
- ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
|
@@ -1,8 +1,11 @@
|
|
|
1
|
+
from concurrent.futures import ThreadPoolExecutor
|
|
1
2
|
import os
|
|
2
3
|
import sys
|
|
3
4
|
import time
|
|
4
5
|
|
|
5
6
|
from metaflow.exception import MetaflowException
|
|
7
|
+
from metaflow.metaflow_config import KUBERNETES_NAMESPACE
|
|
8
|
+
from .pod_killer import PodKiller
|
|
6
9
|
|
|
7
10
|
|
|
8
11
|
CLIENT_REFRESH_INTERVAL_SECONDS = 300
|
|
@@ -27,6 +30,7 @@ class KubernetesClient(object):
|
|
|
27
30
|
% sys.executable
|
|
28
31
|
)
|
|
29
32
|
self._refresh_client()
|
|
33
|
+
self._namespace = KUBERNETES_NAMESPACE
|
|
30
34
|
|
|
31
35
|
def _refresh_client(self):
|
|
32
36
|
from metaflow_extensions.outerbounds.plugins.auth_server import get_token
|
|
@@ -50,7 +54,82 @@ class KubernetesClient(object):
|
|
|
50
54
|
|
|
51
55
|
return self._client
|
|
52
56
|
|
|
57
|
+
def _find_active_pods(self, flow_name, run_id=None, user=None):
|
|
58
|
+
def _request(_continue=None):
|
|
59
|
+
# handle paginated responses
|
|
60
|
+
return self._client.CoreV1Api().list_namespaced_pod(
|
|
61
|
+
namespace=self._namespace,
|
|
62
|
+
# limited selector support for K8S api. We want to cover multiple statuses: Running / Pending / Unknown
|
|
63
|
+
field_selector="status.phase!=Succeeded,status.phase!=Failed",
|
|
64
|
+
limit=1000,
|
|
65
|
+
_continue=_continue,
|
|
66
|
+
)
|
|
67
|
+
|
|
68
|
+
results = _request()
|
|
69
|
+
|
|
70
|
+
if run_id is not None:
|
|
71
|
+
# handle argo prefixes in run_id
|
|
72
|
+
run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
|
|
73
|
+
|
|
74
|
+
while results.metadata._continue or results.items:
|
|
75
|
+
for pod in results.items:
|
|
76
|
+
match = (
|
|
77
|
+
# arbitrary pods might have no annotations at all.
|
|
78
|
+
pod.metadata.annotations
|
|
79
|
+
and pod.metadata.labels
|
|
80
|
+
and (
|
|
81
|
+
run_id is None
|
|
82
|
+
or (pod.metadata.annotations.get("metaflow/run_id") == run_id)
|
|
83
|
+
# we want to also match pods launched by argo-workflows
|
|
84
|
+
or (
|
|
85
|
+
pod.metadata.labels.get("workflows.argoproj.io/workflow")
|
|
86
|
+
== run_id
|
|
87
|
+
)
|
|
88
|
+
)
|
|
89
|
+
and (
|
|
90
|
+
user is None
|
|
91
|
+
or pod.metadata.annotations.get("metaflow/user") == user
|
|
92
|
+
)
|
|
93
|
+
and (
|
|
94
|
+
pod.metadata.annotations.get("metaflow/flow_name") == flow_name
|
|
95
|
+
)
|
|
96
|
+
)
|
|
97
|
+
if match:
|
|
98
|
+
yield pod
|
|
99
|
+
if not results.metadata._continue:
|
|
100
|
+
break
|
|
101
|
+
results = _request(results.metadata._continue)
|
|
102
|
+
|
|
103
|
+
def list(self, flow_name, run_id, user):
|
|
104
|
+
results = self._find_active_pods(flow_name, run_id, user)
|
|
105
|
+
|
|
106
|
+
return list(results)
|
|
107
|
+
|
|
108
|
+
def kill_pods(self, flow_name, run_id, user, echo):
|
|
109
|
+
# Create PodKiller instance
|
|
110
|
+
killer = PodKiller(self._client, echo, self._namespace)
|
|
111
|
+
|
|
112
|
+
# Process all matching jobs and jobsets based on their outcomes
|
|
113
|
+
(
|
|
114
|
+
job_jobset_results,
|
|
115
|
+
num_jobs,
|
|
116
|
+
num_jobsets,
|
|
117
|
+
) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
|
|
118
|
+
|
|
119
|
+
if job_jobset_results:
|
|
120
|
+
successful_operations = sum(1 for result in job_jobset_results if result)
|
|
121
|
+
echo(
|
|
122
|
+
f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
|
|
123
|
+
)
|
|
124
|
+
else:
|
|
125
|
+
echo("No matching jobs or jobsets found for run *%s*" % run_id)
|
|
126
|
+
|
|
53
127
|
    def job(self, **kwargs):
        """Construct a KubernetesJob bound to this client.

        The import is done inside the method, mirroring the lazy-import
        pattern used elsewhere in this module.
        """
        from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob

        return KubernetesJob(self, **kwargs)
|
|
131
|
+
|
|
132
|
+
    def jobset(self, **kwargs):
        """Construct a KubernetesJobSet bound to this client.

        Lazy import, same pattern as :meth:`job`.
        """
        from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJobSet

        return KubernetesJobSet(self, **kwargs)
|
|
@@ -0,0 +1,374 @@
|
|
|
1
|
+
from typing import TYPE_CHECKING
|
|
2
|
+
|
|
3
|
+
if TYPE_CHECKING:
|
|
4
|
+
from kubernetes.client.models.v1_job import V1Job
|
|
5
|
+
from kubernetes.client.models.v1_job_status import V1JobStatus
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _is_jobset_child(job: "V1Job"):
|
|
9
|
+
if job.metadata.owner_references:
|
|
10
|
+
for owner_ref in job.metadata.owner_references:
|
|
11
|
+
if owner_ref.kind == "JobSet":
|
|
12
|
+
return owner_ref
|
|
13
|
+
return None
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class JobOutcomes:
    """String constants naming the possible actions to take on a matched
    job or jobset (see ``derive_job_outcome`` / ``derive_jobset_outcome``)."""

    KILL = "kill"
    DELETE = "delete"
    LEAVE_UNCHANGED = "leave_unchanged"
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def derive_jobset_outcome(jobset_status):
    """Map a JobSet status dict to an outcome.

    A jobset that reports a ``terminalState`` has already finished and is
    left untouched; anything else is deleted.
    """
    if jobset_status.get("terminalState", None):
        return JobOutcomes.LEAVE_UNCHANGED
    return JobOutcomes.DELETE
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def derive_job_outcome(job_status: "V1JobStatus"):
    """Map a batch Job status to an outcome.

    Rules, in order:
    - never started -> DELETE (safe to wipe);
    - succeeded/failed, or has a completion time -> LEAVE_UNCHANGED;
    - still active -> DELETE;
    - otherwise (started, not active, no terminal counters) the job is in an
      indeterminate state -> DELETE as the safe default.
    """
    # If the job has not even started, just wipe it.
    if job_status.start_time is None:
        return JobOutcomes.DELETE

    # Terminal states: leave finished jobs alone.
    if job_status.succeeded or job_status.failed:
        return JobOutcomes.LEAVE_UNCHANGED
    if job_status.completion_time is not None:
        return JobOutcomes.LEAVE_UNCHANGED

    # The job has neither finished nor succeeded; active jobs get deleted.
    if job_status.active:
        return JobOutcomes.DELETE

    # Not active, had started, no succeeded/failed counters: a weird state.
    # Better to just delete the job.
    return JobOutcomes.DELETE
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
class PodKiller:
    """Finds and terminates the Kubernetes Jobs and JobSets that belong to a
    metaflow flow/run/user, reporting progress through an ``echo`` callable
    and, optionally, a progress bar.

    NOTE(review): ``derive_job_outcome`` / ``derive_jobset_outcome`` only ever
    produce DELETE or LEAVE_UNCHANGED; the KILL branch in
    ``_handle_job_outcome`` is reachable only if a caller passes
    ``JobOutcomes.KILL`` explicitly.
    """

    def __init__(self, kubernetes_client, echo_func, namespace, progress_bar=None):
        # kubernetes_client: a `kubernetes` client module/object exposing
        # CoreV1Api / BatchV1Api / CustomObjectsApi factories.
        self.client = kubernetes_client
        self.echo = echo_func
        self.api_instance = self.client.CoreV1Api()
        self.job_api = self.client.BatchV1Api()
        self._namespace = namespace
        # NOTE(review): the None assignment is immediately overwritten below;
        # kept as-is to preserve the original byte-for-byte.
        self.jobset_api = None
        self.jobset_api = self.client.CustomObjectsApi()
        self.progress_bar = progress_bar

    def _delete_jobset(self, owner_ref, namespace):
        """Delete a JobSet if it's the owner of a job.

        Returns True on success, False on any failure (errors are echoed,
        never raised)."""
        if not self.jobset_api:
            self.echo("JobSet API not available, cannot delete JobSet\n")
            return False

        try:
            jobset_name = owner_ref.name
            self.echo(f"Deleting JobSet: {jobset_name}\n")

            self.jobset_api.delete_namespaced_custom_object(
                group="jobset.x-k8s.io",
                version="v1alpha2",
                namespace=namespace,
                plural="jobsets",
                name=jobset_name,
            )
            return True
        except Exception as e:
            self.echo(f"Failed to delete JobSet {owner_ref.name}: {str(e)}\n")
            return False

    def _delete_job(self, job_name, namespace):
        """Delete a Batch Job and check for JobSet owner reference.

        If the job is owned by a JobSet, the JobSet is deleted instead (which
        cascades to the job); the job itself is deleted only when there is no
        JobSet owner or the JobSet deletion failed."""
        try:
            # First get the job to check for owner references
            job = self.job_api.read_namespaced_job(name=job_name, namespace=namespace)
            # Check for JobSet owner reference
            jobset_ref = _is_jobset_child(job)
            if jobset_ref:
                if self._delete_jobset(jobset_ref, namespace):
                    return True

            # If no JobSet owner or JobSet deletion failed, delete the job
            self.echo(f"Deleting Batch Job: {job_name}")
            self.job_api.delete_namespaced_job(
                name=job_name, namespace=namespace, propagation_policy="Background"
            )
            return True

        except Exception as e:
            self.echo(f"Failed to delete job {job_name}: {str(e)}")
            return False

    def _kill_pod_process(self, pod):
        """Attempt to kill processes inside a pod.

        Execs ``/sbin/killall5`` in the pod via the exec subresource; returns
        True if the exec call did not raise."""
        from kubernetes.stream import stream

        try:
            stream(
                self.api_instance.connect_get_namespaced_pod_exec,
                name=pod.metadata.name,
                namespace=pod.metadata.namespace,
                command=["/bin/sh", "-c", "/sbin/killall5"],
                stderr=True,
                stdin=False,
                stdout=True,
                tty=False,
            )
            return True
        except Exception as e:
            self.echo(
                f"Failed to kill processes in pod {pod.metadata.name}: {str(e)}\n"
            )
            return False

    @staticmethod
    def _metaflow_matching_spec(run_id, user, flow_name, annotations, labels):
        """Return truthy iff the given annotations/labels identify an object
        belonging to *flow_name* (and, when given, *run_id* / *user*)."""
        # Handle argo prefixes in run_id like in _find_active_pods
        _argo_run_id = None
        if run_id is not None:
            # slice trick: startswith -> slice from len("argo-"), else from 0
            _argo_run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
        return (
            annotations
            and (
                run_id is None
                or (annotations.get("metaflow/run_id") == run_id)
                # we want to also match jobsets launched by argo-workflows
                # This line has no real value since the We already avoid any
                # argo-workflows related terminations.
                or (
                    labels.get("workflows.argoproj.io/workflow") is not None
                    and labels.get("workflows.argoproj.io/workflow") == _argo_run_id
                )
            )
            and (user is None or annotations.get("metaflow/user") == user)
            and (annotations.get("metaflow/flow_name") == flow_name)
        )

    def _find_matching_jobs(self, flow_name, run_id=None, user=None):
        """Find jobs that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""

        def paginated_job_finder(namespace):
            # Generator yielding one page of job items at a time, following
            # the K8s continue-token pagination protocol.
            continue_token = None
            while True:
                response = self.job_api.list_namespaced_job(
                    namespace=namespace, limit=100, _continue=continue_token
                )
                yield response.items
                continue_token = response.metadata._continue
                if not continue_token:
                    break

        try:
            matching_jobs = []
            for _jobs in paginated_job_finder(self._namespace):
                for job in _jobs:
                    _match = self._metaflow_matching_spec(
                        run_id=run_id,
                        user=user,
                        flow_name=flow_name,
                        annotations=job.metadata.annotations,
                        labels=job.metadata.labels,
                    )
                    if _match:
                        matching_jobs.append(job)
            return matching_jobs
        except Exception as e:
            self.echo(f"Error finding jobs: {str(e)}\n")
            return []

    def _find_matching_jobsets(self, flow_name, run_id=None, user=None):
        """Find jobsets that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""
        if not self.jobset_api:
            return []

        def paginated_jobset_finder(namespace):
            # Unlike the jobs finder, this collects all pages before
            # returning (custom-object responses are plain dicts).
            continue_token = None
            responses = []
            while True:
                response = self.jobset_api.list_namespaced_custom_object(
                    group="jobset.x-k8s.io",
                    version="v1alpha2",
                    namespace=namespace,
                    plural="jobsets",
                    limit=100,
                    **({"_continue": continue_token} if continue_token else {}),
                )
                continue_token = response.get("metadata", {}).get("continue", None)
                responses.append(response)
                if not continue_token:
                    break
            return responses

        try:
            matching_jobsets = []

            for jobset_response in paginated_jobset_finder(self._namespace):
                for jobset in jobset_response.get("items", []):
                    _match = self._metaflow_matching_spec(
                        run_id=run_id,
                        user=user,
                        flow_name=flow_name,
                        annotations=jobset.get("metadata", {}).get("annotations", {}),
                        labels=jobset.get("metadata", {}).get("labels", {}),
                    )
                    if _match:
                        matching_jobsets.append(jobset)

            return matching_jobsets
        except Exception as e:
            self.echo(f"Error finding jobsets: {str(e)}\n")
            return []

    def _kill_pods_for_job(self, job):
        """Find and kill pods associated with a specific job.

        Returns True iff at least one Running pod had its processes killed."""
        job_name = job.metadata.name
        namespace = job.metadata.namespace

        try:
            # Find pods with the job-name label matching this job
            pods = self.api_instance.list_namespaced_pod(
                namespace=namespace, label_selector=f"job-name={job_name}"
            )

            killed_pods = 0
            for pod in pods.items:
                if pod.status.phase in ["Running"]:
                    self.echo(
                        f"Killing processes in pod {pod.metadata.name} for job {job_name}"
                    )
                    if self._kill_pod_process(pod):
                        killed_pods += 1

            return killed_pods > 0
        except Exception as e:
            self.echo(f"Failed to find/kill pods for job {job_name}: {str(e)}")
            return False

    def _handle_job_outcome(self, job, outcome):
        """Handle a job based on the derived outcome.

        Returns None for LEAVE_UNCHANGED, True/False for attempted
        kill/delete operations, False for unknown outcomes."""
        job_name = job.metadata.name
        namespace = job.metadata.namespace

        if outcome == JobOutcomes.LEAVE_UNCHANGED:
            # self.echo(f"Job {job_name} is in terminal state, leaving unchanged")
            return None
        elif outcome == JobOutcomes.DELETE:
            self.echo(f"Deleting Job {job_name}")
            return self._delete_job(job_name, namespace)
        elif outcome == JobOutcomes.KILL:
            self.echo(f"Killing Job {job_name}")
            # First try to kill the pod processes
            pods_killed = self._kill_pods_for_job(job)
            if pods_killed > 0:
                return True
            # Worst case if we are not able to delete any pod, then delete the Job.
            return self._delete_job(job_name, namespace)
        else:
            self.echo(f"Unknown outcome {outcome} for job {job_name}\n")
            return False

    def _handle_jobset_outcome(self, jobset, outcome):
        """Handle a jobset based on the derived outcome.

        Returns None for LEAVE_UNCHANGED, True/False for attempted deletes,
        False for unknown outcomes."""
        jobset_name = jobset.get("metadata", {}).get("name", "unknown")
        namespace = jobset.get("metadata", {}).get("namespace", self._namespace)

        if outcome == JobOutcomes.LEAVE_UNCHANGED:
            # self.echo(f"JobSet {jobset_name} is in terminal state, leaving unchanged")
            return None
        elif outcome == JobOutcomes.DELETE:
            self.echo(f"Deleting JobSet {jobset_name}")
            try:
                self.jobset_api.delete_namespaced_custom_object(
                    group="jobset.x-k8s.io",
                    version="v1alpha2",
                    namespace=namespace,
                    plural="jobsets",
                    name=jobset_name,
                )
                return True
            except Exception as e:
                self.echo(f"Failed to delete JobSet {jobset_name}: {str(e)}")
                return False
        else:
            self.echo(f"Unknown outcome {outcome} for JobSet {jobset_name}")
            return False

    def extract_matching_jobs_and_jobsets(self, flow_name, run_id, user):
        """Extract matching jobs and jobsets based on the flow_name, run_id, and user criteria.

        Returns two lists of (object, outcome) pairs — one for jobs, one for
        jobsets — without performing any kill/delete operations."""
        jobs = self._find_matching_jobs(flow_name, run_id, user)
        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
        return [(j, derive_job_outcome(j.status)) for j in jobs], [
            (j, derive_jobset_outcome(j.get("status", {}))) for j in jobsets
        ]

    def process_matching_jobs_and_jobsets(self, flow_name, run_id, user):
        """Process all matching jobs and jobsets based on their derived outcomes.

        Returns (results, num_jobs_acted_on, num_jobsets_acted_on); objects
        whose outcome was LEAVE_UNCHANGED (result None) are not counted."""
        results = []
        progress_update = lambda x: x
        if self.progress_bar:
            progress_update = lambda x: self.progress_bar.update(1, x)

        # Process matching jobs
        _jobs, _jobsets = [], []
        jobs = self._find_matching_jobs(flow_name, run_id, user)
        for job in jobs:
            outcome = derive_job_outcome(job.status)
            result = self._handle_job_outcome(job, outcome)
            # results.append(result)
            if result is not None:
                progress_update("💀 Killing Job %s" % job.metadata.name)
                results.append(result)
                _jobs.append(result)

        # Process matching jobsets
        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
        for jobset in jobsets:
            jobset_status = jobset.get("status", {})
            outcome = derive_jobset_outcome(jobset_status)
            result = self._handle_jobset_outcome(jobset, outcome)
            if result is not None:
                progress_update(
                    "💀 Deleting JobSet %s"
                    % jobset.get("metadata", {}).get("name", "unknown")
                )
                results.append(result)
                _jobsets.append(result)

        return results, len(_jobs), len(_jobsets)

    def process_matching_jobs_and_jobsets_force_all(self, flow_name, run_id, user):
        """Force process ALL matching jobs and jobsets regardless of their status/outcome.

        Every match is handled with a DELETE outcome; a None result (which
        cannot occur for DELETE, but is guarded anyway) is treated as
        success. Returns (results, num_jobs, num_jobsets)."""
        results = []
        progress_update = lambda x: x
        if self.progress_bar:
            progress_update = lambda x: self.progress_bar.update(1, x)

        # Process matching jobs - FORCE DELETE ALL
        _jobs, _jobsets = [], []
        jobs = self._find_matching_jobs(flow_name, run_id, user)
        for job in jobs:
            # Force DELETE outcome regardless of actual status
            result = self._handle_job_outcome(job, JobOutcomes.DELETE)
            progress_update("🔥 FORCE Deleting Job %s" % job.metadata.name)
            results.append(
                result if result is not None else True
            )  # Treat None as success for force mode
            _jobs.append(result if result is not None else True)

        # Process matching jobsets - FORCE DELETE ALL
        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
        for jobset in jobsets:
            # Force DELETE outcome regardless of actual status
            result = self._handle_jobset_outcome(jobset, JobOutcomes.DELETE)
            progress_update(
                "🔥 FORCE Deleting JobSet %s"
                % jobset.get("metadata", {}).get("name", "unknown")
            )
            results.append(
                result if result is not None else True
            )  # Treat None as success for force mode
            _jobsets.append(result if result is not None else True)

        return results, len(_jobs), len(_jobsets)
|
|
@@ -0,0 +1,140 @@
|
|
|
1
|
+
from metaflow.cards import Markdown, Table
|
|
2
|
+
from metaflow.metaflow_current import current
|
|
3
|
+
|
|
4
|
+
from .utils import get_storage_path
|
|
5
|
+
from ..card_utilities.async_cards import CardRefresher
|
|
6
|
+
from ..card_utilities.extra_components import BarPlot, ViolinPlot
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
class NimMetricsRefresher(CardRefresher):
    """Card refresher that renders task-level NIM API metrics (request
    success counts and per-model latency distribution) from a sqlite
    ``metrics`` table.

    NOTE(review): ``sqlite_fetch_func`` returns a flat dict
    (``{"error": ..., "success": ...}``) while the update hooks read
    ``data_object["metrics"][...]`` — presumably the surrounding
    CardRefresher machinery nests the fetched data under a "metrics" key;
    confirm against the caller.
    """

    CARD_ID = "nim_metrics"

    def __init__(self) -> None:
        # chart name -> chart component; empty until render_card_fresh runs
        self._metrics_charts = {}
        self._last_updated_on = None
        # guards against re-rendering the error card once real content exists
        self._already_rendered = False
        self._file_name = get_storage_path(current.task_id)

    def sqlite_fetch_func(self, conn):
        """Read every row of the ``metrics`` table and aggregate it into a
        dict of totals (error/success) and per-request column lists.

        Always closes *conn*, even when the query raises."""
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model FROM metrics"
            )
            rows = cursor.fetchall()
            data = {
                "error": 0,
                "success": 0,
                "status_code": [],
                "prompt_tokens": [],
                "completion_tokens": [],
                "e2e_time": [],
                "model": [],
            }
            for row in rows:
                data["error"] += row[0]
                data["success"] += row[1]
                data["status_code"].append(row[2])
                data["prompt_tokens"].append(row[3])
                data["completion_tokens"].append(row[4])
                data["e2e_time"].append(row[5])
                data["model"].append(row[6])
            return data
        finally:
            conn.close()

    def render_card_fresh(self, current_card, data):
        """Clear the card and build the chart components from scratch.

        Note: only creates the (empty) chart components and layout; the
        chart data itself is filled in by ``update_only_components``."""
        self._already_rendered = True
        current_card.clear()
        current_card.append(Markdown("## Metrics"))

        self._metrics_charts["request_success"] = BarPlot(
            title="Request success",
            category_name="category",
            value_name="amount",
            orientation="horizontal",
        )
        self._metrics_charts["latency_distribution"] = ViolinPlot(
            title="Latency distribution (s)",
            category_col_name="model",
            value_col_name="e2e_time",
        )

        current_card.append(
            Table(
                data=[
                    [
                        self._metrics_charts["request_success"],
                    ],
                    [self._metrics_charts["latency_distribution"]],
                ]
            )
        )
        current_card.refresh()

    def on_startup(self, current_card):
        """Render the initial placeholder card before any data arrives."""
        current_card.append(Markdown("# Task-level NIM API metrics"))
        current_card.append(
            Markdown(
                "_waiting for data to appear_",
            )
        )
        current_card.refresh()

    def on_error(self, current_card, error_message):
        """Render an error card, but only before any real content has been
        rendered; FileNotFoundError is ignored (the metrics DB may simply
        not exist yet)."""
        if isinstance(error_message, FileNotFoundError):
            return

        if not self._already_rendered:
            current_card.clear()
            current_card.append(
                Markdown(
                    f"## Error: {str(error_message)}",
                )
            )
            current_card.refresh()

    def update_only_components(self, current_card, data_object):
        """Push fresh metric values into the existing chart specs and
        refresh the card (no re-layout).

        Assumes ``data_object["metrics"]["e2e_time"]`` is non-empty — callers
        (``on_update``) gate on ``status_code`` being non-empty, so ``min``/
        ``max`` below do not see an empty sequence."""
        # update request success data
        self._metrics_charts["request_success"].spec["data"][0]["values"] = [
            {
                "category": "Successful requests",
                "amount": data_object["metrics"]["success"],
            },
            {"category": "Errors", "amount": data_object["metrics"]["error"]},
        ]

        latency_data = []
        times = []
        for m, e in zip(
            data_object["metrics"]["model"], data_object["metrics"]["e2e_time"]
        ):
            latency_data.append({"model": m, "e2e_time": e})
            times.append(e)

        # update latency data
        self._metrics_charts["latency_distribution"].spec["data"][0][
            "values"
        ] = latency_data

        # update domain for latency plot
        min_time = min(times)
        max_time = max(times)
        for scale in self._metrics_charts["latency_distribution"].spec["scales"]:
            if scale["name"] == "xscale":
                scale["domain"] = [min_time - max_time * 0.1, max_time + max_time * 0.1]

        current_card.refresh()

    def on_update(self, current_card, data_object):
        """Dispatch an incoming data snapshot: first non-empty snapshot
        triggers a fresh render, later ones update components in place;
        snapshots with no recorded requests are ignored."""
        data_object_keys = set(data_object.keys())
        if len(data_object_keys) == 0:
            return
        if len(self._metrics_charts) == 0:
            self.render_card_fresh(current_card, data_object)
            return
        elif len(data_object["metrics"]["status_code"]) == 0:
            return
        else:
            self.update_only_components(current_card, data_object)
            return
|