ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic; see the registry's advisory page for more details.

Files changed (128) hide show
  1. metaflow_extensions/outerbounds/__init__.py +1 -7
  2. metaflow_extensions/outerbounds/config/__init__.py +35 -0
  3. metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
  4. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  6. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  7. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  35. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  36. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  37. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  38. metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
  39. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  40. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  41. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  42. metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
  43. metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
  44. metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
  45. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
  46. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  47. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  48. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  49. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  50. metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  51. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  52. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
  53. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
  54. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
  55. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
  56. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
  57. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  58. metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
  59. metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
  60. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
  61. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  62. metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  63. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
  64. metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
  65. metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
  66. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
  67. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
  68. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
  69. metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
  70. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  71. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  72. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  73. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  74. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  75. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  76. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  77. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  78. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  79. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  80. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  81. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  82. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  83. metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
  84. metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
  85. metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
  86. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  87. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  88. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  89. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  90. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  91. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  92. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  93. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  94. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  95. metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  96. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
  97. metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
  98. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
  99. metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  100. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
  101. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
  102. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
  103. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
  104. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
  105. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
  106. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
  107. metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
  108. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  109. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  110. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  111. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  112. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  113. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  114. metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
  115. metaflow_extensions/outerbounds/remote_config.py +53 -16
  116. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
  117. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  118. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  119. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  120. metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
  121. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  122. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  123. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  124. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  125. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  126. ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
  127. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  128. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -1,8 +1,11 @@
1
+ from concurrent.futures import ThreadPoolExecutor
1
2
  import os
2
3
  import sys
3
4
  import time
4
5
 
5
6
  from metaflow.exception import MetaflowException
7
+ from metaflow.metaflow_config import KUBERNETES_NAMESPACE
8
+ from .pod_killer import PodKiller
6
9
 
7
10
 
8
11
  CLIENT_REFRESH_INTERVAL_SECONDS = 300
@@ -27,6 +30,7 @@ class KubernetesClient(object):
27
30
  % sys.executable
28
31
  )
29
32
  self._refresh_client()
33
+ self._namespace = KUBERNETES_NAMESPACE
30
34
 
31
35
  def _refresh_client(self):
32
36
  from metaflow_extensions.outerbounds.plugins.auth_server import get_token
@@ -50,7 +54,82 @@ class KubernetesClient(object):
50
54
 
51
55
  return self._client
52
56
 
57
+ def _find_active_pods(self, flow_name, run_id=None, user=None):
58
+ def _request(_continue=None):
59
+ # handle paginated responses
60
+ return self._client.CoreV1Api().list_namespaced_pod(
61
+ namespace=self._namespace,
62
+ # limited selector support for K8S api. We want to cover multiple statuses: Running / Pending / Unknown
63
+ field_selector="status.phase!=Succeeded,status.phase!=Failed",
64
+ limit=1000,
65
+ _continue=_continue,
66
+ )
67
+
68
+ results = _request()
69
+
70
+ if run_id is not None:
71
+ # handle argo prefixes in run_id
72
+ run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
73
+
74
+ while results.metadata._continue or results.items:
75
+ for pod in results.items:
76
+ match = (
77
+ # arbitrary pods might have no annotations at all.
78
+ pod.metadata.annotations
79
+ and pod.metadata.labels
80
+ and (
81
+ run_id is None
82
+ or (pod.metadata.annotations.get("metaflow/run_id") == run_id)
83
+ # we want to also match pods launched by argo-workflows
84
+ or (
85
+ pod.metadata.labels.get("workflows.argoproj.io/workflow")
86
+ == run_id
87
+ )
88
+ )
89
+ and (
90
+ user is None
91
+ or pod.metadata.annotations.get("metaflow/user") == user
92
+ )
93
+ and (
94
+ pod.metadata.annotations.get("metaflow/flow_name") == flow_name
95
+ )
96
+ )
97
+ if match:
98
+ yield pod
99
+ if not results.metadata._continue:
100
+ break
101
+ results = _request(results.metadata._continue)
102
+
103
+ def list(self, flow_name, run_id, user):
104
+ results = self._find_active_pods(flow_name, run_id, user)
105
+
106
+ return list(results)
107
+
108
+ def kill_pods(self, flow_name, run_id, user, echo):
109
+ # Create PodKiller instance
110
+ killer = PodKiller(self._client, echo, self._namespace)
111
+
112
+ # Process all matching jobs and jobsets based on their outcomes
113
+ (
114
+ job_jobset_results,
115
+ num_jobs,
116
+ num_jobsets,
117
+ ) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
118
+
119
+ if job_jobset_results:
120
+ successful_operations = sum(1 for result in job_jobset_results if result)
121
+ echo(
122
+ f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
123
+ )
124
+ else:
125
+ echo("No matching jobs or jobsets found for run *%s*" % run_id)
126
+
53
127
  def job(self, **kwargs):
54
128
  from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
55
129
 
56
130
  return KubernetesJob(self, **kwargs)
131
+
132
+ def jobset(self, **kwargs):
133
+ from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJobSet
134
+
135
+ return KubernetesJobSet(self, **kwargs)
@@ -0,0 +1,374 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from kubernetes.client.models.v1_job import V1Job
5
+ from kubernetes.client.models.v1_job_status import V1JobStatus
6
+
7
+
8
+ def _is_jobset_child(job: "V1Job"):
9
+ if job.metadata.owner_references:
10
+ for owner_ref in job.metadata.owner_references:
11
+ if owner_ref.kind == "JobSet":
12
+ return owner_ref
13
+ return None
14
+
15
+
16
class JobOutcomes:
    """String constants naming the action to take for a matched job/jobset."""

    # Kill the processes running inside the job's pods.
    KILL = "kill"
    # Delete the job/jobset object outright.
    DELETE = "delete"
    # Already in a terminal state; nothing to do.
    LEAVE_UNCHANGED = "leave_unchanged"
20
+
21
+
22
def derive_jobset_outcome(jobset_status):
    """Decide what to do with a jobset given its status dict.

    A jobset whose status carries a truthy ``terminalState`` has already
    finished and is left alone; anything else is deleted.
    """
    if jobset_status.get("terminalState", None):
        return JobOutcomes.LEAVE_UNCHANGED
    return JobOutcomes.DELETE
28
+
29
+
30
def derive_job_outcome(job_status: "V1JobStatus"):
    """Map a ``V1JobStatus`` onto a :class:`JobOutcomes` action.

    Decision order:
      1. never started -> DELETE (safe to wipe immediately);
      2. succeeded/failed, or has a completion timestamp -> LEAVE_UNCHANGED;
      3. otherwise (still active, or a started-but-indeterminate state)
         -> DELETE.
    """
    # If the job has not started yet, just wipe it.
    if job_status.start_time is None:
        return JobOutcomes.DELETE

    # Terminal either way (succeeded/failed counts are truthy when set),
    # or a completion time has been recorded: hands off.
    finished = bool(job_status.succeeded) or bool(job_status.failed)
    if finished or job_status.completion_time is not None:
        return JobOutcomes.LEAVE_UNCHANGED

    # Remaining cases: the job is active, or it started but is neither
    # active nor finished (a weird state) -- deleting is the safe choice
    # for both.
    return JobOutcomes.DELETE
47
+
48
+
49
class PodKiller:
    """Finds and terminates Metaflow-launched Kubernetes jobs and jobsets.

    Matching is driven by the ``metaflow/*`` annotations (and, for Argo
    launched runs, the ``workflows.argoproj.io/workflow`` label) stamped on
    the objects. All failures are reported through the supplied echo
    function and surfaced as ``False`` results rather than exceptions.
    """

    def __init__(self, kubernetes_client, echo_func, namespace, progress_bar=None):
        # kubernetes_client is an already-configured `kubernetes.client`
        # module-like object (see KubernetesClient); echo_func receives
        # human-readable progress/error strings.
        self.client = kubernetes_client
        self.echo = echo_func
        self.api_instance = self.client.CoreV1Api()
        self.job_api = self.client.BatchV1Api()
        self._namespace = namespace
        # NOTE(review): this None initialization is dead -- it is
        # unconditionally overwritten on the very next line.
        self.jobset_api = None
        self.jobset_api = self.client.CustomObjectsApi()
        # Optional progress bar with an `update(n, message)` method.
        self.progress_bar = progress_bar

    def _delete_jobset(self, owner_ref, namespace):
        """Delete a JobSet if it's the owner of a job.

        Returns True on success, False on any failure (including the
        jobset API being unavailable).
        """
        if not self.jobset_api:
            self.echo("JobSet API not available, cannot delete JobSet\n")
            return False

        try:
            jobset_name = owner_ref.name
            self.echo(f"Deleting JobSet: {jobset_name}\n")

            # JobSets are a custom resource (jobset.x-k8s.io/v1alpha2), so
            # they go through the CustomObjectsApi rather than BatchV1Api.
            self.jobset_api.delete_namespaced_custom_object(
                group="jobset.x-k8s.io",
                version="v1alpha2",
                namespace=namespace,
                plural="jobsets",
                name=jobset_name,
            )
            return True
        except Exception as e:
            self.echo(f"Failed to delete JobSet {owner_ref.name}: {str(e)}\n")
            return False

    def _delete_job(self, job_name, namespace):
        """Delete a Batch Job and check for JobSet owner reference.

        If the job is owned by a JobSet, the JobSet is deleted instead
        (its controller cleans up the child jobs). Returns True/False.
        """
        try:
            # First get the job to check for owner references
            job = self.job_api.read_namespaced_job(name=job_name, namespace=namespace)
            # Check for JobSet owner reference
            jobset_ref = _is_jobset_child(job)
            if jobset_ref:
                if self._delete_jobset(jobset_ref, namespace):
                    return True

            # If no JobSet owner or JobSet deletion failed, delete the job
            self.echo(f"Deleting Batch Job: {job_name}")
            self.job_api.delete_namespaced_job(
                name=job_name, namespace=namespace, propagation_policy="Background"
            )
            return True

        except Exception as e:
            self.echo(f"Failed to delete job {job_name}: {str(e)}")
            return False

    def _kill_pod_process(self, pod):
        """Attempt to kill processes inside a pod.

        Execs `/sbin/killall5` (SysV "kill everything") inside the pod via
        the exec subresource. Returns True if the exec call itself did not
        raise.
        """
        from kubernetes.stream import stream

        try:
            stream(
                self.api_instance.connect_get_namespaced_pod_exec,
                name=pod.metadata.name,
                namespace=pod.metadata.namespace,
                command=["/bin/sh", "-c", "/sbin/killall5"],
                stderr=True,
                stdin=False,
                stdout=True,
                tty=False,
            )
            return True
        except Exception as e:
            self.echo(
                f"Failed to kill processes in pod {pod.metadata.name}: {str(e)}\n"
            )
            return False

    @staticmethod
    def _metaflow_matching_spec(run_id, user, flow_name, annotations, labels):
        """Return truthy iff the given annotations/labels match the criteria.

        Mirrors the filtering logic of KubernetesClient._find_active_pods:
        flow name must match; run_id and user are only checked when given.
        """
        # Handle argo prefixes in run_id like in _find_active_pods
        _argo_run_id = None
        if run_id is not None:
            _argo_run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
        # NOTE(review): unlike the pod matcher, `labels` is not guarded for
        # None here -- `labels.get(...)` would raise if an object has
        # annotations but no labels. Confirm callers always pass a dict.
        return (
            annotations
            and (
                run_id is None
                or (annotations.get("metaflow/run_id") == run_id)
                # We also want to match jobsets launched by argo-workflows.
                # This branch has little practical effect since
                # argo-workflows related terminations are already avoided.
                or (
                    labels.get("workflows.argoproj.io/workflow") is not None
                    and labels.get("workflows.argoproj.io/workflow") == _argo_run_id
                )
            )
            and (user is None or annotations.get("metaflow/user") == user)
            and (annotations.get("metaflow/flow_name") == flow_name)
        )

    def _find_matching_jobs(self, flow_name, run_id=None, user=None):
        """Find jobs that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""

        def paginated_job_finder(namespace):
            # Generator over pages of jobs; follows the API continue token.
            continue_token = None
            while True:
                response = self.job_api.list_namespaced_job(
                    namespace=namespace, limit=100, _continue=continue_token
                )
                yield response.items
                continue_token = response.metadata._continue
                if not continue_token:
                    break

        try:
            matching_jobs = []
            for _jobs in paginated_job_finder(self._namespace):
                for job in _jobs:
                    _match = self._metaflow_matching_spec(
                        run_id=run_id,
                        user=user,
                        flow_name=flow_name,
                        annotations=job.metadata.annotations,
                        labels=job.metadata.labels,
                    )
                    if _match:
                        matching_jobs.append(job)
            return matching_jobs
        except Exception as e:
            # Best effort: report and return nothing rather than crash.
            self.echo(f"Error finding jobs: {str(e)}\n")
            return []

    def _find_matching_jobsets(self, flow_name, run_id=None, user=None):
        """Find jobsets that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""
        if not self.jobset_api:
            return []

        def paginated_jobset_finder(namespace):
            # Unlike the job finder, this materializes ALL pages before
            # returning (custom objects come back as plain dicts).
            continue_token = None
            responses = []
            while True:
                response = self.jobset_api.list_namespaced_custom_object(
                    group="jobset.x-k8s.io",
                    version="v1alpha2",
                    namespace=namespace,
                    plural="jobsets",
                    limit=100,
                    **({"_continue": continue_token} if continue_token else {}),
                )
                continue_token = response.get("metadata", {}).get("continue", None)
                responses.append(response)
                if not continue_token:
                    break
            return responses

        try:
            matching_jobsets = []

            for jobset_response in paginated_jobset_finder(self._namespace):
                for jobset in jobset_response.get("items", []):
                    _match = self._metaflow_matching_spec(
                        run_id=run_id,
                        user=user,
                        flow_name=flow_name,
                        annotations=jobset.get("metadata", {}).get("annotations", {}),
                        labels=jobset.get("metadata", {}).get("labels", {}),
                    )
                    if _match:
                        matching_jobsets.append(jobset)

            return matching_jobsets
        except Exception as e:
            self.echo(f"Error finding jobsets: {str(e)}\n")
            return []

    def _kill_pods_for_job(self, job):
        """Find and kill pods associated with a specific job.

        Returns True if processes were killed in at least one Running pod.
        """
        job_name = job.metadata.name
        namespace = job.metadata.namespace

        try:
            # Find pods with the job-name label matching this job
            pods = self.api_instance.list_namespaced_pod(
                namespace=namespace, label_selector=f"job-name={job_name}"
            )

            killed_pods = 0
            for pod in pods.items:
                # Only pods actively running can have their processes killed.
                if pod.status.phase in ["Running"]:
                    self.echo(
                        f"Killing processes in pod {pod.metadata.name} for job {job_name}"
                    )
                    if self._kill_pod_process(pod):
                        killed_pods += 1

            return killed_pods > 0
        except Exception as e:
            self.echo(f"Failed to find/kill pods for job {job_name}: {str(e)}")
            return False

    def _handle_job_outcome(self, job, outcome):
        """Handle a job based on the derived outcome.

        Returns None for LEAVE_UNCHANGED, True/False for attempted
        operations, False for unknown outcomes.
        """
        job_name = job.metadata.name
        namespace = job.metadata.namespace

        if outcome == JobOutcomes.LEAVE_UNCHANGED:
            # Terminal state already; nothing to do (intentionally silent).
            return None
        elif outcome == JobOutcomes.DELETE:
            self.echo(f"Deleting Job {job_name}")
            return self._delete_job(job_name, namespace)
        elif outcome == JobOutcomes.KILL:
            self.echo(f"Killing Job {job_name}")
            # First try to kill the pod processes
            # (_kill_pods_for_job returns a bool, so `> 0` is a truth test).
            pods_killed = self._kill_pods_for_job(job)
            if pods_killed > 0:
                return True
            # Worst case if we are not able to delete any pod, then delete the Job.
            return self._delete_job(job_name, namespace)
        else:
            self.echo(f"Unknown outcome {outcome} for job {job_name}\n")
            return False

    def _handle_jobset_outcome(self, jobset, outcome):
        """Handle a jobset based on the derived outcome.

        Returns None for LEAVE_UNCHANGED, True/False otherwise. Note that
        KILL is not a valid jobset outcome and falls into the unknown branch.
        """
        jobset_name = jobset.get("metadata", {}).get("name", "unknown")
        namespace = jobset.get("metadata", {}).get("namespace", self._namespace)

        if outcome == JobOutcomes.LEAVE_UNCHANGED:
            # Terminal state already; nothing to do (intentionally silent).
            return None
        elif outcome == JobOutcomes.DELETE:
            self.echo(f"Deleting JobSet {jobset_name}")
            try:
                self.jobset_api.delete_namespaced_custom_object(
                    group="jobset.x-k8s.io",
                    version="v1alpha2",
                    namespace=namespace,
                    plural="jobsets",
                    name=jobset_name,
                )
                return True
            except Exception as e:
                self.echo(f"Failed to delete JobSet {jobset_name}: {str(e)}")
                return False
        else:
            self.echo(f"Unknown outcome {outcome} for JobSet {jobset_name}")
            return False

    def extract_matching_jobs_and_jobsets(self, flow_name, run_id, user):
        """Extract matching jobs and jobsets based on the flow_name, run_id, and user criteria.

        Returns two lists of (object, derived_outcome) pairs -- one for
        jobs, one for jobsets -- without acting on them.
        """
        jobs = self._find_matching_jobs(flow_name, run_id, user)
        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
        return [(j, derive_job_outcome(j.status)) for j in jobs], [
            (j, derive_jobset_outcome(j.get("status", {}))) for j in jobsets
        ]

    def process_matching_jobs_and_jobsets(self, flow_name, run_id, user):
        """Process all matching jobs and jobsets based on their derived outcomes.

        Returns (results, num_jobs_acted_on, num_jobsets_acted_on); objects
        whose outcome was LEAVE_UNCHANGED are not counted.
        """
        results = []
        # No-op progress callback unless a progress bar was supplied.
        progress_update = lambda x: x
        if self.progress_bar:
            progress_update = lambda x: self.progress_bar.update(1, x)

        # Process matching jobs
        _jobs, _jobsets = [], []
        jobs = self._find_matching_jobs(flow_name, run_id, user)
        for job in jobs:
            outcome = derive_job_outcome(job.status)
            result = self._handle_job_outcome(job, outcome)
            # None means "left unchanged" -- only real operations are counted.
            if result is not None:
                progress_update("💀 Killing Job %s" % job.metadata.name)
                results.append(result)
                _jobs.append(result)

        # Process matching jobsets
        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
        for jobset in jobsets:
            jobset_status = jobset.get("status", {})
            outcome = derive_jobset_outcome(jobset_status)
            result = self._handle_jobset_outcome(jobset, outcome)
            if result is not None:
                progress_update(
                    "💀 Deleting JobSet %s"
                    % jobset.get("metadata", {}).get("name", "unknown")
                )
                results.append(result)
                _jobsets.append(result)

        return results, len(_jobs), len(_jobsets)

    def process_matching_jobs_and_jobsets_force_all(self, flow_name, run_id, user):
        """Force process ALL matching jobs and jobsets regardless of their status/outcome.

        Every match is handled with a DELETE outcome; None results (which
        cannot occur for DELETE, but are handled defensively) count as
        success. Returns the same triple as the non-force variant.
        """
        results = []
        progress_update = lambda x: x
        if self.progress_bar:
            progress_update = lambda x: self.progress_bar.update(1, x)

        # Process matching jobs - FORCE DELETE ALL
        _jobs, _jobsets = [], []
        jobs = self._find_matching_jobs(flow_name, run_id, user)
        for job in jobs:
            # Force DELETE outcome regardless of actual status
            result = self._handle_job_outcome(job, JobOutcomes.DELETE)
            progress_update("🔥 FORCE Deleting Job %s" % job.metadata.name)
            results.append(
                result if result is not None else True
            )  # Treat None as success for force mode
            _jobs.append(result if result is not None else True)

        # Process matching jobsets - FORCE DELETE ALL
        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
        for jobset in jobsets:
            # Force DELETE outcome regardless of actual status
            result = self._handle_jobset_outcome(jobset, JobOutcomes.DELETE)
            progress_update(
                "🔥 FORCE Deleting JobSet %s"
                % jobset.get("metadata", {}).get("name", "unknown")
            )
            results.append(
                result if result is not None else True
            )  # Treat None as success for force mode
            _jobsets.append(result if result is not None else True)

        return results, len(_jobs), len(_jobsets)
@@ -0,0 +1,140 @@
1
+ from metaflow.cards import Markdown, Table
2
+ from metaflow.metaflow_current import current
3
+
4
+ from .utils import get_storage_path
5
+ from ..card_utilities.async_cards import CardRefresher
6
+ from ..card_utilities.extra_components import BarPlot, ViolinPlot
7
+
8
+
9
class NimMetricsRefresher(CardRefresher):
    """CardRefresher that renders task-level NIM API metrics as a live card.

    Reads request metrics from a per-task sqlite file (written elsewhere by
    the NIM plugin) and keeps a bar plot of success/error counts plus a
    violin plot of end-to-end latency per model up to date.
    """

    CARD_ID = "nim_metrics"

    def __init__(self) -> None:
        # Chart components by name; empty until the first full render.
        self._metrics_charts = {}
        self._last_updated_on = None
        self._already_rendered = False
        # Path of the sqlite metrics file for the current task.
        self._file_name = get_storage_path(current.task_id)

    def sqlite_fetch_func(self, conn):
        """Read the whole `metrics` table from *conn* into a plain dict.

        Sums the error/success counters and collects the per-request columns
        into parallel lists. Always closes the connection, even on error.
        """
        try:
            cursor = conn.cursor()
            cursor.execute(
                "SELECT error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model FROM metrics"
            )
            rows = cursor.fetchall()
            data = {
                "error": 0,
                "success": 0,
                "status_code": [],
                "prompt_tokens": [],
                "completion_tokens": [],
                "e2e_time": [],
                "model": [],
            }
            for row in rows:
                data["error"] += row[0]
                data["success"] += row[1]
                data["status_code"].append(row[2])
                data["prompt_tokens"].append(row[3])
                data["completion_tokens"].append(row[4])
                data["e2e_time"].append(row[5])
                data["model"].append(row[6])
            return data
        finally:
            conn.close()

    def render_card_fresh(self, current_card, data):
        """Clear the card and (re)create the chart components from scratch.

        Note: *data* is not consumed here; the charts are populated later by
        `update_only_components`.
        """
        self._already_rendered = True
        current_card.clear()
        current_card.append(Markdown("## Metrics"))

        self._metrics_charts["request_success"] = BarPlot(
            title="Request success",
            category_name="category",
            value_name="amount",
            orientation="horizontal",
        )
        self._metrics_charts["latency_distribution"] = ViolinPlot(
            title="Latency distribution (s)",
            category_col_name="model",
            value_col_name="e2e_time",
        )

        # Lay the two charts out as rows of a single-column table.
        current_card.append(
            Table(
                data=[
                    [
                        self._metrics_charts["request_success"],
                    ],
                    [self._metrics_charts["latency_distribution"]],
                ]
            )
        )
        current_card.refresh()

    def on_startup(self, current_card):
        """Render the initial placeholder card before any data arrives."""
        current_card.append(Markdown("# Task-level NIM API metrics"))
        current_card.append(
            Markdown(
                "_waiting for data to appear_",
            )
        )
        current_card.refresh()

    def on_error(self, current_card, error_message):
        """Surface an error on the card (unless it's just a missing file).

        FileNotFoundError is expected before the metrics file is created, so
        it is ignored. Once a real render has happened, errors are silently
        ignored to avoid clobbering good data.
        """
        if isinstance(error_message, FileNotFoundError):
            return

        if not self._already_rendered:
            current_card.clear()
            current_card.append(
                Markdown(
                    f"## Error: {str(error_message)}",
                )
            )
            current_card.refresh()

    def update_only_components(self, current_card, data_object):
        """Push fresh metric values into the existing chart specs in place.

        Assumes the charts were already created by `render_card_fresh` and
        that `data_object["metrics"]["e2e_time"]` is non-empty (guaranteed
        by the guard in `on_update`, since min/max would fail otherwise).
        """
        # update request success data
        self._metrics_charts["request_success"].spec["data"][0]["values"] = [
            {
                "category": "Successful requests",
                "amount": data_object["metrics"]["success"],
            },
            {"category": "Errors", "amount": data_object["metrics"]["error"]},
        ]

        latency_data = []
        times = []
        for m, e in zip(
            data_object["metrics"]["model"], data_object["metrics"]["e2e_time"]
        ):
            latency_data.append({"model": m, "e2e_time": e})
            times.append(e)

        # update latency data
        self._metrics_charts["latency_distribution"].spec["data"][0][
            "values"
        ] = latency_data

        # update domain for latency plot (pad by 10% of the max on each side)
        min_time = min(times)
        max_time = max(times)
        for scale in self._metrics_charts["latency_distribution"].spec["scales"]:
            if scale["name"] == "xscale":
                scale["domain"] = [min_time - max_time * 0.1, max_time + max_time * 0.1]

        current_card.refresh()

    def on_update(self, current_card, data_object):
        """Dispatch an incoming data snapshot to the right render path.

        Empty snapshot -> no-op; first data -> full render; data present but
        no requests recorded yet -> no-op; otherwise update in place.
        """
        data_object_keys = set(data_object.keys())
        if len(data_object_keys) == 0:
            return
        if len(self._metrics_charts) == 0:
            self.render_card_fresh(current_card, data_object)
            return
        elif len(data_object["metrics"]["status_code"]) == 0:
            return
        else:
            self.update_only_components(current_card, data_object)
            return