ob-metaflow-extensions 1.1.142__py2.py3-none-any.whl → 1.4.33__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (97) hide show
  1. metaflow_extensions/outerbounds/__init__.py +1 -1
  2. metaflow_extensions/outerbounds/plugins/__init__.py +26 -5
  3. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  4. metaflow_extensions/outerbounds/plugins/apps/app_deploy_decorator.py +146 -0
  5. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +10 -0
  6. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py +1200 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +146 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +12 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +161 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +868 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +288 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +139 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +398 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1088 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +303 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  33. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  34. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  35. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +78 -0
  36. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  37. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  38. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  39. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  40. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  41. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +17 -3
  42. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
  43. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
  44. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  45. metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
  46. metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
  47. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
  48. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  49. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
  50. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +100 -19
  51. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +6 -1
  52. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  53. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  54. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  55. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  56. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  57. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  58. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  59. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +171 -16
  60. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  61. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  62. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1710 -114
  63. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  64. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  65. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  66. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  67. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  68. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  69. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  70. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  71. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  72. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  73. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  74. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +38 -2
  75. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +44 -4
  76. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +6 -3
  77. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +13 -7
  78. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +8 -2
  79. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  80. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  81. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  82. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  83. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  84. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  85. metaflow_extensions/outerbounds/remote_config.py +27 -3
  86. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +87 -2
  87. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  88. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  89. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  90. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  91. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  92. {ob_metaflow_extensions-1.1.142.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/METADATA +2 -2
  93. ob_metaflow_extensions-1.4.33.dist-info/RECORD +134 -0
  94. metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
  95. ob_metaflow_extensions-1.1.142.dist-info/RECORD +0 -64
  96. {ob_metaflow_extensions-1.1.142.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/WHEEL +0 -0
  97. {ob_metaflow_extensions-1.1.142.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/top_level.txt +0 -0
@@ -90,6 +90,7 @@ class DockerEnvironmentException(MetaflowException):
90
90
  class DockerEnvironment(MetaflowEnvironment):
91
91
  TYPE = "fast-bakery"
92
92
  _filecache = None
93
+ _force_rebuild = False
93
94
 
94
95
  def __init__(self, flow):
95
96
  self.skipped_steps = set()
@@ -178,12 +179,20 @@ class DockerEnvironment(MetaflowEnvironment):
178
179
 
179
180
  if self.skipped_steps:
180
181
  self.delegate = CondaEnvironment(self.flow)
182
+ self.delegate._force_rebuild = self._force_rebuild
181
183
  self.delegate.set_local_root(self.local_root)
182
184
  self.delegate.validate_environment(echo, self.datastore_type)
183
185
  self.delegate.init_environment(echo, self.skipped_steps)
184
186
 
185
187
  def _bake(self, steps) -> Dict[str, FastBakeryApiResponse]:
186
188
  metafile_path = get_fastbakery_metafile_path(self.local_root, self.flow.name)
189
+ if self._force_rebuild:
190
+ # clear the metafile if force rebuilding, effectively skipping the cache.
191
+ try:
192
+ os.remove(metafile_path)
193
+ except Exception:
194
+ pass
195
+
187
196
  logger_lock = threading.Lock()
188
197
 
189
198
  @cache_request(metafile_path)
@@ -201,7 +210,8 @@ class DockerEnvironment(MetaflowEnvironment):
201
210
  bakery.pypi_packages(pypi_packages)
202
211
  bakery.conda_packages(conda_packages)
203
212
  bakery.base_image(base_image)
204
- # bakery.ignore_cache()
213
+ if self._force_rebuild:
214
+ bakery.ignore_cache()
205
215
 
206
216
  with logger_lock:
207
217
  self.logger(f"🍳 Baking [{ref}] ...")
@@ -341,12 +351,16 @@ class DockerEnvironment(MetaflowEnvironment):
341
351
  config.append("--disable=F0401")
342
352
  return config
343
353
 
344
- def get_package_commands(self, codepackage_url, datastore_type):
354
+ def get_package_commands(
355
+ self, codepackage_url, datastore_type, code_package_metadata=None
356
+ ):
345
357
  # we must set the skip install flag at this stage in order to skip package downloads,
346
358
  # doing so in bootstrap_commands is too late in the lifecycle.
347
359
  return [
348
360
  "export METAFLOW_SKIP_INSTALL_DEPENDENCIES=$FASTBAKERY_IMAGE",
349
- ] + super().get_package_commands(codepackage_url, datastore_type)
361
+ ] + super().get_package_commands(
362
+ codepackage_url, datastore_type, code_package_metadata=code_package_metadata
363
+ )
350
364
 
351
365
  def bootstrap_commands(self, step_name, datastore_type):
352
366
  if step_name in self.skipped_steps:
@@ -122,6 +122,7 @@ class FastBakery:
122
122
  "responseMaxAgeSeconds": 0,
123
123
  "layerMaxAgeSeconds": 0,
124
124
  "baseImageMaxAgeSeconds": 0,
125
+ "overwriteExistingLayers": True, # Used primarily to rewrite possibly corrupted layers.
125
126
  }
126
127
  return self
127
128
 
@@ -5,6 +5,7 @@ import time
5
5
 
6
6
  from metaflow.exception import MetaflowException
7
7
  from metaflow.metaflow_config import KUBERNETES_NAMESPACE
8
+ from .pod_killer import PodKiller
8
9
 
9
10
 
10
11
  CLIENT_REFRESH_INTERVAL_SECONDS = 300
@@ -105,50 +106,23 @@ class KubernetesClient(object):
105
106
  return list(results)
106
107
 
107
108
  def kill_pods(self, flow_name, run_id, user, echo):
108
- from kubernetes.stream import stream
109
-
110
- api_instance = self._client.CoreV1Api()
111
- job_api = self._client.BatchV1Api()
112
- pods = self._find_active_pods(flow_name, run_id, user)
113
-
114
- def _kill_pod(pod):
115
- echo("Killing Kubernetes pod %s\n" % pod.metadata.name)
116
- try:
117
- stream(
118
- api_instance.connect_get_namespaced_pod_exec,
119
- name=pod.metadata.name,
120
- namespace=pod.metadata.namespace,
121
- command=[
122
- "/bin/sh",
123
- "-c",
124
- "/sbin/killall5",
125
- ],
126
- stderr=True,
127
- stdin=False,
128
- stdout=True,
129
- tty=False,
130
- )
131
- except Exception:
132
- # best effort kill for pod can fail.
133
- try:
134
- job_name = pod.metadata.labels.get("job-name", None)
135
- if job_name is None:
136
- raise Exception("Could not determine job name")
137
-
138
- job_api.patch_namespaced_job(
139
- name=job_name,
140
- namespace=pod.metadata.namespace,
141
- field_manager="metaflow",
142
- body={"spec": {"parallelism": 0}},
143
- )
144
- except Exception as e:
145
- echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
146
-
147
- with ThreadPoolExecutor() as executor:
148
- operated_pods = list(executor.map(_kill_pod, pods))
149
-
150
- if not operated_pods:
151
- echo("No active Kubernetes pods found for run *%s*" % run_id)
109
+ # Create PodKiller instance
110
+ killer = PodKiller(self._client, echo, self._namespace)
111
+
112
+ # Process all matching jobs and jobsets based on their outcomes
113
+ (
114
+ job_jobset_results,
115
+ num_jobs,
116
+ num_jobsets,
117
+ ) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
118
+
119
+ if job_jobset_results:
120
+ successful_operations = sum(1 for result in job_jobset_results if result)
121
+ echo(
122
+ f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
123
+ )
124
+ else:
125
+ echo("No matching jobs or jobsets found for run *%s*" % run_id)
152
126
 
153
127
  def job(self, **kwargs):
154
128
  from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
@@ -0,0 +1,374 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from kubernetes.client.models.v1_job import V1Job
5
+ from kubernetes.client.models.v1_job_status import V1JobStatus
6
+
7
+
8
+ def _is_jobset_child(job: "V1Job"):
9
+ if job.metadata.owner_references:
10
+ for owner_ref in job.metadata.owner_references:
11
+ if owner_ref.kind == "JobSet":
12
+ return owner_ref
13
+ return None
14
+
15
+
16
class JobOutcomes:
    """String constants naming the action to take on a matched Job or JobSet."""

    # Try to stop the processes running in the job's pods first.
    KILL = "kill"
    # Remove the Job / JobSet object from the cluster.
    DELETE = "delete"
    # Do nothing; the object is left exactly as it is.
    LEAVE_UNCHANGED = "leave_unchanged"
20
+
21
+
22
def derive_jobset_outcome(jobset_status):
    """Pick the action for a JobSet from its ``status`` mapping.

    Parameters
    ----------
    jobset_status : dict
        The ``status`` section of a JobSet custom object.

    Returns
    -------
    ``JobOutcomes.LEAVE_UNCHANGED`` when ``terminalState`` is set to a
    truthy value, ``JobOutcomes.DELETE`` otherwise.
    """
    if jobset_status.get("terminalState"):
        return JobOutcomes.LEAVE_UNCHANGED
    return JobOutcomes.DELETE
28
+
29
+
30
def derive_job_outcome(job_status: "V1JobStatus") -> str:
    """Pick the action for a Job from its status.

    Parameters
    ----------
    job_status : V1JobStatus
        Status of the Job; ``start_time``, ``succeeded``, ``failed`` and
        ``completion_time`` are read.

    Returns
    -------
    str
        * ``JobOutcomes.DELETE`` when the job has no ``start_time``.
        * ``JobOutcomes.LEAVE_UNCHANGED`` when ``succeeded`` or ``failed``
          is truthy, or ``completion_time`` is set.
        * ``JobOutcomes.DELETE`` in every other case.
    """
    if job_status.start_time is None:
        # The job never started, so there is nothing to preserve: wipe it.
        return JobOutcomes.DELETE

    reached_terminal_state = (
        job_status.succeeded
        or job_status.failed
        or job_status.completion_time is not None
    )
    if reached_terminal_state:
        return JobOutcomes.LEAVE_UNCHANGED

    # The job started but has neither succeeded, failed nor completed. Whether
    # it is still active or sits in some odd in-between state, the decision is
    # the same: delete it.
    # NOTE(review): JobOutcomes.KILL is never produced here — confirm that
    # deleting (rather than killing pod processes) is the intended behavior.
    return JobOutcomes.DELETE
47
+
48
+
49
class PodKiller:
    """Find and terminate the Kubernetes Jobs and JobSets of a Metaflow run.

    Jobs and JobSets are matched on their ``metaflow/*`` annotations (see
    ``_metaflow_matching_spec``) within a single namespace, and are then
    deleted, killed or left alone depending on the outcome derived from
    their status.

    Parameters
    ----------
    kubernetes_client
        Object exposing ``CoreV1Api()``, ``BatchV1Api()`` and
        ``CustomObjectsApi()`` factories.
    echo_func : callable
        Called with a single message string to report progress and errors.
    namespace : str
        Namespace in which Jobs and JobSets are listed.
    progress_bar : optional
        When truthy, ``progress_bar.update(1, message)`` is called once per
        processed object.
    """

    # API coordinates of the JobSet custom resource.
    _JOBSET_GROUP = "jobset.x-k8s.io"
    _JOBSET_VERSION = "v1alpha2"
    _JOBSET_PLURAL = "jobsets"
    # Number of objects requested per page when listing.
    _PAGE_SIZE = 100

    def __init__(self, kubernetes_client, echo_func, namespace, progress_bar=None):
        self.client = kubernetes_client
        self.echo = echo_func
        self.api_instance = self.client.CoreV1Api()
        self.job_api = self.client.BatchV1Api()
        self._namespace = namespace
        self.jobset_api = self.client.CustomObjectsApi()
        self.progress_bar = progress_bar

    def _report_progress(self, message):
        """Advance the progress bar by one step, if a progress bar was given."""
        if self.progress_bar:
            self.progress_bar.update(1, message)

    def _delete_jobset_object(self, jobset_name, namespace):
        """Issue the delete call for one JobSet; API errors propagate."""
        self.jobset_api.delete_namespaced_custom_object(
            group=self._JOBSET_GROUP,
            version=self._JOBSET_VERSION,
            namespace=namespace,
            plural=self._JOBSET_PLURAL,
            name=jobset_name,
        )

    def _delete_jobset(self, owner_ref, namespace):
        """Delete the JobSet named by ``owner_ref``.

        Returns ``True`` on success and ``False`` when the JobSet API is
        unavailable or the delete call raised (the error is echoed).
        """
        if not self.jobset_api:
            self.echo("JobSet API not available, cannot delete JobSet\n")
            return False

        try:
            jobset_name = owner_ref.name
            self.echo(f"Deleting JobSet: {jobset_name}\n")
            self._delete_jobset_object(jobset_name, namespace)
            return True
        except Exception as e:
            self.echo(f"Failed to delete JobSet {owner_ref.name}: {str(e)}\n")
            return False

    def _delete_job(self, job_name, namespace):
        """Delete a Batch Job, preferring to delete its owning JobSet.

        The job is re-read first; if it is owned by a JobSet and deleting
        that JobSet succeeds, the job itself is not deleted explicitly.
        Returns ``True`` on success, ``False`` if any call raised.
        """
        try:
            job = self.job_api.read_namespaced_job(name=job_name, namespace=namespace)
            jobset_ref = _is_jobset_child(job)
            if jobset_ref and self._delete_jobset(jobset_ref, namespace):
                return True

            # No JobSet owner, or deleting the JobSet failed: delete the job.
            self.echo(f"Deleting Batch Job: {job_name}")
            self.job_api.delete_namespaced_job(
                name=job_name, namespace=namespace, propagation_policy="Background"
            )
            return True
        except Exception as e:
            self.echo(f"Failed to delete job {job_name}: {str(e)}")
            return False

    def _kill_pod_process(self, pod):
        """Run ``/sbin/killall5`` inside ``pod`` through the exec API.

        Returns ``True`` when the exec call returned without raising,
        ``False`` otherwise (the error is echoed).
        """
        from kubernetes.stream import stream

        try:
            stream(
                self.api_instance.connect_get_namespaced_pod_exec,
                name=pod.metadata.name,
                namespace=pod.metadata.namespace,
                command=["/bin/sh", "-c", "/sbin/killall5"],
                stderr=True,
                stdin=False,
                stdout=True,
                tty=False,
            )
            return True
        except Exception as e:
            self.echo(
                f"Failed to kill processes in pod {pod.metadata.name}: {str(e)}\n"
            )
            return False

    @staticmethod
    def _metaflow_matching_spec(run_id, user, flow_name, annotations, labels):
        """Tell whether an object's metadata belongs to the requested run.

        Parameters
        ----------
        run_id : str or None
            Run to match; ``None`` matches any run.
        user : str or None
            User to match; ``None`` matches any user.
        flow_name : str
            Flow name that ``metaflow/flow_name`` must equal.
        annotations : dict or None
            Object annotations; a missing or empty mapping never matches.
        labels : dict or None
            Object labels; ``None`` is treated as an empty mapping.

        Returns
        -------
        bool
        """
        if not annotations:
            return False
        if annotations.get("metaflow/flow_name") != flow_name:
            return False
        if user is not None and annotations.get("metaflow/user") != user:
            return False
        if run_id is None or annotations.get("metaflow/run_id") == run_id:
            return True

        # Also match objects launched by argo-workflows, whose workflow label
        # carries the run id without its "argo-" prefix.
        # Labels are optional in Kubernetes metadata, so guard against None
        # instead of letting one label-less object abort the whole search.
        labels = labels or {}
        argo_run_id = run_id[len("argo-") :] if run_id.startswith("argo-") else run_id
        workflow = labels.get("workflows.argoproj.io/workflow")
        return workflow is not None and workflow == argo_run_id

    def _iter_job_pages(self, namespace):
        """Yield the ``items`` list of every page of Jobs in ``namespace``."""
        continue_token = None
        while True:
            response = self.job_api.list_namespaced_job(
                namespace=namespace, limit=self._PAGE_SIZE, _continue=continue_token
            )
            yield response.items
            continue_token = response.metadata._continue
            if not continue_token:
                break

    def _iter_jobset_pages(self, namespace):
        """Yield every page (a dict response) of JobSets in ``namespace``."""
        continue_token = None
        while True:
            response = self.jobset_api.list_namespaced_custom_object(
                group=self._JOBSET_GROUP,
                version=self._JOBSET_VERSION,
                namespace=namespace,
                plural=self._JOBSET_PLURAL,
                limit=self._PAGE_SIZE,
                **({"_continue": continue_token} if continue_token else {}),
            )
            yield response
            continue_token = response.get("metadata", {}).get("continue", None)
            if not continue_token:
                break

    def _find_matching_jobs(self, flow_name, run_id=None, user=None):
        """Return the Jobs whose metadata matches the flow, run and user.

        Any error while listing is echoed and results in an empty list.
        """
        try:
            return [
                job
                for page in self._iter_job_pages(self._namespace)
                for job in page
                if self._metaflow_matching_spec(
                    run_id=run_id,
                    user=user,
                    flow_name=flow_name,
                    annotations=job.metadata.annotations,
                    labels=job.metadata.labels,
                )
            ]
        except Exception as e:
            self.echo(f"Error finding jobs: {str(e)}\n")
            return []

    def _find_matching_jobsets(self, flow_name, run_id=None, user=None):
        """Return the JobSets whose metadata matches the flow, run and user.

        Returns an empty list when the JobSet API is unavailable or when
        listing raised (the error is echoed).
        """
        if not self.jobset_api:
            return []

        try:
            matching_jobsets = []
            for page in self._iter_jobset_pages(self._namespace):
                for jobset in page.get("items", []):
                    metadata = jobset.get("metadata", {})
                    if self._metaflow_matching_spec(
                        run_id=run_id,
                        user=user,
                        flow_name=flow_name,
                        annotations=metadata.get("annotations", {}),
                        labels=metadata.get("labels", {}),
                    ):
                        matching_jobsets.append(jobset)
            return matching_jobsets
        except Exception as e:
            self.echo(f"Error finding jobsets: {str(e)}\n")
            return []

    def _kill_pods_for_job(self, job):
        """Kill the processes of every ``Running`` pod that belongs to ``job``.

        Pods are looked up through the ``job-name`` label. Returns ``True``
        when at least one pod was killed successfully, ``False`` otherwise
        (including when listing pods raised).
        """
        job_name = job.metadata.name
        namespace = job.metadata.namespace

        try:
            pods = self.api_instance.list_namespaced_pod(
                namespace=namespace, label_selector=f"job-name={job_name}"
            )

            killed_pods = 0
            for pod in pods.items:
                if pod.status.phase in ["Running"]:
                    self.echo(
                        f"Killing processes in pod {pod.metadata.name} for job {job_name}"
                    )
                    if self._kill_pod_process(pod):
                        killed_pods += 1

            return killed_pods > 0
        except Exception as e:
            self.echo(f"Failed to find/kill pods for job {job_name}: {str(e)}")
            return False

    def _handle_job_outcome(self, job, outcome):
        """Apply ``outcome`` to ``job``.

        Returns ``None`` when the job is left unchanged, otherwise ``True``
        or ``False`` for whether the operation succeeded. An unrecognised
        outcome is echoed and reported as ``False``.
        """
        job_name = job.metadata.name
        namespace = job.metadata.namespace

        if outcome == JobOutcomes.LEAVE_UNCHANGED:
            return None
        if outcome == JobOutcomes.DELETE:
            self.echo(f"Deleting Job {job_name}")
            return self._delete_job(job_name, namespace)
        if outcome == JobOutcomes.KILL:
            self.echo(f"Killing Job {job_name}")
            # First try to kill the pod processes.
            if self._kill_pods_for_job(job):
                return True
            # Worst case, no pod could be killed: delete the Job instead.
            return self._delete_job(job_name, namespace)

        self.echo(f"Unknown outcome {outcome} for job {job_name}\n")
        return False

    def _handle_jobset_outcome(self, jobset, outcome):
        """Apply ``outcome`` to ``jobset`` (a custom-object dict).

        Returns ``None`` when the JobSet is left unchanged, otherwise
        ``True`` or ``False`` for whether the deletion succeeded. Any
        outcome other than leave-unchanged or delete is echoed as unknown
        and reported as ``False``.
        """
        metadata = jobset.get("metadata", {})
        jobset_name = metadata.get("name", "unknown")
        namespace = metadata.get("namespace", self._namespace)

        if outcome == JobOutcomes.LEAVE_UNCHANGED:
            return None
        if outcome == JobOutcomes.DELETE:
            self.echo(f"Deleting JobSet {jobset_name}")
            try:
                self._delete_jobset_object(jobset_name, namespace)
                return True
            except Exception as e:
                self.echo(f"Failed to delete JobSet {jobset_name}: {str(e)}")
                return False

        self.echo(f"Unknown outcome {outcome} for JobSet {jobset_name}")
        return False

    def extract_matching_jobs_and_jobsets(self, flow_name, run_id, user):
        """Return matching jobs and jobsets paired with their derived outcome.

        Returns
        -------
        tuple
            ``([(job, outcome), ...], [(jobset, outcome), ...])``.
            Nothing is modified on the cluster.
        """
        jobs = self._find_matching_jobs(flow_name, run_id, user)
        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
        job_pairs = [(job, derive_job_outcome(job.status)) for job in jobs]
        jobset_pairs = [
            (jobset, derive_jobset_outcome(jobset.get("status", {})))
            for jobset in jobsets
        ]
        return job_pairs, jobset_pairs

    def process_matching_jobs_and_jobsets(self, flow_name, run_id, user):
        """Process all matching jobs and jobsets based on their derived outcomes.

        Objects left unchanged are not counted.

        Returns
        -------
        tuple
            ``(results, num_jobs, num_jobsets)`` where ``results`` holds one
            boolean per operated object (jobs first, then jobsets).
        """
        results = []

        num_jobs = 0
        for job in self._find_matching_jobs(flow_name, run_id, user):
            result = self._handle_job_outcome(job, derive_job_outcome(job.status))
            if result is not None:
                self._report_progress("💀 Killing Job %s" % job.metadata.name)
                results.append(result)
                num_jobs += 1

        # JobSets are looked up only after the jobs were handled, because
        # deleting a job may already have deleted its owning JobSet.
        num_jobsets = 0
        for jobset in self._find_matching_jobsets(flow_name, run_id, user):
            outcome = derive_jobset_outcome(jobset.get("status", {}))
            result = self._handle_jobset_outcome(jobset, outcome)
            if result is not None:
                self._report_progress(
                    "💀 Deleting JobSet %s"
                    % jobset.get("metadata", {}).get("name", "unknown")
                )
                results.append(result)
                num_jobsets += 1

        return results, num_jobs, num_jobsets

    def process_matching_jobs_and_jobsets_force_all(self, flow_name, run_id, user):
        """Delete ALL matching jobs and jobsets regardless of their status.

        Returns
        -------
        tuple
            ``(results, num_jobs, num_jobsets)``; every matching object is
            counted and a ``None`` handler result is recorded as ``True``.
        """
        results = []

        num_jobs = 0
        for job in self._find_matching_jobs(flow_name, run_id, user):
            # Force the DELETE outcome regardless of the actual status.
            result = self._handle_job_outcome(job, JobOutcomes.DELETE)
            self._report_progress("🔥 FORCE Deleting Job %s" % job.metadata.name)
            # Treat None as success in force mode.
            results.append(result if result is not None else True)
            num_jobs += 1

        num_jobsets = 0
        for jobset in self._find_matching_jobsets(flow_name, run_id, user):
            # Force the DELETE outcome regardless of the actual status.
            result = self._handle_jobset_outcome(jobset, JobOutcomes.DELETE)
            self._report_progress(
                "🔥 FORCE Deleting JobSet %s"
                % jobset.get("metadata", {}).get("name", "unknown")
            )
            # Treat None as success in force mode.
            results.append(result if result is not None else True)
            num_jobsets += 1

        return results, num_jobs, num_jobsets
@@ -1,8 +1,7 @@
1
- import sqlite3
2
1
  from metaflow.cards import Markdown, Table
3
2
  from metaflow.metaflow_current import current
4
3
 
5
- from .utilities import get_storage_path
4
+ from .utils import get_storage_path
6
5
  from ..card_utilities.async_cards import CardRefresher
7
6
  from ..card_utilities.extra_components import BarPlot, ViolinPlot
8
7
 
@@ -17,9 +16,7 @@ class NimMetricsRefresher(CardRefresher):
17
16
  self._file_name = get_storage_path(current.task_id)
18
17
 
19
18
  def sqlite_fetch_func(self, conn):
20
- cursor = conn.cursor()
21
19
  try:
22
- conn = sqlite3.connect(self._file_name)
23
20
  cursor = conn.cursor()
24
21
  cursor.execute(
25
22
  "SELECT error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model FROM metrics"
@@ -85,7 +82,6 @@ class NimMetricsRefresher(CardRefresher):
85
82
  current_card.refresh()
86
83
 
87
84
  def on_error(self, current_card, error_message):
88
-
89
85
  if isinstance(error_message, FileNotFoundError):
90
86
  return
91
87
 
@@ -99,7 +95,6 @@ class NimMetricsRefresher(CardRefresher):
99
95
  current_card.refresh()
100
96
 
101
97
  def update_only_components(self, current_card, data_object):
102
-
103
98
  # update request success data
104
99
  self._metrics_charts["request_success"].spec["data"][0]["values"] = [
105
100
  {
@@ -1,64 +1,31 @@
1
- from functools import partial
2
- from uuid import uuid4
3
- import os, time
4
- from metaflow.decorators import StepDecorator
1
+ import os
2
+ import time
5
3
  from metaflow import current
6
-
4
+ from .utils import get_storage_path, NIM_MONITOR_LOCAL_STORAGE_ROOT
7
5
  from .nim_manager import NimManager
6
+ from metaflow.decorators import StepDecorator
8
7
  from .card import NimMetricsRefresher
9
- from .utilities import get_storage_path, NIM_MONITOR_LOCAL_STORAGE_ROOT
10
- from ..card_utilities.async_cards import AsyncPeriodicRefresher
11
8
  from ..card_utilities.injector import CardDecoratorInjector
9
+ from ..card_utilities.async_cards import AsyncPeriodicRefresher
12
10
 
13
11
 
14
12
  class NimDecorator(StepDecorator, CardDecoratorInjector):
15
- """
16
- This decorator is used to run NIM containers in Metaflow tasks as sidecars.
17
-
18
- User code call
19
- -----------
20
- @nim(
21
- models=['meta/llama3-8b-instruct', 'meta/llama3-70b-instruct'],
22
- backend='managed'
23
- )
24
-
25
- Valid backend options
26
- ---------------------
27
- - 'managed': Outerbounds selects a compute provider based on the model.
28
-
29
- Valid model options
30
- ----------------
31
- - 'meta/llama3-8b-instruct': 8B parameter model
32
- - 'meta/llama3-70b-instruct': 70B parameter model
33
- - any model here: https://nvcf.ngc.nvidia.com/functions?filter=nvidia-functions
34
-
35
- Parameters
36
- ----------
37
- models: list[NIM]
38
- List of NIM containers running models in sidecars.
39
- backend: str
40
- Compute provider to run the NIM container.
41
- queue_timeout : int
42
- Time to keep the job in NVCF's queue.
43
- """
44
-
45
13
  name = "nim"
14
+
46
15
  defaults = {
47
16
  "models": [],
48
- "backend": "managed",
49
17
  "monitor": True,
50
18
  "persist_db": False,
51
- "queue_timeout": 5 * 24 * 3600, # Default 5 days in seconds
52
19
  }
53
20
 
54
- def step_init(
55
- self, flow, graph, step_name, decorators, environment, flow_datastore, logger
56
- ):
57
-
21
+ # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
22
+ # to understand where these functions are invoked in the lifecycle of a
23
+ # Metaflow flow.
24
+ def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
58
25
  if self.attributes["monitor"]:
59
26
  self.attach_card_decorator(
60
27
  flow,
61
- step_name,
28
+ step,
62
29
  NimMetricsRefresher.CARD_ID,
63
30
  "blank",
64
31
  refresh_interval=4.0,
@@ -68,11 +35,9 @@ class NimDecorator(StepDecorator, CardDecoratorInjector):
68
35
  {
69
36
  "nim": NimManager(
70
37
  models=self.attributes["models"],
71
- backend=self.attributes["backend"],
72
38
  flow=flow,
73
- step_name=step_name,
39
+ step_name=step,
74
40
  monitor=self.attributes["monitor"],
75
- queue_timeout=self.attributes["queue_timeout"],
76
41
  )
77
42
  }
78
43
  )
@@ -81,15 +46,14 @@ class NimDecorator(StepDecorator, CardDecoratorInjector):
81
46
  self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
82
47
  ):
83
48
  if self.attributes["monitor"]:
84
-
85
49
  import sqlite3
86
- from metaflow import current
87
50
 
88
51
  file_path = get_storage_path(current.task_id)
89
52
  if os.path.exists(file_path):
90
53
  os.remove(file_path)
91
54
  os.makedirs(NIM_MONITOR_LOCAL_STORAGE_ROOT, exist_ok=True)
92
55
  conn = sqlite3.connect(file_path)
56
+
93
57
  cursor = conn.cursor()
94
58
  cursor.execute(
95
59
  """