ob-metaflow-extensions 1.1.158__py2.py3-none-any.whl → 1.1.160__py2.py3-none-any.whl

This diff shows the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.

Potentially problematic release.

This version of ob-metaflow-extensions might be problematic.

@@ -347,4 +347,4 @@ SECRETS_PROVIDERS_DESC = [
     ("outerbounds", ".secrets.secrets.OuterboundsSecretsProvider"),
 ]
 # Adding an override here so the library can be imported at the metaflow.plugins level
-__mf_promote_submodules__ = ["snowflake", "ollama"]
+__mf_promote_submodules__ = ["snowflake", "ollama", "torchtune"]
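
The practical effect of this hunk is that the new torchtune plugin is meant to be importable at the metaflow.plugins level, per the comment above. A minimal sketch of what that import would look like, assuming the standard Metaflow extension promotion mechanism; the exact module path is not shown in this diff:

    # Hypothetical import path once the extension is installed; the location is an
    # assumption based on the promoted submodule name.
    from metaflow.plugins.torchtune import TorchTune
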
@@ -5,6 +5,7 @@ import time

 from metaflow.exception import MetaflowException
 from metaflow.metaflow_config import KUBERNETES_NAMESPACE
+from .pod_killer import PodKiller


 CLIENT_REFRESH_INTERVAL_SECONDS = 300
@@ -105,50 +106,23 @@ class KubernetesClient(object):
         return list(results)

     def kill_pods(self, flow_name, run_id, user, echo):
-        from kubernetes.stream import stream
-
-        api_instance = self._client.CoreV1Api()
-        job_api = self._client.BatchV1Api()
-        pods = self._find_active_pods(flow_name, run_id, user)
-
-        def _kill_pod(pod):
-            echo("Killing Kubernetes pod %s\n" % pod.metadata.name)
-            try:
-                stream(
-                    api_instance.connect_get_namespaced_pod_exec,
-                    name=pod.metadata.name,
-                    namespace=pod.metadata.namespace,
-                    command=[
-                        "/bin/sh",
-                        "-c",
-                        "/sbin/killall5",
-                    ],
-                    stderr=True,
-                    stdin=False,
-                    stdout=True,
-                    tty=False,
-                )
-            except Exception:
-                # best effort kill for pod can fail.
-                try:
-                    job_name = pod.metadata.labels.get("job-name", None)
-                    if job_name is None:
-                        raise Exception("Could not determine job name")
-
-                    job_api.patch_namespaced_job(
-                        name=job_name,
-                        namespace=pod.metadata.namespace,
-                        field_manager="metaflow",
-                        body={"spec": {"parallelism": 0}},
-                    )
-                except Exception as e:
-                    echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
-
-        with ThreadPoolExecutor() as executor:
-            operated_pods = list(executor.map(_kill_pod, pods))
-
-        if not operated_pods:
-            echo("No active Kubernetes pods found for run *%s*" % run_id)
+        # Create PodKiller instance
+        killer = PodKiller(self._client, echo, self._namespace)
+
+        # Process all matching jobs and jobsets based on their outcomes
+        (
+            job_jobset_results,
+            num_jobs,
+            num_jobsets,
+        ) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
+
+        if job_jobset_results:
+            successful_operations = sum(1 for result in job_jobset_results if result)
+            echo(
+                f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
+            )
+        else:
+            echo("No matching jobs or jobsets found for run *%s*" % run_id)

     def job(self, **kwargs):
         from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
@@ -0,0 +1,296 @@
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from kubernetes.client.models.v1_job import V1Job
+    from kubernetes.client.models.v1_job_status import V1JobStatus
+
+
+def _is_jobset_child(job: "V1Job"):
+    if job.metadata.owner_references:
+        for owner_ref in job.metadata.owner_references:
+            if owner_ref.kind == "JobSet":
+                return owner_ref
+    return None
+
+
+class JobOutcomes:
+    KILL = "kill"
+    DELETE = "delete"
+    LEAVE_UNCHANGED = "leave_unchanged"
+
+
+def derive_jobset_outcome(jobset_status):
+    return (
+        JobOutcomes.LEAVE_UNCHANGED
+        if jobset_status.get("terminalState", None)
+        else JobOutcomes.DELETE
+    )
+
+
+def derive_job_outcome(job_status: "V1JobStatus"):
+    if job_status.start_time is None:
+        # If the job has not started even then just wipe it!
+        return JobOutcomes.DELETE
+    if job_status.succeeded or job_status.failed:
+        return JobOutcomes.LEAVE_UNCHANGED
+
+    if job_status.completion_time is not None:
+        return JobOutcomes.LEAVE_UNCHANGED
+
+    # This means that the job has neither finished or succedded.
+    if job_status.active:
+        return JobOutcomes.KILL
+
+    # This means that the job is not active. Had started. There is not succedded/fail.
+    # This is a weird state. Better to just kill the job
+    return JobOutcomes.DELETE
+
+
+class PodKiller:
+    def __init__(self, kubernetes_client, echo_func, namespace):
+        self.client = kubernetes_client
+        self.echo = echo_func
+        self.api_instance = self.client.CoreV1Api()
+        self.job_api = self.client.BatchV1Api()
+        self._namespace = namespace
+        self.jobset_api = None
+        self.jobset_api = self.client.CustomObjectsApi()
+
+    def _delete_jobset(self, owner_ref, namespace):
+        """Delete a JobSet if it's the owner of a job."""
+        if not self.jobset_api:
+            self.echo("JobSet API not available, cannot delete JobSet\n")
+            return False
+
+        try:
+            jobset_name = owner_ref.name
+            self.echo(f"Deleting JobSet: {jobset_name}\n")
+
+            self.jobset_api.delete_namespaced_custom_object(
+                group="jobset.x-k8s.io",
+                version="v1alpha2",
+                namespace=namespace,
+                plural="jobsets",
+                name=jobset_name,
+            )
+            return True
+        except Exception as e:
+            self.echo(f"Failed to delete JobSet {owner_ref.name}: {str(e)}\n")
+            return False
+
+    def _delete_job(self, job_name, namespace):
+        """Delete a Batch Job and check for JobSet owner reference."""
+        try:
+            # First get the job to check for owner references
+            job = self.job_api.read_namespaced_job(name=job_name, namespace=namespace)
+            # Check for JobSet owner reference
+            jobset_ref = _is_jobset_child(job)
+            if jobset_ref:
+                if self._delete_jobset(jobset_ref, namespace):
+                    return True
+
+            # If no JobSet owner or JobSet deletion failed, delete the job
+            self.echo(f"Deleting Batch Job: {job_name}")
+            self.job_api.delete_namespaced_job(
+                name=job_name, namespace=namespace, propagation_policy="Background"
+            )
+            return True
+
+        except Exception as e:
+            self.echo(f"Failed to delete job {job_name}: {str(e)}")
+            return False
+
+    def _kill_pod_process(self, pod):
+        """Attempt to kill processes inside a pod."""
+        from kubernetes.stream import stream
+
+        try:
+            stream(
+                self.api_instance.connect_get_namespaced_pod_exec,
+                name=pod.metadata.name,
+                namespace=pod.metadata.namespace,
+                command=["/bin/sh", "-c", "/sbin/killall5"],
+                stderr=True,
+                stdin=False,
+                stdout=True,
+                tty=False,
+            )
+            return True
+        except Exception as e:
+            self.echo(
+                f"Failed to kill processes in pod {pod.metadata.name}: {str(e)}\n"
+            )
+            return False
+
+    @staticmethod
+    def _metaflow_matching_spec(run_id, user, flow_name, annotations, labels):
+        # Handle argo prefixes in run_id like in _find_active_pods
+        _argo_run_id = None
+        if run_id is not None:
+            _argo_run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
+        return (
+            annotations
+            and (
+                run_id is None
+                or (annotations.get("metaflow/run_id") == run_id)
+                # we want to also match jobsets launched by argo-workflows
+                # This line has no real value since the We already avoid any
+                # argo-workflows related terminations.
+                or (
+                    labels.get("workflows.argoproj.io/workflow") is not None
+                    and labels.get("workflows.argoproj.io/workflow") == _argo_run_id
+                )
+            )
+            and (user is None or annotations.get("metaflow/user") == user)
+            and (annotations.get("metaflow/flow_name") == flow_name)
+        )
+
+    def _find_matching_jobs(self, flow_name, run_id=None, user=None):
+        """Find jobs that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""
+        try:
+            jobs = self.job_api.list_namespaced_job(namespace=self._namespace)
+            matching_jobs = []
+            for _job in jobs.items:
+                job = _job.to_dict()
+                _match = self._metaflow_matching_spec(
+                    run_id=run_id,
+                    user=user,
+                    flow_name=flow_name,
+                    annotations=job.get("metadata", {}).get("annotations", {}),
+                    labels=job.get("metadata", {}).get("labels", {}),
+                )
+                if _match:
+                    matching_jobs.append(_job)
+            return matching_jobs
+        except Exception as e:
+            self.echo(f"Error finding jobs: {str(e)}\n")
+            return []
+
+    def _find_matching_jobsets(self, flow_name, run_id=None, user=None):
+        """Find jobsets that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""
+        if not self.jobset_api:
+            return []
+
+        try:
+            jobsets = self.jobset_api.list_namespaced_custom_object(
+                group="jobset.x-k8s.io",
+                version="v1alpha2",
+                namespace=self._namespace,
+                plural="jobsets",
+            )
+            matching_jobsets = []
+
+            for jobset in jobsets.get("items", []):
+                _match = self._metaflow_matching_spec(
+                    run_id=run_id,
+                    user=user,
+                    flow_name=flow_name,
+                    annotations=jobset.get("metadata", {}).get("annotations", {}),
+                    labels=jobset.get("metadata", {}).get("labels", {}),
+                )
+                if _match:
+                    matching_jobsets.append(jobset)
+
+            return matching_jobsets
+        except Exception as e:
+            self.echo(f"Error finding jobsets: {str(e)}\n")
+            return []
+
+    def _kill_pods_for_job(self, job):
+        """Find and kill pods associated with a specific job"""
+        job_name = job.metadata.name
+        namespace = job.metadata.namespace
+
+        try:
+            # Find pods with the job-name label matching this job
+            pods = self.api_instance.list_namespaced_pod(
+                namespace=namespace, label_selector=f"job-name={job_name}"
+            )
+
+            killed_pods = 0
+            for pod in pods.items:
+                if pod.status.phase in ["Running"]:
+                    self.echo(
+                        f"Killing processes in pod {pod.metadata.name} for job {job_name}"
+                    )
+                    if self._kill_pod_process(pod):
+                        killed_pods += 1
+
+            return killed_pods > 0
+        except Exception as e:
+            self.echo(f"Failed to find/kill pods for job {job_name}: {str(e)}")
+            return False
+
+    def _handle_job_outcome(self, job, outcome):
+        """Handle a job based on the derived outcome"""
+        job_name = job.metadata.name
+        namespace = job.metadata.namespace
+
+        if outcome == JobOutcomes.LEAVE_UNCHANGED:
+            # self.echo(f"Job {job_name} is in terminal state, leaving unchanged")
+            return True
+        elif outcome == JobOutcomes.DELETE:
+            self.echo(f"Deleting Job {job_name}")
+            return self._delete_job(job_name, namespace)
+        elif outcome == JobOutcomes.KILL:
+            self.echo(f"Killing Job {job_name}")
+            # First try to kill the pod processes
+            pods_killed = self._kill_pods_for_job(job)
+            if pods_killed > 0:
+                return True
+            # Worst case if we are not able to delete any pod, then delete the Job.
+            return self._delete_job(job_name, namespace)
+        else:
+            self.echo(f"Unknown outcome {outcome} for job {job_name}\n")
+            return False
+
+    def _handle_jobset_outcome(self, jobset, outcome):
+        """Handle a jobset based on the derived outcome"""
+        jobset_name = jobset.get("metadata", {}).get("name", "unknown")
+        namespace = jobset.get("metadata", {}).get("namespace", self._namespace)
+
+        if outcome == JobOutcomes.LEAVE_UNCHANGED:
+            # self.echo(f"JobSet {jobset_name} is in terminal state, leaving unchanged")
+            return True
+        elif outcome == JobOutcomes.DELETE:
+            self.echo(f"Deleting JobSet {jobset_name}")
+            try:
+                self.jobset_api.delete_namespaced_custom_object(
+                    group="jobset.x-k8s.io",
+                    version="v1alpha2",
+                    namespace=namespace,
+                    plural="jobsets",
+                    name=jobset_name,
+                )
+                return True
+            except Exception as e:
+                self.echo(f"Failed to delete JobSet {jobset_name}: {str(e)}")
+                return False
+        else:
+            self.echo(f"Unknown outcome {outcome} for JobSet {jobset_name}")
+            return False
+
+    def process_matching_jobs_and_jobsets(self, flow_name, run_id, user):
+        """Process all matching jobs and jobsets based on their derived outcomes"""
+        results = []
+
+        # Process matching jobs
+        jobs = self._find_matching_jobs(flow_name, run_id, user)
+        for job in jobs:
+            outcome = derive_job_outcome(job.status)
+            result = self._handle_job_outcome(job, outcome)
+            if not result:
+                self.echo("Failed Result for : %s" % job.metadata.name)
+            results.append(result)
+
+        # Process matching jobsets
+        jobsets = self._find_matching_jobsets(flow_name, run_id, user)
+        for jobset in jobsets:
+            jobset_status = jobset.get("status", {})
+            outcome = derive_jobset_outcome(jobset_status)
+            result = self._handle_jobset_outcome(jobset, outcome)
+            if not result:
+                self.echo("Failed Result for : %s" % jobset.get("metadata").get("name"))
+            results.append(result)
+
+        return results, len(jobs), len(jobsets)
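
For orientation, a minimal sketch of how PodKiller is driven, mirroring the kill_pods change earlier in this diff. The client handle, namespace, and identifiers below are illustrative assumptions; in the package, KubernetesClient passes its own self._client and self._namespace:

    # Hedged illustration only: assumes a loaded kubeconfig and that the client
    # argument exposes CoreV1Api/BatchV1Api/CustomObjectsApi, as kubernetes.client does.
    import kubernetes.client
    import kubernetes.config

    kubernetes.config.load_kube_config()
    killer = PodKiller(kubernetes.client, print, "default")
    results, num_jobs, num_jobsets = killer.process_matching_jobs_and_jobsets(
        flow_name="TrainingFlow", run_id="1234", user="alice"   # hypothetical identifiers
    )
    print(f"{sum(results)}/{len(results)} operations succeeded "
          f"across {num_jobs} jobs and {num_jobsets} jobsets")
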
@@ -1,3 +1,3 @@
-SUPPORTABLE_GPU_TYPES = ["L40", "L40S", "L40G", "H100"]
+SUPPORTABLE_GPU_TYPES = ["L40", "L40S", "L40G", "H100", "NEBIUS_H100"]
 DEFAULT_GPU_TYPE = "H100"
-MAX_N_GPU_BY_TYPE = {"L40": 1, "L40S": 1, "L40G": 1, "H100": 4}
+MAX_N_GPU_BY_TYPE = {"L40": 1, "L40S": 1, "L40G": 1, "H100": 4, "NEBIUS_H100": 8}
@@ -65,6 +65,28 @@ SUPPORTABLE_GPU_TYPES = {
             "backend": "gcp-asia-se-1a",
         },
     ],
+    "NEBIUS_H100": [
+        {
+            "n_gpus": 1,
+            "instance_type": "ON-PREM.GPU.H100_1x",
+            "backend": "default-project-eu-north1",
+        },
+        {
+            "n_gpus": 2,
+            "instance_type": "ON-PREM.GPU.H100_2x",
+            "backend": "default-project-eu-north1",
+        },
+        {
+            "n_gpus": 4,
+            "instance_type": "ON-PREM.GPU.H100_4x",
+            "backend": "default-project-eu-north1",
+        },
+        {
+            "n_gpus": 8,
+            "instance_type": "ON-PREM.GPU.H100_8x",
+            "backend": "default-project-eu-north1",
+        },
+    ],
 }


@@ -154,6 +176,8 @@ class NvctDecorator(StepDecorator):

         self.attributes["instance_type"] = valid_config["instance_type"]
         self.attributes["gpu_type"] = requested_gpu_type
+        if self.attributes["gpu_type"] == "NEBIUS_H100":
+            self.attributes["gpu_type"] = "H100"
         self.attributes["backend"] = valid_config["backend"]

     def runtime_init(self, flow, graph, package, run_id):
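
Taken together with the constants changes above, requesting the new GPU type resolves to an on-prem Nebius instance while reporting a plain H100 gpu_type downstream. A hedged restatement of the effective mapping; the values are copied from the tables in this diff, and the request shape is illustrative only:

    # Illustration only: a step requesting 8 NEBIUS_H100 GPUs would resolve roughly to
    requested = {"gpu_type": "NEBIUS_H100", "n_gpus": 8}             # hypothetical request
    resolved = {
        "instance_type": "ON-PREM.GPU.H100_8x",                      # from SUPPORTABLE_GPU_TYPES
        "backend": "default-project-eu-north1",
        "gpu_type": "H100",                                          # rewritten by the new branch above
    }
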
@@ -0,0 +1,159 @@
+from queue import Queue, Empty
+from concurrent.futures import ThreadPoolExecutor
+from typing import Optional, List, Dict
+import subprocess
+import shutil
+import sys
+from metaflow import current
+
+__mf_promote_submodules__ = ["plugins.torchtune"]
+
+
+class TorchTune:
+    def __init__(
+        self,
+        use_multi_node_config: bool = False,
+    ):
+        """
+        Initialize the Tune launcher.
+
+        :param use_multi_node_config: If True, attempt to build a distributed configuration
+            from current.torch.torchrun_args.
+        """
+        self.multi_node_config = {}
+        if use_multi_node_config:
+            if getattr(current, "torch", None):
+                print(
+                    "[Metaflow Tune] Since @torchrun is used, multi-node config can be used to launch the job."
+                )
+                # For distributed torchtune launches, we use similar parameters as torchrun.
+                # (You might need to adjust the keys according to your environment.)
+                self.multi_node_config = {
+                    "nnodes": current.torch.torchrun_args["nnodes"],
+                    "master_addr": current.torch.torchrun_args["master_addr"],
+                    "master_port": int(current.torch.torchrun_args["master_port"]),
+                    "node_rank": current.torch.torchrun_args["node_rank"],
+                    "nproc_per_node": current.torch.torchrun_args["nproc_per_node"],
+                    "num_processes": current.torch.torchrun_args["nproc_per_node"]
+                    * current.torch.torchrun_args["nnodes"],
+                }
+                print(
+                    f"[Metaflow Tune] Discovered multi-node config for torchrun: {self.multi_node_config}"
+                )
+            else:
+                print(
+                    "[Metaflow Tune] Since @torchrun is not used, default multi-node config cannot be used to launch the job."
+                )
+
+    def run(
+        self,
+        recipe: str,
+        config_dict: Dict,
+        additional_cli_options: Optional[List[str]] = None,
+    ):
+        """
+        Launch the torchtune job via its CLI.
+
+        :param recipe: The path to the recipe (or name of the recipe) to run.
+        :param config_dict: Optional dictionary that will be dumped to a YAML file and passed via --config.
+        :param additional_cli_options: Optional list of additional CLI options.
+        :raises: subprocess.CalledProcessError if the subprocess returns a nonzero exit code.
+        """
+        import yaml
+        import tempfile
+        import os
+
+        _temp_dir = tempfile.mkdtemp()
+        try:
+            config_path = os.path.join(_temp_dir, "config.yaml")
+            with open(config_path, "w") as f:
+                yaml.dump(config_dict, f)
+
+            additional_options = (
+                additional_cli_options if additional_cli_options else []
+            )
+
+            # Build the command. Here we use "tune run" as the base command.
+            cmd = ["tune", "run"]
+
+            # If distributed configuration is present, add torchrun–style flags.
+            if self.multi_node_config:
+                cmd.extend(
+                    [
+                        "--nnodes",
+                        str(self.multi_node_config.get("nnodes")),
+                        "--nproc-per-node",
+                        str(self.multi_node_config.get("nproc_per_node")),
+                        # "--rdzv_conf", f"rdzv_endpoint={self.multi_node_config.get('master_addr')}:{self.multi_node_config.get('master_port')}"
+                        "--rdzv-backend",
+                        "c10d",
+                        "--rdzv-endpoint",
+                        f"{self.multi_node_config.get('master_addr')}:{self.multi_node_config.get('master_port')}",
+                        "--rdzv-id",
+                        "1234567890",
+                        "--node-rank",
+                        str(self.multi_node_config.get("node_rank")),
+                        # TODO: should there be a masterip/port here ?
+                    ]
+                )
+
+            cmd.extend(additional_options)
+
+            cmd.append(recipe)
+            # If a recipe configuration was provided, pass it via the --config flag.
+            cmd.extend(["--config", config_path])
+
+            # Append any additional CLI options.
+
+            # Launch the subprocess.
+            print(f"[Metaflow tune] {' '.join(cmd)}")
+            process = subprocess.Popen(
+                cmd,
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+                universal_newlines=True,
+            )
+
+            # Stream the output in real-time.
+            for out_line, err_line in read_popen_pipes(process):
+                print(out_line, end="", flush=True)
+                print(err_line, end="", file=sys.stderr, flush=True)
+
+            process.wait()
+            if process.returncode != 0:
+                raise subprocess.CalledProcessError(process.returncode, cmd)
+        finally:
+            shutil.rmtree(_temp_dir)
+
+
+def enqueue_output(file, queue):
+    for line in iter(file.readline, ""):
+        queue.put(line)
+    file.close()
+
+
+def read_popen_pipes(p):
+
+    with ThreadPoolExecutor(2) as pool:
+        q_stdout, q_stderr = Queue(), Queue()
+
+        pool.submit(enqueue_output, p.stdout, q_stdout)
+        pool.submit(enqueue_output, p.stderr, q_stderr)
+
+        while True:
+
+            if p.poll() is not None and q_stdout.empty() and q_stderr.empty():
+                break
+
+            out_line = err_line = ""
+
+            try:
+                out_line = q_stdout.get_nowait()
+            except Empty:
+                pass
+            try:
+                err_line = q_stderr.get_nowait()
+            except Empty:
+                pass
+
+            yield (out_line, err_line)
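
To make the new plugin concrete, here is a hedged sketch of calling TorchTune from a Metaflow step. The import path, recipe name, and config keys are assumptions for illustration; a real torchtune config needs the model, tokenizer, and dataset sections its recipe expects:

    from metaflow import FlowSpec, step

    class FinetuneFlow(FlowSpec):
        @step
        def start(self):
            # Assumed import path via the promoted submodule; adjust if the
            # extension exposes it elsewhere.
            from metaflow.plugins.torchtune import TorchTune

            tune = TorchTune(use_multi_node_config=False)
            tune.run(
                recipe="full_finetune_single_device",         # example recipe name
                config_dict={"output_dir": "/tmp/tune-out"},   # dumped to YAML, passed via --config
            )
            self.next(self.end)

        @step
        def end(self):
            pass

    if __name__ == "__main__":
        FinetuneFlow()
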
@@ -0,0 +1 @@
+__mf_promote_submodules__ = ["plugins.torchtune"]
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ob-metaflow-extensions
-Version: 1.1.158
+Version: 1.1.160
 Summary: Outerbounds Platform Extensions for Metaflow
 Author: Outerbounds, Inc.
 License: Commercial
@@ -1,7 +1,7 @@
 metaflow_extensions/outerbounds/__init__.py,sha256=Gb8u06s9ClQsA_vzxmkCzuMnigPy7kKcDnLfb7eB-64,514
 metaflow_extensions/outerbounds/remote_config.py,sha256=pEFJuKDYs98eoB_-ryPjVi9b_c4gpHMdBHE14ltoxIU,4672
 metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
-metaflow_extensions/outerbounds/plugins/__init__.py,sha256=gytuNt3lNabirHLEYzrmHFMp-JWh8dA2AZPK11HmaNw,13242
+metaflow_extensions/outerbounds/plugins/__init__.py,sha256=GxYKjrMJCGVKoxhfdPAlVF9kYrEb3-xn9fgUTb_H9VY,13255
 metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=_Q9_2EL0Xy77bCRphkwT1aSu8gQXRDOH-Z-RxTUO8N4,2202
 metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
 metaflow_extensions/outerbounds/plugins/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -22,13 +22,14 @@ metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py,sha256=PE81ZB
 metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py,sha256=kqFyu2bJSnc9_9aYfBpz5xK6L6luWFZK_NMuh8f1eVk,1494
 metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py,sha256=MXSIp05-jvt8Q2uGaLKjtuM_ToLeRLxhtMbfHc9Kcko,1515
 metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
-metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py,sha256=fx_XUkgR4r6hF2ilDfT5LubRyVrYMVIv5f6clHkCaEk,5988
+metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py,sha256=sjBhQ4aa-i1UkKsJyTswdDLYOBAFIvHRco4r7wfs9Tc,5003
+metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py,sha256=iZQSXNK14DVrJ6jRfrMfkjoUf4dNKDZA51k6LeVJL6M,11392
 metaflow_extensions/outerbounds/plugins/nim/card.py,sha256=dXOJvsZed5NyYyxYLPDvtwg9z_X4azL9HTJGYaiNriY,4690
 metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py,sha256=50YVvC7mcZYlPluM0Wq1UtufhzlQb-RxzZkTOJJ3LkM,3439
 metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=y8U71106KJtrC6nlhsNnzX9Xkv3RnyZ1KEpRFwqZZFk,13686
 metaflow_extensions/outerbounds/plugins/nim/utils.py,sha256=nU-v1sheBjmITXfHiJx2ucm_Tq_nGb5BcuAm5c235cQ,1164
 metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=aGHdNw_hqBu8i0zWXcatQM6e769wUXox0l8g0f6fNZ8,146
+metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=8Gcf5nKBVoE2v7_4hTW1EAf-hvXCzQoccaRBeKa_MNk,179
 metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256=-Pm9cOWUzpv94TvVUeq-FenAWdfLBJd5N7WPqIGZVqU,3671
 metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=pOWwm8LFQBbtku0zNBBwCyXxLK8U-hhC4naQcmU69nE,6217
 metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=3ZFdYItVpFWnHMOeyV1nslUyelfvX5rknh2d2IWxVws,15591
@@ -39,7 +40,7 @@ metaflow_extensions/outerbounds/plugins/nvct/__init__.py,sha256=47DEQpj8HBSa-_TI
 metaflow_extensions/outerbounds/plugins/nvct/exceptions.py,sha256=1PiV6FdH36CvkmHh5jtsfrsoe3Q_Fo1NomHw5wvgoDM,2886
 metaflow_extensions/outerbounds/plugins/nvct/nvct.py,sha256=Z2ZPWGuHe58au_d6GfHiw6Nl5d8INdLDI5exlsPEOSA,3564
 metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py,sha256=bB9AURhRep9PV_-b-qLHpgw_GPG_xFoq1PeHEgFP1mQ,10104
-metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py,sha256=LaJ_Tk-vNjvrglzSTR-U6pk8f9MtQRKObU9m7vBYtkI,8695
+metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py,sha256=HKCvYn1Jh8uwLXeUqPNhxgBatq3mXNG5YIUl-zjNlHE,9429
 metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py,sha256=8IPkdvuTZNIqgAAt75gVNn-ydr-Zz2sKC8UX_6pNEKI,7091
 metaflow_extensions/outerbounds/plugins/nvct/utils.py,sha256=U4_Fu8H94j_Bbox7mmMhNnlRhlYHqnK28R5w_TMWEFM,1029
 metaflow_extensions/outerbounds/plugins/ollama/__init__.py,sha256=vzh8sQEfwKRdx0fsGFJ-km4mwfi0vm2q1_vsZv-EMcc,3034
@@ -61,6 +62,7 @@ metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py,sha256=F
 metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py,sha256=aQphxX6jqYgfa83w387pEWl0keuLm38V53I8P8UL2ck,6887
 metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py,sha256=AI_kcm1hZV3JRxJkookcH6twiGnAYjk9Dx-MeoYz60Y,8511
 metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py,sha256=9lUM4Cqi5RjrHBRfG6AQMRz8-R96eZC8Ih0KD2lv22Y,1858
+metaflow_extensions/outerbounds/plugins/torchtune/__init__.py,sha256=TOXNeyhcgd8VxplXO_oEuryFEsbk0tikn5GL0-44SU8,5853
 metaflow_extensions/outerbounds/profilers/__init__.py,sha256=wa_jhnCBr82TBxoS0e8b6_6sLyZX0fdHicuGJZNTqKw,29
 metaflow_extensions/outerbounds/profilers/gpu.py,sha256=3Er8uKQzfm_082uadg4yn_D4Y-iSCgzUfFmguYxZsz4,27485
 metaflow_extensions/outerbounds/toplevel/__init__.py,sha256=qWUJSv_r5hXJ7jV_On4nEasKIfUCm6_UjkjXWA_A1Ts,90
@@ -70,7 +72,8 @@ metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3u
 metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
 metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2zwqkvlmFS6bcfYD_CX6CMko9DHQokMaH1iBshA,47
 metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
-ob_metaflow_extensions-1.1.158.dist-info/METADATA,sha256=0t_P8-Uhi3I39xyeSGv2BpRQO5Upe1eIjs04e6Stjd8,521
-ob_metaflow_extensions-1.1.158.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
-ob_metaflow_extensions-1.1.158.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
-ob_metaflow_extensions-1.1.158.dist-info/RECORD,,
+metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py,sha256=uTVkdSk3xZ7hEKYfdlyVteWj5KeDwaM1hU9WT-_YKfI,50
+ob_metaflow_extensions-1.1.160.dist-info/METADATA,sha256=-Y7aOv3_giiWpXTkncvXFIeHqXqfnWOA0hiHuLqueS8,521
+ob_metaflow_extensions-1.1.160.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
+ob_metaflow_extensions-1.1.160.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
+ob_metaflow_extensions-1.1.160.dist-info/RECORD,,