ob-metaflow-extensions 1.1.160rc0__tar.gz → 1.1.162rc0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (86) hide show
  1. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/PKG-INFO +1 -1
  2. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
  3. ob-metaflow-extensions-1.1.162rc0/metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +298 -0
  4. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
  5. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/ob_metaflow_extensions.egg-info/SOURCES.txt +1 -0
  6. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/setup.py +1 -1
  7. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/README.md +0 -0
  8. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/__init__.py +0 -0
  9. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
  10. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
  11. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  12. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/apps/app_utils.py +0 -0
  13. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/apps/consts.py +0 -0
  14. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +0 -0
  15. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +0 -0
  16. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
  17. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
  18. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +0 -0
  19. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +0 -0
  20. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +0 -0
  21. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +0 -0
  22. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +0 -0
  23. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +0 -0
  24. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  25. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +0 -0
  26. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +0 -0
  27. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
  28. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
  29. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
  30. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
  31. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nim/card.py +0 -0
  32. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +0 -0
  33. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
  34. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nim/utils.py +0 -0
  35. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  36. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvcf/constants.py +0 -0
  37. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +0 -0
  38. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
  39. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +0 -0
  40. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +0 -0
  41. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
  42. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvcf/utils.py +0 -0
  43. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  44. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +0 -0
  45. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct.py +0 -0
  46. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +0 -0
  47. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +0 -0
  48. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +0 -0
  49. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/nvct/utils.py +0 -0
  50. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/ollama/__init__.py +0 -0
  51. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/ollama/constants.py +0 -0
  52. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +0 -0
  53. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/ollama/ollama.py +0 -0
  54. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
  55. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +0 -0
  56. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +0 -0
  57. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  58. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/secrets/secrets.py +0 -0
  59. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +0 -0
  60. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +0 -0
  61. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  62. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
  63. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
  64. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
  65. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
  66. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
  67. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
  68. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
  69. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
  70. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +0 -0
  71. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
  72. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
  73. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/remote_config.py +0 -0
  74. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
  75. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
  76. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/ob_internal.py +0 -0
  77. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
  78. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
  79. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
  80. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +0 -0
  81. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +0 -0
  82. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +0 -0
  83. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
  84. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
  85. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
  86. {ob-metaflow-extensions-1.1.160rc0 → ob-metaflow-extensions-1.1.162rc0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.160rc0
3
+ Version: 1.1.162rc0
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -5,6 +5,7 @@ import time
5
5
 
6
6
  from metaflow.exception import MetaflowException
7
7
  from metaflow.metaflow_config import KUBERNETES_NAMESPACE
8
+ from .pod_killer import PodKiller
8
9
 
9
10
 
10
11
  CLIENT_REFRESH_INTERVAL_SECONDS = 300
@@ -105,50 +106,23 @@ class KubernetesClient(object):
105
106
  return list(results)
106
107
 
107
108
  def kill_pods(self, flow_name, run_id, user, echo):
108
- from kubernetes.stream import stream
109
-
110
- api_instance = self._client.CoreV1Api()
111
- job_api = self._client.BatchV1Api()
112
- pods = self._find_active_pods(flow_name, run_id, user)
113
-
114
- def _kill_pod(pod):
115
- echo("Killing Kubernetes pod %s\n" % pod.metadata.name)
116
- try:
117
- stream(
118
- api_instance.connect_get_namespaced_pod_exec,
119
- name=pod.metadata.name,
120
- namespace=pod.metadata.namespace,
121
- command=[
122
- "/bin/sh",
123
- "-c",
124
- "/sbin/killall5",
125
- ],
126
- stderr=True,
127
- stdin=False,
128
- stdout=True,
129
- tty=False,
130
- )
131
- except Exception:
132
- # best effort kill for pod can fail.
133
- try:
134
- job_name = pod.metadata.labels.get("job-name", None)
135
- if job_name is None:
136
- raise Exception("Could not determine job name")
137
-
138
- job_api.patch_namespaced_job(
139
- name=job_name,
140
- namespace=pod.metadata.namespace,
141
- field_manager="metaflow",
142
- body={"spec": {"parallelism": 0}},
143
- )
144
- except Exception as e:
145
- echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
146
-
147
- with ThreadPoolExecutor() as executor:
148
- operated_pods = list(executor.map(_kill_pod, pods))
149
-
150
- if not operated_pods:
151
- echo("No active Kubernetes pods found for run *%s*" % run_id)
109
+ # Create PodKiller instance
110
+ killer = PodKiller(self._client, echo, self._namespace)
111
+
112
+ # Process all matching jobs and jobsets based on their outcomes
113
+ (
114
+ job_jobset_results,
115
+ num_jobs,
116
+ num_jobsets,
117
+ ) = killer.process_matching_jobs_and_jobsets(flow_name, run_id, user)
118
+
119
+ if job_jobset_results:
120
+ successful_operations = sum(1 for result in job_jobset_results if result)
121
+ echo(
122
+ f"Found and processed {num_jobs} jobs and {num_jobsets} jobsets, {successful_operations} operations successful\n"
123
+ )
124
+ else:
125
+ echo("No matching jobs or jobsets found for run *%s*" % run_id)
152
126
 
153
127
  def job(self, **kwargs):
154
128
  from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
@@ -0,0 +1,298 @@
1
+ from typing import TYPE_CHECKING
2
+
3
+ if TYPE_CHECKING:
4
+ from kubernetes.client.models.v1_job import V1Job
5
+ from kubernetes.client.models.v1_job_status import V1JobStatus
6
+
7
+
8
+ def _is_jobset_child(job: "V1Job"):
9
+ if job.metadata.owner_references:
10
+ for owner_ref in job.metadata.owner_references:
11
+ if owner_ref.kind == "JobSet":
12
+ return owner_ref
13
+ return None
14
+
15
+
16
+ class JobOutcomes:
17
+ KILL = "kill"
18
+ DELETE = "delete"
19
+ LEAVE_UNCHANGED = "leave_unchanged"
20
+
21
+
22
+ def derive_jobset_outcome(jobset_status):
23
+ return (
24
+ JobOutcomes.LEAVE_UNCHANGED
25
+ if jobset_status.get("terminalState", None)
26
+ else JobOutcomes.DELETE
27
+ )
28
+
29
+
30
+ def derive_job_outcome(job_status: "V1JobStatus"):
31
+ if job_status.start_time is None:
32
+ # If the job has not even started yet, just delete it.
33
+ return JobOutcomes.DELETE
34
+ if job_status.succeeded or job_status.failed:
35
+ return JobOutcomes.LEAVE_UNCHANGED
36
+
37
+ if job_status.completion_time is not None:
38
+ return JobOutcomes.LEAVE_UNCHANGED
39
+
40
+ # At this point the job has neither succeeded nor failed, and has no completion time.
41
+ if job_status.active:
42
+ return JobOutcomes.KILL
43
+
44
+ # The job had started but is not active, and has recorded neither success nor failure.
45
+ # This is an unexpected state; it is safest to just delete the job.
46
+ return JobOutcomes.DELETE
47
+
48
+
49
+ class PodKiller:
50
+ def __init__(self, kubernetes_client, echo_func, namespace):
51
+ self.client = kubernetes_client
52
+ self.echo = echo_func
53
+ self.api_instance = self.client.CoreV1Api()
54
+ self.job_api = self.client.BatchV1Api()
55
+ self._namespace = namespace
56
+ self.jobset_api = None
57
+ self.jobset_api = self.client.CustomObjectsApi()
58
+
59
+ def _delete_jobset(self, owner_ref, namespace):
60
+ """Delete a JobSet if it's the owner of a job."""
61
+ if not self.jobset_api:
62
+ self.echo("JobSet API not available, cannot delete JobSet\n")
63
+ return False
64
+
65
+ try:
66
+ jobset_name = owner_ref.name
67
+ self.echo(f"Deleting JobSet: {jobset_name}\n")
68
+
69
+ self.jobset_api.delete_namespaced_custom_object(
70
+ group="jobset.x-k8s.io",
71
+ version="v1alpha2",
72
+ namespace=namespace,
73
+ plural="jobsets",
74
+ name=jobset_name,
75
+ )
76
+ return True
77
+ except Exception as e:
78
+ self.echo(f"Failed to delete JobSet {owner_ref.name}: {str(e)}\n")
79
+ return False
80
+
81
+ def _delete_job(self, job_name, namespace):
82
+ """Delete a Batch Job and check for JobSet owner reference."""
83
+ try:
84
+ # First get the job to check for owner references
85
+ job = self.job_api.read_namespaced_job(name=job_name, namespace=namespace)
86
+ # Check for JobSet owner reference
87
+ jobset_ref = _is_jobset_child(job)
88
+ if jobset_ref:
89
+ if self._delete_jobset(jobset_ref, namespace):
90
+ return True
91
+
92
+ # If no JobSet owner or JobSet deletion failed, delete the job
93
+ self.echo(f"Deleting Batch Job: {job_name}")
94
+ self.job_api.delete_namespaced_job(
95
+ name=job_name, namespace=namespace, propagation_policy="Background"
96
+ )
97
+ return True
98
+
99
+ except Exception as e:
100
+ self.echo(f"Failed to delete job {job_name}: {str(e)}")
101
+ return False
102
+
103
+ def _kill_pod_process(self, pod):
104
+ """Attempt to kill processes inside a pod."""
105
+ from kubernetes.stream import stream
106
+
107
+ try:
108
+ stream(
109
+ self.api_instance.connect_get_namespaced_pod_exec,
110
+ name=pod.metadata.name,
111
+ namespace=pod.metadata.namespace,
112
+ command=["/bin/sh", "-c", "/sbin/killall5"],
113
+ stderr=True,
114
+ stdin=False,
115
+ stdout=True,
116
+ tty=False,
117
+ )
118
+ return True
119
+ except Exception as e:
120
+ self.echo(
121
+ f"Failed to kill processes in pod {pod.metadata.name}: {str(e)}\n"
122
+ )
123
+ return False
124
+
125
+ @staticmethod
126
+ def _metaflow_matching_spec(run_id, user, flow_name, annotations, labels):
127
+ # Handle argo prefixes in run_id like in _find_active_pods
128
+ _argo_run_id = None
129
+ if run_id is not None:
130
+ _argo_run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
131
+ return (
132
+ annotations
133
+ and (
134
+ run_id is None
135
+ or (annotations.get("metaflow/run_id") == run_id)
136
+ # we want to also match jobsets launched by argo-workflows
137
+ # This check has little practical value, since we already avoid any
138
+ # argo-workflows related terminations.
139
+ or (
140
+ labels.get("workflows.argoproj.io/workflow") is not None
141
+ and labels.get("workflows.argoproj.io/workflow") == _argo_run_id
142
+ )
143
+ )
144
+ and (user is None or annotations.get("metaflow/user") == user)
145
+ and (annotations.get("metaflow/flow_name") == flow_name)
146
+ )
147
+
148
+ def _find_matching_jobs(self, flow_name, run_id=None, user=None):
149
+ """Find jobs that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""
150
+ try:
151
+ jobs = self.job_api.list_namespaced_job(namespace=self._namespace)
152
+ matching_jobs = []
153
+ for _job in jobs.items:
154
+ job = _job.to_dict()
155
+ _match = self._metaflow_matching_spec(
156
+ run_id=run_id,
157
+ user=user,
158
+ flow_name=flow_name,
159
+ annotations=job.get("metadata", {}).get("annotations", {}),
160
+ labels=job.get("metadata", {}).get("labels", {}),
161
+ )
162
+ if _match:
163
+ matching_jobs.append(_job)
164
+ return matching_jobs
165
+ except Exception as e:
166
+ self.echo(f"Error finding jobs: {str(e)}\n")
167
+ return []
168
+
169
+ def _find_matching_jobsets(self, flow_name, run_id=None, user=None):
170
+ """Find jobsets that match the flow_name, run_id, and user criteria using similar logic to _find_active_pods"""
171
+ if not self.jobset_api:
172
+ return []
173
+
174
+ try:
175
+ jobsets = self.jobset_api.list_namespaced_custom_object(
176
+ group="jobset.x-k8s.io",
177
+ version="v1alpha2",
178
+ namespace=self._namespace,
179
+ plural="jobsets",
180
+ )
181
+ matching_jobsets = []
182
+
183
+ for jobset in jobsets.get("items", []):
184
+ _match = self._metaflow_matching_spec(
185
+ run_id=run_id,
186
+ user=user,
187
+ flow_name=flow_name,
188
+ annotations=jobset.get("metadata", {}).get("annotations", {}),
189
+ labels=jobset.get("metadata", {}).get("labels", {}),
190
+ )
191
+ if _match:
192
+ matching_jobsets.append(jobset)
193
+
194
+ return matching_jobsets
195
+ except Exception as e:
196
+ self.echo(f"Error finding jobsets: {str(e)}\n")
197
+ return []
198
+
199
+ def _kill_pods_for_job(self, job):
200
+ """Find and kill pods associated with a specific job"""
201
+ job_name = job.metadata.name
202
+ namespace = job.metadata.namespace
203
+
204
+ try:
205
+ # Find pods with the job-name label matching this job
206
+ pods = self.api_instance.list_namespaced_pod(
207
+ namespace=namespace, label_selector=f"job-name={job_name}"
208
+ )
209
+
210
+ killed_pods = 0
211
+ for pod in pods.items:
212
+ if pod.status.phase in ["Running"]:
213
+ self.echo(
214
+ f"Killing processes in pod {pod.metadata.name} for job {job_name}"
215
+ )
216
+ if self._kill_pod_process(pod):
217
+ killed_pods += 1
218
+
219
+ return killed_pods > 0
220
+ except Exception as e:
221
+ self.echo(f"Failed to find/kill pods for job {job_name}: {str(e)}")
222
+ return False
223
+
224
+ def _handle_job_outcome(self, job, outcome):
225
+ """Handle a job based on the derived outcome"""
226
+ job_name = job.metadata.name
227
+ namespace = job.metadata.namespace
228
+
229
+ if outcome == JobOutcomes.LEAVE_UNCHANGED:
230
+ # self.echo(f"Job {job_name} is in terminal state, leaving unchanged")
231
+ return None
232
+ elif outcome == JobOutcomes.DELETE:
233
+ self.echo(f"Deleting Job {job_name}")
234
+ return self._delete_job(job_name, namespace)
235
+ elif outcome == JobOutcomes.KILL:
236
+ self.echo(f"Killing Job {job_name}")
237
+ # First try to kill the pod processes
238
+ pods_killed = self._kill_pods_for_job(job)
239
+ if pods_killed > 0:
240
+ return True
241
+ # Worst case: if we could not kill the processes in any pod, delete the Job.
242
+ return self._delete_job(job_name, namespace)
243
+ else:
244
+ self.echo(f"Unknown outcome {outcome} for job {job_name}\n")
245
+ return False
246
+
247
+ def _handle_jobset_outcome(self, jobset, outcome):
248
+ """Handle a jobset based on the derived outcome"""
249
+ jobset_name = jobset.get("metadata", {}).get("name", "unknown")
250
+ namespace = jobset.get("metadata", {}).get("namespace", self._namespace)
251
+
252
+ if outcome == JobOutcomes.LEAVE_UNCHANGED:
253
+ # self.echo(f"JobSet {jobset_name} is in terminal state, leaving unchanged")
254
+ return None
255
+ elif outcome == JobOutcomes.DELETE:
256
+ self.echo(f"Deleting JobSet {jobset_name}")
257
+ try:
258
+ self.jobset_api.delete_namespaced_custom_object(
259
+ group="jobset.x-k8s.io",
260
+ version="v1alpha2",
261
+ namespace=namespace,
262
+ plural="jobsets",
263
+ name=jobset_name,
264
+ )
265
+ return True
266
+ except Exception as e:
267
+ self.echo(f"Failed to delete JobSet {jobset_name}: {str(e)}")
268
+ return False
269
+ else:
270
+ self.echo(f"Unknown outcome {outcome} for JobSet {jobset_name}")
271
+ return False
272
+
273
+ def process_matching_jobs_and_jobsets(self, flow_name, run_id, user):
274
+ """Process all matching jobs and jobsets based on their derived outcomes"""
275
+ results = []
276
+
277
+ # Process matching jobs
278
+ _jobs, _jobsets = [], []
279
+ jobs = self._find_matching_jobs(flow_name, run_id, user)
280
+ for job in jobs:
281
+ outcome = derive_job_outcome(job.status)
282
+ result = self._handle_job_outcome(job, outcome)
283
+ # results.append(result)
284
+ if result is not None:
285
+ results.append(result)
286
+ _jobs.append(result)
287
+
288
+ # Process matching jobsets
289
+ jobsets = self._find_matching_jobsets(flow_name, run_id, user)
290
+ for jobset in jobsets:
291
+ jobset_status = jobset.get("status", {})
292
+ outcome = derive_jobset_outcome(jobset_status)
293
+ result = self._handle_jobset_outcome(jobset, outcome)
294
+ if result is not None:
295
+ results.append(result)
296
+ _jobsets.append(result)
297
+
298
+ return results, len(_jobs), len(_jobsets)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.160rc0
3
+ Version: 1.1.162rc0
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -26,6 +26,7 @@ metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py
26
26
  metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py
27
27
  metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py
28
28
  metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py
29
+ metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py
29
30
  metaflow_extensions/outerbounds/plugins/nim/card.py
30
31
  metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py
31
32
  metaflow_extensions/outerbounds/plugins/nim/nim_manager.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
2
2
  from pathlib import Path
3
3
 
4
4
 
5
- version = "1.1.160rc0"
5
+ version = "1.1.162rc0"
6
6
  this_directory = Path(__file__).parent
7
7
  long_description = (this_directory / "README.md").read_text()
8
8