ob-metaflow-extensions 1.1.90__tar.gz → 1.1.92__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic.

Files changed (46)
  1. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/PKG-INFO +1 -1
  2. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +30 -29
  3. ob-metaflow-extensions-1.1.92/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +157 -0
  4. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +28 -23
  5. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +6 -2
  6. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +20 -34
  7. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +2 -2
  8. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
  9. ob-metaflow-extensions-1.1.92/ob_metaflow_extensions.egg-info/requires.txt +3 -0
  10. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/setup.py +2 -2
  11. ob-metaflow-extensions-1.1.90/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -61
  12. ob-metaflow-extensions-1.1.90/ob_metaflow_extensions.egg-info/requires.txt +0 -3
  13. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/README.md +0 -0
  14. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/__init__.py +0 -0
  15. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
  16. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
  17. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
  18. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  19. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
  20. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
  21. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
  22. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
  23. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
  24. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
  25. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  26. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
  27. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
  28. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  29. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
  30. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
  31. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
  32. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
  33. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
  34. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
  35. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
  36. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
  37. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/remote_config.py +0 -0
  38. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
  39. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
  40. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
  41. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
  42. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
  43. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -0
  44. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
  45. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
  46. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.92}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ob-metaflow-extensions
-Version: 1.1.90
+Version: 1.1.92
 Summary: Outerbounds Platform Extensions for Metaflow
 Author: Outerbounds, Inc.
 License: Commercial
@@ -116,15 +116,6 @@ class DockerEnvironment(MetaflowEnvironment):
 
         self.datastore = [d for d in DATASTORES if d.TYPE == self.datastore_type][0]
 
-        # Mixing @pypi/@conda in a single step is not supported yet
-        for step in self.flow:
-            if sum(1 for deco in step.decorators if _is_env_deco(deco)) > 1:
-                raise MetaflowException(
-                    "Mixing and matching PyPI packages and Conda packages within a\n"
-                    "step is not yet supported. Use one of @pypi or @conda only for the *%s* step."
-                    % step.name
-                )
-
     def init_environment(self, echo):
         self.skipped_steps = {
             step.name for step in self.flow if not _step_executes_remotely(step)
@@ -132,35 +123,45 @@ class DockerEnvironment(MetaflowEnvironment):
         # Attach environment decorator as needed. This is done on a step-by-step basis
         # as we require a conda decorator for fallback steps, but prefer pypi for the baked ones.
         for step in self.flow:
-            if not _step_has_environment_deco(step):
-                if step.name in self.skipped_steps:
-                    # Conda fallback requires a conda decorator as the default for a step
-                    decorators._attach_decorators_to_step(step, ["conda"])
-                else:
+            if step.name in self.skipped_steps:
+                # Conda fallback requires a conda decorator as the default for a step
+                decorators._attach_decorators_to_step(step, ["conda"])
+            else:
+                if not _step_has_environment_deco(step):
                     # We default to PyPI for steps that are going to be baked.
                     decorators._attach_decorators_to_step(step, ["pypi"])
-            # Initialize the decorator we attached.
-            # This is crucial for the conda decorator to work properly in the fallback environment
-            for deco in step.decorators:
-                if _is_env_deco(deco):
-                    deco.step_init(
-                        self.flow,
-                        None,  # not passing graph as it is not available, and not required by conda/pypi decorators
-                        step.name,
-                        step.decorators,
-                        self,
-                        self.datastore,
-                        echo,
-                    )
+                # Initialize the decorator we attached.
+                # This is crucial for the conda decorator to work properly in the fallback environment
+                for deco in step.decorators:
+                    if _is_env_deco(deco):
+                        deco.step_init(
+                            self.flow,
+                            None,  # not passing graph as it is not available, and not required by conda/pypi decorators
+                            step.name,
+                            step.decorators,
+                            self,
+                            self.datastore,
+                            echo,
+                        )
+
+                # Mixing @pypi/@conda in a single step is not supported yet
+                if sum(1 for deco in step.decorators if _is_env_deco(deco)) > 1:
+                    raise MetaflowException(
+                        "Mixing and matching PyPI packages and Conda packages within a\n"
+                        "step is not yet supported. Use one of @pypi or @conda only for the *%s* step."
+                        % step.name
+                    )
 
         steps_to_bake = [
-            step for step in self.flow if step.name not in self.skipped_steps
+            step
+            for step in self.flow
+            if step.name not in self.skipped_steps and not self.is_disabled(step)
         ]
         if steps_to_bake:
             self.logger("🚀 Baking container image(s) ...")
             start_time = time.time()
             self.results = self._bake(steps_to_bake)
-            for step in self.flow:
+            for step in steps_to_bake:
                 for d in step.decorators:
                     if _is_remote_deco(d):
                         d.attributes["image"] = self.results[step.name].container_image
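Note that the relocated per-step validation above now runs after environment decorators are attached, so implicitly attached ones are counted too. A minimal sketch of a flow that would trip it (flow name and package pins are hypothetical; assumes the stock @pypi/@conda decorators exported by metaflow):

from metaflow import FlowSpec, conda, pypi, step

class MixedEnvFlow(FlowSpec):
    # Both environment decorators on one step should raise MetaflowException
    # at environment-initialization time: "Mixing and matching PyPI packages
    # and Conda packages within a step is not yet supported."
    @pypi(packages={"requests": "2.31.0"})
    @conda(packages={"numpy": "1.26.4"})
    @step
    def start(self):
        self.next(self.end)

    @step
    def end(self):
        pass

if __name__ == "__main__":
    MixedEnvFlow()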
@@ -0,0 +1,157 @@
+from concurrent.futures import ThreadPoolExecutor
+import os
+import sys
+import time
+
+from metaflow.exception import MetaflowException
+from metaflow.metaflow_config import KUBERNETES_NAMESPACE
+
+
+CLIENT_REFRESH_INTERVAL_SECONDS = 300
+
+
+class KubernetesClientException(MetaflowException):
+    headline = "Kubernetes client error"
+
+
+class KubernetesClient(object):
+    def __init__(self):
+        try:
+            # Kubernetes is a soft dependency.
+            from kubernetes import client, config
+        except (NameError, ImportError):
+            raise KubernetesClientException(
+                "Could not import module 'kubernetes'.\n\nInstall Kubernetes "
+                "Python package (https://pypi.org/project/kubernetes/) first.\n"
+                "You can install the module by executing - "
+                "%s -m pip install kubernetes\n"
+                "or equivalent through your favorite Python package manager."
+                % sys.executable
+            )
+        self._refresh_client()
+        self._namespace = KUBERNETES_NAMESPACE
+
+    def _refresh_client(self):
+        from metaflow_extensions.outerbounds.plugins.auth_server import get_token
+        from kubernetes import client
+
+        config = client.Configuration()
+        token_info = get_token("/generate/k8s")
+        config.host = token_info["endpoint"]
+        config.api_key["authorization"] = "Bearer " + token_info["token"]
+        config.verify_ssl = False  # TODO: FIX THIS
+        client.Configuration.set_default(config)
+        self._client = client
+        self._client_refresh_timestamp = time.time()
+
+    def get(self):
+        if (
+            time.time() - self._client_refresh_timestamp
+            > CLIENT_REFRESH_INTERVAL_SECONDS
+        ):
+            self._refresh_client()
+
+        return self._client
+
+    def _find_active_pods(self, flow_name, run_id=None, user=None):
+        def _request(_continue=None):
+            # handle paginated responses
+            return self._client.CoreV1Api().list_namespaced_pod(
+                namespace=self._namespace,
+                # limited selector support for K8S api. We want to cover multiple statuses: Running / Pending / Unknown
+                field_selector="status.phase!=Succeeded,status.phase!=Failed",
+                limit=1000,
+                _continue=_continue,
+            )
+
+        results = _request()
+
+        if run_id is not None:
+            # handle argo prefixes in run_id
+            run_id = run_id[run_id.startswith("argo-") and len("argo-") :]
+
+        while results.metadata._continue or results.items:
+            for pod in results.items:
+                match = (
+                    # arbitrary pods might have no annotations at all.
+                    pod.metadata.annotations
+                    and pod.metadata.labels
+                    and (
+                        run_id is None
+                        or (pod.metadata.annotations.get("metaflow/run_id") == run_id)
+                        # we want to also match pods launched by argo-workflows
+                        or (
+                            pod.metadata.labels.get("workflows.argoproj.io/workflow")
+                            == run_id
+                        )
+                    )
+                    and (
+                        user is None
+                        or pod.metadata.annotations.get("metaflow/user") == user
+                    )
+                    and (
+                        pod.metadata.annotations.get("metaflow/flow_name") == flow_name
+                    )
+                )
+                if match:
+                    yield pod
+            if not results.metadata._continue:
+                break
+            results = _request(results.metadata._continue)
+
+    def list(self, flow_name, run_id, user):
+        results = self._find_active_pods(flow_name, run_id, user)
+
+        return list(results)
+
+    def kill_pods(self, flow_name, run_id, user, echo):
+        from kubernetes.stream import stream
+
+        api_instance = self._client.CoreV1Api()
+        job_api = self._client.BatchV1Api()
+        pods = self._find_active_pods(flow_name, run_id, user)
+
+        def _kill_pod(pod):
+            echo("Killing Kubernetes pod %s\n" % pod.metadata.name)
+            try:
+                stream(
+                    api_instance.connect_get_namespaced_pod_exec,
+                    name=pod.metadata.name,
+                    namespace=pod.metadata.namespace,
+                    command=[
+                        "/bin/sh",
+                        "-c",
+                        "/sbin/killall5",
+                    ],
+                    stderr=True,
+                    stdin=False,
+                    stdout=True,
+                    tty=False,
+                )
+            except Exception:
+                # best effort kill for pod can fail.
+                try:
+                    job_name = pod.metadata.labels.get("job-name", None)
+                    if job_name is None:
+                        raise Exception("Could not determine job name")
+                    job_api.patch_namespaced_job(
+                        name=job_name,
+                        namespace=pod.metadata.namespace,
+                        field_manager="metaflow",
+                        body={"spec": {"parallelism": 0}},
+                    )
+                except Exception as e:
+                    echo("failed to kill pod %s - %s" % (pod.metadata.name, str(e)))
+
+        with ThreadPoolExecutor() as executor:
+            executor.map(_kill_pod, list(pods))
+
+    def job(self, **kwargs):
+        from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
+
+        return KubernetesJob(self, **kwargs)
+
+    def jobset(self, **kwargs):
+        from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJobSet
+
+        return KubernetesJobSet(self, **kwargs)
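For orientation, a hedged usage sketch of the new client surface (flow name, run id, and user values are hypothetical; echo is any callable that accepts a message string):

from metaflow_extensions.outerbounds.plugins.kubernetes.kubernetes_client import (
    KubernetesClient,
)

client = KubernetesClient()

# Enumerate pods of a run that are still Pending / Running / Unknown;
# an "argo-" prefix on the run id is stripped before matching.
for pod in client.list(flow_name="MyFlow", run_id="argo-myflow-abc123", user=None):
    print(pod.metadata.name, pod.status.phase)

# Best-effort kill: exec /sbin/killall5 inside each matching pod, falling
# back to patching the owning Job's parallelism down to zero.
client.kill_pods("MyFlow", "argo-myflow-abc123", None, echo=print)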
@@ -8,26 +8,27 @@ from datetime import datetime, timezone
 from metaflow.exception import MetaflowException
 
 
-def kill_process_and_descendants(pid, termination_timeout=5):
-    try:
-        subprocess.check_call(["pkill", "-TERM", "-P", str(pid)])
-        subprocess.check_call(["kill", "-TERM", str(pid)])
-    except subprocess.CalledProcessError:
-        pass
+def kill_process_and_descendants(pid, termination_timeout=1, iterations=20, delay=0.5):
+    for i in range(iterations):
+        try:
+            subprocess.check_call(["pkill", "-TERM", "-P", str(pid)])
+            subprocess.check_call(["kill", "-TERM", str(pid)])
+
+            time.sleep(termination_timeout)
 
-    time.sleep(termination_timeout)
+            subprocess.check_call(["pkill", "-KILL", "-P", str(pid)])
+            subprocess.check_call(["kill", "-KILL", str(pid)])
+        except subprocess.CalledProcessError:
+            pass
 
-    try:
-        subprocess.check_call(["pkill", "-KILL", "-P", str(pid)])
-        subprocess.check_call(["kill", "-KILL", str(pid)])
-    except subprocess.CalledProcessError:
-        pass
+        # Don't delay after the last iteration
+        if i < iterations - 1:
+            time.sleep(delay)
 
 
 class HeartbeatStore(object):
     def __init__(
         self,
-        heartbeat_prefix,
         main_pid=None,
         storage_backend=None,
         emit_frequency=30,
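With the new defaults, the kill loop's worst-case runtime is iterations × (termination_timeout + delay) − delay = 20 × (1 + 0.5) − 0.5 = 29.5 seconds, versus the single 5-second TERM-then-KILL pass it replaces. A hedged usage sketch, assuming the function above is in scope (the child process is hypothetical):

import subprocess

# Spawn a long-running child, then TERM/KILL it and its descendants with
# the retry loop defined above.
proc = subprocess.Popen(["sleep", "300"])
kill_process_and_descendants(proc.pid)
proc.wait()  # reap the child; returncode reflects the delivered signal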
@@ -35,7 +36,6 @@ class HeartbeatStore(object):
         monitor_frequency=15,
         max_missed_heartbeats=3,
     ) -> None:
-        self.heartbeat_prefix = heartbeat_prefix
         self.main_pid = main_pid
         self.storage_backend = storage_backend
         self.emit_frequency = emit_frequency
@@ -44,8 +44,8 @@ class HeartbeatStore(object):
         self.max_missed_heartbeats = max_missed_heartbeats
         self.missed_heartbeats = 0
 
-    def emit_heartbeat(self, folder_name=None):
-        heartbeat_key = f"{self.heartbeat_prefix}/heartbeat"
+    def emit_heartbeat(self, heartbeat_prefix: str, folder_name=None):
+        heartbeat_key = f"{heartbeat_prefix}/heartbeat"
         if folder_name:
             heartbeat_key = f"{folder_name}/{heartbeat_key}"
@@ -63,8 +63,8 @@ class HeartbeatStore(object):
 
             time.sleep(self.emit_frequency)
 
-    def emit_tombstone(self, folder_name=None):
-        tombstone_key = f"{self.heartbeat_prefix}/tombstone"
+    def emit_tombstone(self, tombstone_prefix: str, folder_name=None):
+        tombstone_key = f"{tombstone_prefix}/tombstone"
         if folder_name:
             tombstone_key = f"{folder_name}/{tombstone_key}"
 
@@ -113,12 +113,12 @@ class HeartbeatStore(object):
             return False
         return True
 
-    def monitor(self, folder_name=None):
-        heartbeat_key = f"{self.heartbeat_prefix}/heartbeat"
+    def monitor(self, heartbeat_prefix: str, tombstone_prefix: str, folder_name=None):
+        heartbeat_key = f"{heartbeat_prefix}/heartbeat"
         if folder_name:
             heartbeat_key = f"{folder_name}/{heartbeat_key}"
 
-        tombstone_key = f"{self.heartbeat_prefix}/tombstone"
+        tombstone_key = f"{tombstone_prefix}/tombstone"
         if folder_name:
             tombstone_key = f"{folder_name}/{tombstone_key}"
 
@@ -162,12 +162,17 @@ if __name__ == "__main__":
     storage = datastores[0](datastore_sysroot)
 
     heartbeat_prefix = f"{os.getenv('MF_PATHSPEC')}/{os.getenv('MF_ATTEMPT')}"
+    flow_name, run_id, _, _ = os.getenv("MF_PATHSPEC").split("/")
+    tombstone_prefix = f"{flow_name}/{run_id}"
 
     store = HeartbeatStore(
-        heartbeat_prefix=heartbeat_prefix,
         main_pid=int(main_pid),
         storage_backend=storage,
         max_missed_heartbeats=int(NVIDIA_HEARTBEAT_THRESHOLD),
     )
 
-    store.monitor(folder_name=folder_name)
+    store.monitor(
+        heartbeat_prefix=heartbeat_prefix,
+        tombstone_prefix=tombstone_prefix,
+        folder_name=folder_name,
+    )
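The net effect of the prefix split above: heartbeat keys stay per task attempt, while tombstone keys become per run, so a single tombstone fans out to every task of the run. An illustration with hypothetical values:

pathspec = "MyFlow/42/train/7"   # MF_PATHSPEC
attempt = "0"                    # MF_ATTEMPT

heartbeat_prefix = f"{pathspec}/{attempt}"     # MyFlow/42/train/7/0
flow_name, run_id, _, _ = pathspec.split("/")
tombstone_prefix = f"{flow_name}/{run_id}"     # MyFlow/42

# With folder_name="nvcf_heartbeats", monitor() then polls:
#   nvcf_heartbeats/MyFlow/42/train/7/0/heartbeat   (written by the task)
#   nvcf_heartbeats/MyFlow/42/tombstone             (written by 'kill')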
@@ -185,13 +185,17 @@ class Job(object):
         )
 
         store = HeartbeatStore(
-            heartbeat_prefix=heartbeat_prefix,
             main_pid=None,
             storage_backend=backend,
         )
 
         self.heartbeat_thread = threading.Thread(
-            target=store.emit_heartbeat, args=("nvcf_heartbeats",), daemon=True
+            target=store.emit_heartbeat,
+            args=(
+                heartbeat_prefix,
+                "nvcf_heartbeats",
+            ),
+            daemon=True,
         )
         self.heartbeat_thread.start()
 
@@ -56,23 +56,21 @@ def list(ctx, run_id):
     flow_name = ctx.obj.flow.name
     run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
     running_invocations = []
+
     for each_step in run_obj:
-        if (
-            not each_step.task.finished
-            and "nvcf-function-id" in each_step.task.metadata_dict
-        ):
-            task_pathspec = each_step.task.pathspec
-            attempt = each_step.task.metadata_dict.get("attempt")
-            flow_name, run_id, step_name, task_id = task_pathspec.split("/")
-            running_invocations.append(
-                f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
-            )
+        for each_task in each_step:
+            if not each_task.finished and "nvcf-function-id" in each_task.metadata_dict:
+
+                task_pathspec = each_task.pathspec
+                attempt = each_task.metadata_dict.get("attempt")
+                flow_name, run_id, step_name, task_id = task_pathspec.split("/")
+                running_invocations.append(
+                    f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
+                )
 
     if running_invocations:
         for each_invocation in running_invocations:
             ctx.obj.echo(each_invocation)
-    else:
-        ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
 
 
 @nvidia.command(help="Kill steps / tasks running as an nvidia job.")
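The rewritten loop above matters for fan-outs: in the Metaflow client, step.task yields a single task, so the old code could miss siblings in a foreach split, while iterating a Step object yields every task. A minimal sketch of the new traversal (run pathspec hypothetical):

from metaflow import Run

run = Run("MyFlow/42", _namespace_check=False)
for step in run:
    for task in step:  # all tasks of the step, not just one
        if not task.finished and "nvcf-function-id" in task.metadata_dict:
            print(task.pathspec)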
@@ -88,29 +86,17 @@ def kill(ctx, run_id):
         HeartbeatStore,
     )
 
-    flow_name = ctx.obj.flow.name
-    run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
-
-    for each_step in run_obj:
-        if (
-            not each_step.task.finished
-            and "nvcf-function-id" in each_step.task.metadata_dict
-        ):
-            task_pathspec = each_step.task.pathspec
-            attempt = each_step.task.metadata_dict.get("attempt")
-            heartbeat_prefix = "{task_pathspec}/{attempt}".format(
-                task_pathspec=task_pathspec, attempt=attempt
-            )
+    datastore_root = ctx.obj.datastore_impl.datastore_root
+    store = HeartbeatStore(
+        main_pid=None,
+        storage_backend=ctx.obj.datastore_impl(datastore_root),
+    )
 
-            datastore_root = ctx.obj.datastore_impl.datastore_root
-            store = HeartbeatStore(
-                heartbeat_prefix=heartbeat_prefix,
-                main_pid=None,
-                storage_backend=ctx.obj.datastore_impl(datastore_root),
-            )
-            store.emit_tombstone(folder_name="nvcf_heartbeats")
-        else:
-            ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
+    flow_name = ctx.obj.flow.name
+    tombstone_prefix = f"{flow_name}/{run_id}"
+    store.emit_tombstone(
+        tombstone_prefix=tombstone_prefix, folder_name="nvcf_heartbeats"
+    )
 
 
 @nvidia.command(
@@ -31,7 +31,7 @@ from metaflow.mflog import (
 )
 
 from .snowpark_client import SnowparkClient
-from .snowpark_exceptions import SnowparkException
+from .snowpark_exceptions import SnowparkException, SnowparkKilledException
 from .snowpark_job import SnowparkJob
 
 # Redirect structured logs to $PWD/.logs/
@@ -291,7 +291,7 @@ class Snowpark(object):
         else:
             if self.job.is_running:
                 # Kill the job if it is still running by throwing an exception.
-                raise SnowparkException("Task failed!")
+                raise SnowparkKilledException("Task failed!")
             echo(
                 "Task finished with message '%s'." % self.job.message,
                 "stderr",
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ob-metaflow-extensions
-Version: 1.1.90
+Version: 1.1.92
 Summary: Outerbounds Platform Extensions for Metaflow
 Author: Outerbounds, Inc.
 License: Commercial
@@ -0,0 +1,3 @@
+boto3
+kubernetes
+ob-metaflow==2.12.20.1
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
 from pathlib import Path
 
 
-version = "1.1.90"
+version = "1.1.92"
 this_directory = Path(__file__).parent
 long_description = (this_directory / "README.md").read_text()
 
@@ -18,6 +18,6 @@ setup(
     install_requires=[
         "boto3",
         "kubernetes",
-        "ob-metaflow == 2.12.19.1",
+        "ob-metaflow == 2.12.20.1",
     ],
 )
@@ -1,61 +0,0 @@
-import os
-import sys
-import time
-
-from metaflow.exception import MetaflowException
-
-
-CLIENT_REFRESH_INTERVAL_SECONDS = 300
-
-
-class KubernetesClientException(MetaflowException):
-    headline = "Kubernetes client error"
-
-
-class KubernetesClient(object):
-    def __init__(self):
-        try:
-            # Kubernetes is a soft dependency.
-            from kubernetes import client, config
-        except (NameError, ImportError):
-            raise KubernetesClientException(
-                "Could not import module 'kubernetes'.\n\nInstall Kubernetes "
-                "Python package (https://pypi.org/project/kubernetes/) first.\n"
-                "You can install the module by executing - "
-                "%s -m pip install kubernetes\n"
-                "or equivalent through your favorite Python package manager."
-                % sys.executable
-            )
-        self._refresh_client()
-
-    def _refresh_client(self):
-        from metaflow_extensions.outerbounds.plugins.auth_server import get_token
-        from kubernetes import client
-
-        config = client.Configuration()
-        token_info = get_token("/generate/k8s")
-        config.host = token_info["endpoint"]
-        config.api_key["authorization"] = "Bearer " + token_info["token"]
-        config.verify_ssl = False  # TODO: FIX THIS
-        client.Configuration.set_default(config)
-        self._client = client
-        self._client_refresh_timestamp = time.time()
-
-    def get(self):
-        if (
-            time.time() - self._client_refresh_timestamp
-            > CLIENT_REFRESH_INTERVAL_SECONDS
-        ):
-            self._refresh_client()
-
-        return self._client
-
-    def job(self, **kwargs):
-        from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJob
-
-        return KubernetesJob(self, **kwargs)
-
-    def jobset(self, **kwargs):
-        from metaflow.plugins.kubernetes.kubernetes_job import KubernetesJobSet
-
-        return KubernetesJobSet(self, **kwargs)
@@ -1,3 +0,0 @@
-boto3
-kubernetes
-ob-metaflow==2.12.19.1