ob-metaflow-extensions 1.1.90__tar.gz → 1.1.91__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (44)
  1. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/PKG-INFO +1 -1
  2. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +28 -23
  3. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +6 -2
  4. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +20 -34
  5. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +2 -2
  6. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
  7. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/setup.py +1 -1
  8. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/README.md +0 -0
  9. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/__init__.py +0 -0
  10. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
  11. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
  12. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
  13. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  14. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +0 -0
  15. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
  16. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
  17. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
  18. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
  19. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
  20. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
  21. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
  22. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  23. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
  24. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
  25. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  26. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
  27. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
  28. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
  29. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
  30. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
  31. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
  32. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
  33. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
  34. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/remote_config.py +0 -0
  35. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
  36. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
  37. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
  38. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
  39. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
  40. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -0
  41. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
  42. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
  43. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
  44. {ob-metaflow-extensions-1.1.90 → ob-metaflow-extensions-1.1.91}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.90
3
+ Version: 1.1.91
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -8,26 +8,27 @@ from datetime import datetime, timezone
8
8
  from metaflow.exception import MetaflowException
9
9
 
10
10
 
11
- def kill_process_and_descendants(pid, termination_timeout=5):
12
- try:
13
- subprocess.check_call(["pkill", "-TERM", "-P", str(pid)])
14
- subprocess.check_call(["kill", "-TERM", str(pid)])
15
- except subprocess.CalledProcessError:
16
- pass
11
+ def kill_process_and_descendants(pid, termination_timeout=1, iterations=20, delay=0.5):
12
+ for i in range(iterations):
13
+ try:
14
+ subprocess.check_call(["pkill", "-TERM", "-P", str(pid)])
15
+ subprocess.check_call(["kill", "-TERM", str(pid)])
16
+
17
+ time.sleep(termination_timeout)
17
18
 
18
- time.sleep(termination_timeout)
19
+ subprocess.check_call(["pkill", "-KILL", "-P", str(pid)])
20
+ subprocess.check_call(["kill", "-KILL", str(pid)])
21
+ except subprocess.CalledProcessError:
22
+ pass
19
23
 
20
- try:
21
- subprocess.check_call(["pkill", "-KILL", "-P", str(pid)])
22
- subprocess.check_call(["kill", "-KILL", str(pid)])
23
- except subprocess.CalledProcessError:
24
- pass
24
+ # Don't delay after the last iteration
25
+ if i < iterations - 1:
26
+ time.sleep(delay)
25
27
 
26
28
 
27
29
  class HeartbeatStore(object):
28
30
  def __init__(
29
31
  self,
30
- heartbeat_prefix,
31
32
  main_pid=None,
32
33
  storage_backend=None,
33
34
  emit_frequency=30,
@@ -35,7 +36,6 @@ class HeartbeatStore(object):
35
36
  monitor_frequency=15,
36
37
  max_missed_heartbeats=3,
37
38
  ) -> None:
38
- self.heartbeat_prefix = heartbeat_prefix
39
39
  self.main_pid = main_pid
40
40
  self.storage_backend = storage_backend
41
41
  self.emit_frequency = emit_frequency
@@ -44,8 +44,8 @@ class HeartbeatStore(object):
44
44
  self.max_missed_heartbeats = max_missed_heartbeats
45
45
  self.missed_heartbeats = 0
46
46
 
47
- def emit_heartbeat(self, folder_name=None):
48
- heartbeat_key = f"{self.heartbeat_prefix}/heartbeat"
47
+ def emit_heartbeat(self, heartbeat_prefix: str, folder_name=None):
48
+ heartbeat_key = f"{heartbeat_prefix}/heartbeat"
49
49
  if folder_name:
50
50
  heartbeat_key = f"{folder_name}/{heartbeat_key}"
51
51
 
@@ -63,8 +63,8 @@ class HeartbeatStore(object):
63
63
 
64
64
  time.sleep(self.emit_frequency)
65
65
 
66
- def emit_tombstone(self, folder_name=None):
67
- tombstone_key = f"{self.heartbeat_prefix}/tombstone"
66
+ def emit_tombstone(self, tombstone_prefix: str, folder_name=None):
67
+ tombstone_key = f"{tombstone_prefix}/tombstone"
68
68
  if folder_name:
69
69
  tombstone_key = f"{folder_name}/{tombstone_key}"
70
70
 
@@ -113,12 +113,12 @@ class HeartbeatStore(object):
113
113
  return False
114
114
  return True
115
115
 
116
- def monitor(self, folder_name=None):
117
- heartbeat_key = f"{self.heartbeat_prefix}/heartbeat"
116
+ def monitor(self, heartbeat_prefix: str, tombstone_prefix: str, folder_name=None):
117
+ heartbeat_key = f"{heartbeat_prefix}/heartbeat"
118
118
  if folder_name:
119
119
  heartbeat_key = f"{folder_name}/{heartbeat_key}"
120
120
 
121
- tombstone_key = f"{self.heartbeat_prefix}/tombstone"
121
+ tombstone_key = f"{tombstone_prefix}/tombstone"
122
122
  if folder_name:
123
123
  tombstone_key = f"{folder_name}/{tombstone_key}"
124
124
 
@@ -162,12 +162,17 @@ if __name__ == "__main__":
162
162
  storage = datastores[0](datastore_sysroot)
163
163
 
164
164
  heartbeat_prefix = f"{os.getenv('MF_PATHSPEC')}/{os.getenv('MF_ATTEMPT')}"
165
+ flow_name, run_id, _, _ = os.getenv("MF_PATHSPEC").split("/")
166
+ tombstone_prefix = f"{flow_name}/{run_id}"
165
167
 
166
168
  store = HeartbeatStore(
167
- heartbeat_prefix=heartbeat_prefix,
168
169
  main_pid=int(main_pid),
169
170
  storage_backend=storage,
170
171
  max_missed_heartbeats=int(NVIDIA_HEARTBEAT_THRESHOLD),
171
172
  )
172
173
 
173
- store.monitor(folder_name=folder_name)
174
+ store.monitor(
175
+ heartbeat_prefix=heartbeat_prefix,
176
+ tombstone_prefix=tombstone_prefix,
177
+ folder_name=folder_name,
178
+ )
@@ -185,13 +185,17 @@ class Job(object):
185
185
  )
186
186
 
187
187
  store = HeartbeatStore(
188
- heartbeat_prefix=heartbeat_prefix,
189
188
  main_pid=None,
190
189
  storage_backend=backend,
191
190
  )
192
191
 
193
192
  self.heartbeat_thread = threading.Thread(
194
- target=store.emit_heartbeat, args=("nvcf_heartbeats",), daemon=True
193
+ target=store.emit_heartbeat,
194
+ args=(
195
+ heartbeat_prefix,
196
+ "nvcf_heartbeats",
197
+ ),
198
+ daemon=True,
195
199
  )
196
200
  self.heartbeat_thread.start()
197
201
 
@@ -56,23 +56,21 @@ def list(ctx, run_id):
56
56
  flow_name = ctx.obj.flow.name
57
57
  run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
58
58
  running_invocations = []
59
+
59
60
  for each_step in run_obj:
60
- if (
61
- not each_step.task.finished
62
- and "nvcf-function-id" in each_step.task.metadata_dict
63
- ):
64
- task_pathspec = each_step.task.pathspec
65
- attempt = each_step.task.metadata_dict.get("attempt")
66
- flow_name, run_id, step_name, task_id = task_pathspec.split("/")
67
- running_invocations.append(
68
- f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
69
- )
61
+ for each_task in each_step:
62
+ if not each_task.finished and "nvcf-function-id" in each_task.metadata_dict:
63
+
64
+ task_pathspec = each_task.pathspec
65
+ attempt = each_task.metadata_dict.get("attempt")
66
+ flow_name, run_id, step_name, task_id = task_pathspec.split("/")
67
+ running_invocations.append(
68
+ f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
69
+ )
70
70
 
71
71
  if running_invocations:
72
72
  for each_invocation in running_invocations:
73
73
  ctx.obj.echo(each_invocation)
74
- else:
75
- ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
76
74
 
77
75
 
78
76
  @nvidia.command(help="Kill steps / tasks running as an nvidia job.")
@@ -88,29 +86,17 @@ def kill(ctx, run_id):
88
86
  HeartbeatStore,
89
87
  )
90
88
 
91
- flow_name = ctx.obj.flow.name
92
- run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
93
-
94
- for each_step in run_obj:
95
- if (
96
- not each_step.task.finished
97
- and "nvcf-function-id" in each_step.task.metadata_dict
98
- ):
99
- task_pathspec = each_step.task.pathspec
100
- attempt = each_step.task.metadata_dict.get("attempt")
101
- heartbeat_prefix = "{task_pathspec}/{attempt}".format(
102
- task_pathspec=task_pathspec, attempt=attempt
103
- )
89
+ datastore_root = ctx.obj.datastore_impl.datastore_root
90
+ store = HeartbeatStore(
91
+ main_pid=None,
92
+ storage_backend=ctx.obj.datastore_impl(datastore_root),
93
+ )
104
94
 
105
- datastore_root = ctx.obj.datastore_impl.datastore_root
106
- store = HeartbeatStore(
107
- heartbeat_prefix=heartbeat_prefix,
108
- main_pid=None,
109
- storage_backend=ctx.obj.datastore_impl(datastore_root),
110
- )
111
- store.emit_tombstone(folder_name="nvcf_heartbeats")
112
- else:
113
- ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
95
+ flow_name = ctx.obj.flow.name
96
+ tombstone_prefix = f"{flow_name}/{run_id}"
97
+ store.emit_tombstone(
98
+ tombstone_prefix=tombstone_prefix, folder_name="nvcf_heartbeats"
99
+ )
114
100
 
115
101
 
116
102
  @nvidia.command(
@@ -31,7 +31,7 @@ from metaflow.mflog import (
31
31
  )
32
32
 
33
33
  from .snowpark_client import SnowparkClient
34
- from .snowpark_exceptions import SnowparkException
34
+ from .snowpark_exceptions import SnowparkException, SnowparkKilledException
35
35
  from .snowpark_job import SnowparkJob
36
36
 
37
37
  # Redirect structured logs to $PWD/.logs/
@@ -291,7 +291,7 @@ class Snowpark(object):
291
291
  else:
292
292
  if self.job.is_running:
293
293
  # Kill the job if it is still running by throwing an exception.
294
- raise SnowparkException("Task failed!")
294
+ raise SnowparkKilledException("Task failed!")
295
295
  echo(
296
296
  "Task finished with message '%s'." % self.job.message,
297
297
  "stderr",
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.90
3
+ Version: 1.1.91
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
2
2
  from pathlib import Path
3
3
 
4
4
 
5
- version = "1.1.90"
5
+ version = "1.1.91"
6
6
  this_directory = Path(__file__).parent
7
7
  long_description = (this_directory / "README.md").read_text()
8
8