ob-metaflow-extensions 1.1.124__tar.gz → 1.1.125__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (55)
  1. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/PKG-INFO +1 -1
  2. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +11 -0
  3. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +60 -30
  4. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +5 -0
  5. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +13 -1
  6. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
  7. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/setup.py +1 -1
  8. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/README.md +0 -0
  9. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/__init__.py +0 -0
  10. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
  11. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
  12. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
  13. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  14. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +0 -0
  15. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
  16. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
  17. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
  18. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
  19. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
  20. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
  21. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
  22. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  23. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/constants.py +0 -0
  24. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
  25. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/utils.py +0 -0
  26. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
  27. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +0 -0
  28. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +0 -0
  29. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  30. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/secrets/secrets.py +0 -0
  31. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +0 -0
  32. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +0 -0
  33. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  34. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
  35. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
  36. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
  37. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
  38. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
  39. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
  40. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
  41. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
  42. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
  43. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
  44. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/remote_config.py +0 -0
  45. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
  46. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
  47. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
  48. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
  49. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
  50. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +0 -0
  51. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -0
  52. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
  53. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
  54. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
  55. {ob-metaflow-extensions-1.1.124 → ob-metaflow-extensions-1.1.125}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.124
3
+ Version: 1.1.125
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -79,5 +79,16 @@ class NvcfTimeoutTooShortException(MetaflowException):
79
79
  super(NvcfTimeoutTooShortException, self).__init__(msg)
80
80
 
81
81
 
82
+ class NvcfQueueTimeoutTooShortException(MetaflowException):
83
+ headline = "[@nvidia NvcfQueueTimeoutTooShortException] Queue Timeout too short"
84
+
85
+ def __init__(self, step):
86
+ msg = (
87
+ "The queue timeout for step *{step}* should be at least 60 seconds for "
88
+ "execution with @nvidia".format(step=step)
89
+ )
90
+ super(NvcfQueueTimeoutTooShortException, self).__init__(msg)
91
+
92
+
82
93
  class NvcfKilledException(MetaflowException):
83
94
  headline = "Nvidia job killed"
@@ -23,29 +23,53 @@ STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
23
23
  STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
24
24
 
25
25
 
26
- RETRIABLE_STATUS_CODES = [500]
27
-
28
-
29
- def retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=1):
26
+ def retry_on_status(status_codes=[500], max_retries=3, delay=1):
30
27
  def decorator(func):
31
28
  @wraps(func)
32
- def wrapper(*args, **kwargs):
29
+ def wrapper(instance, *args, **kwargs):
33
30
  retries = 0
34
- while retries <= max_retries:
31
+
32
+ # Determine retry limit upfront
33
+ use_queue_timeout = 504 in status_codes
34
+ if use_queue_timeout:
35
+ poll_seconds = int(instance._poll_seconds)
36
+ retry_limit = (
37
+ instance._queue_timeout + (poll_seconds - 1)
38
+ ) // poll_seconds
39
+ remainder = instance._queue_timeout % poll_seconds
40
+ last_timeout = remainder if remainder != 0 else poll_seconds
41
+ else:
42
+ retry_limit = max_retries
43
+
44
+ while retries < retry_limit:
35
45
  try:
36
- return func(*args, **kwargs)
46
+ return func(instance, *args, **kwargs)
37
47
  except HTTPError as e:
38
- if e.code in status_codes and retries < max_retries:
39
- retries += 1
40
- print(
41
- f"[@nvidia] Received {e.code} error, retrying ({retries}/{max_retries})..."
42
- )
48
+ if e.code not in status_codes or retries >= retry_limit:
49
+ instance._status = JobStatus.FAILED
50
+ if e.code == 504 and retries >= retry_limit:
51
+ raise NvcfPollingConnectionError(
52
+ "Request timed out after all retries"
53
+ )
54
+ raise
55
+
56
+ if e.code == 504 and retries == retry_limit - 1:
57
+ instance._poll_seconds = str(last_timeout)
58
+
59
+ print(
60
+ f"[@nvidia] {'Queue timeout' if e.code == 504 else f'Received {e.code}'}, "
61
+ f"retrying ({retries + 1}/{retry_limit})... with poll seconds as {instance._poll_seconds}"
62
+ )
63
+
64
+ if e.code != 504:
43
65
  time.sleep(delay)
44
- continue
45
- raise
46
- except Exception:
66
+
67
+ retries += 1
68
+ except URLError as e:
69
+ instance._status = JobStatus.FAILED
47
70
  raise
48
- return func(*args, **kwargs)
71
+ # final attempt
72
+ return func(instance, *args, **kwargs)
49
73
 
50
74
  return wrapper
51
75
 
@@ -53,12 +77,15 @@ def retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=1)
53
77
 
54
78
 
55
79
  class Nvcf(object):
56
- def __init__(self, metadata, datastore, environment, function_id, ngc_api_key):
80
+ def __init__(
81
+ self, metadata, datastore, environment, function_id, ngc_api_key, queue_timeout
82
+ ):
57
83
  self.metadata = metadata
58
84
  self.datastore = datastore
59
85
  self.environment = environment
60
86
  self._function_id = function_id
61
87
  self._ngc_api_key = ngc_api_key
88
+ self._queue_timeout = queue_timeout
62
89
 
63
90
  def launch_job(
64
91
  self,
@@ -120,6 +147,7 @@ class Nvcf(object):
120
147
  self.datastore._storage_impl,
121
148
  self._function_id,
122
149
  self._ngc_api_key,
150
+ self._queue_timeout,
123
151
  )
124
152
  self.job.submit()
125
153
 
@@ -172,7 +200,9 @@ result_endpoint = f"{nvcf_url}/v2/nvcf/pexec/status"
172
200
 
173
201
 
174
202
  class Job(object):
175
- def __init__(self, command, env, task_spec, backend, function_id, ngc_api_key):
203
+ def __init__(
204
+ self, command, env, task_spec, backend, function_id, ngc_api_key, queue_timeout
205
+ ):
176
206
  self._payload = {
177
207
  "command": command,
178
208
  "env": {k: v for k, v in env.items() if v is not None},
@@ -180,6 +210,8 @@ class Job(object):
180
210
  self._result = {}
181
211
  self._function_id = function_id
182
212
  self._ngc_api_key = ngc_api_key
213
+ self._queue_timeout = queue_timeout
214
+ self._poll_seconds = "3600"
183
215
 
184
216
  flow_name = task_spec.get("flow_name")
185
217
  run_id = task_spec.get("run_id")
@@ -214,11 +246,14 @@ class Job(object):
214
246
  )
215
247
  self.heartbeat_thread.start()
216
248
 
249
+ @retry_on_status(status_codes=[504])
217
250
  def submit(self):
218
251
  try:
219
252
  headers = {
220
253
  "Authorization": f"Bearer {self._ngc_api_key}",
221
254
  "Content-Type": "application/json",
255
+ "nvcf-feature-enable-gateway-timeout": "true",
256
+ "NVCF-POLL-SECONDS": self._poll_seconds,
222
257
  }
223
258
  request_data = json.dumps(self._payload).encode()
224
259
  request = Request(
@@ -239,12 +274,9 @@ class Job(object):
239
274
  self._status = JobStatus.SUBMITTED
240
275
  else:
241
276
  self._status = JobStatus.FAILED
242
- # TODO: Handle 404s nicely
243
- except (HTTPError, URLError) as e:
244
- # TODO: If queue is full, wait in line and retry?
245
- # without that, branching over concurrent requests causes error.
246
- self._state = JobStatus.FAILED
247
- raise e
277
+ except URLError:
278
+ self._status = JobStatus.FAILED
279
+ raise
248
280
 
249
281
  @property
250
282
  def status(self):
@@ -272,12 +304,15 @@ class Job(object):
272
304
  def result(self):
273
305
  return self._result
274
306
 
275
- @retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=5)
307
+ @retry_on_status(status_codes=[500], max_retries=3, delay=5)
308
+ @retry_on_status(status_codes=[504])
276
309
  def _poll(self):
277
310
  try:
278
311
  headers = {
279
312
  "Authorization": f"Bearer {self._ngc_api_key}",
280
313
  "Content-Type": "application/json",
314
+ "nvcf-feature-enable-gateway-timeout": "true",
315
+ "NVCF-POLL-SECONDS": self._poll_seconds,
281
316
  }
282
317
  request = Request(
283
318
  f"{result_endpoint}/{self._invocation_id}", headers=headers
@@ -296,11 +331,6 @@ class Job(object):
296
331
  f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
297
332
  )
298
333
  self._status = JobStatus.FAILED
299
- # 4xx and 5xx responses go in 'except' block
300
- except HTTPError as e:
301
- if e.code not in RETRIABLE_STATUS_CODES:
302
- self._status = JobStatus.FAILED
303
- raise
304
334
  except URLError:
305
335
  self._status = JobStatus.FAILED
306
336
  raise
@@ -111,6 +111,9 @@ def kill(ctx, run_id):
111
111
  @click.argument("code-package-url")
112
112
  @click.option("--function-id", help="NVCF function id.")
113
113
  @click.option("--ngc-api-key", help="NGC API key.")
114
+ @click.option(
115
+ "--queue-timeout", default=5 * 24 * 3600, help="Queue timeout in seconds."
116
+ )
114
117
  @click.option("--run-id", help="Passed to the top-level 'step'.")
115
118
  @click.option("--task-id", help="Passed to the top-level 'step'.")
116
119
  @click.option("--input-paths", help="Passed to the top-level 'step'.")
@@ -133,6 +136,7 @@ def step(
133
136
  code_package_url,
134
137
  function_id,
135
138
  ngc_api_key,
139
+ queue_timeout,
136
140
  **kwargs,
137
141
  ):
138
142
  def echo(msg, stream="stderr", _id=None, **kwargs):
@@ -249,6 +253,7 @@ def step(
249
253
  ctx.obj.environment,
250
254
  function_id,
251
255
  ngc_api_key,
256
+ queue_timeout,
252
257
  )
253
258
  try:
254
259
  with ctx.obj.monitor.measure("metaflow.nvcf.launch_job"):
@@ -20,6 +20,7 @@ from .exceptions import (
20
20
  UnsupportedNvcfConfigurationException,
21
21
  UnsupportedNvcfDatastoreException,
22
22
  NvcfTimeoutTooShortException,
23
+ NvcfQueueTimeoutTooShortException,
23
24
  )
24
25
 
25
26
  from metaflow.metaflow_config import SERVICE_URL
@@ -36,10 +37,16 @@ class NvcfDecorator(StepDecorator):
36
37
  Number of GPUs to use.
37
38
  gpu_type : str
38
39
  Type of Nvidia GPU to use.
40
+ queue_timeout : int
41
+ Time to keep the job in NVCF's queue.
39
42
  """
40
43
 
41
44
  name = "nvidia"
42
- defaults = {"gpu": 1, "gpu_type": None}
45
+ defaults = {
46
+ "gpu": 1,
47
+ "gpu_type": None,
48
+ "queue_timeout": 5 * 24 * 3600, # Default 5 days in seconds
49
+ }
43
50
 
44
51
  package_url = None
45
52
  package_sha = None
@@ -118,6 +125,10 @@ class NvcfDecorator(StepDecorator):
118
125
  )
119
126
  self.attributes["function_id"] = available_configurations[desired_configuration]
120
127
 
128
+ queue_timeout = self.attributes["queue_timeout"]
129
+ if not isinstance(queue_timeout, int) or queue_timeout < 60:
130
+ raise NvcfQueueTimeoutTooShortException(step)
131
+
121
132
  def runtime_init(self, flow, graph, package, run_id):
122
133
  # Set some more internal state.
123
134
  self.flow = flow
@@ -144,6 +155,7 @@ class NvcfDecorator(StepDecorator):
144
155
  cli_options = {
145
156
  "function_id": self.attributes["function_id"],
146
157
  "ngc_api_key": self.attributes["ngc_api_key"],
158
+ "queue_timeout": self.attributes["queue_timeout"],
147
159
  }
148
160
  cli_args.command_options.update(cli_options)
149
161
  cli_args.entrypoint[0] = sys.executable
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.124
3
+ Version: 1.1.125
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
2
2
  from pathlib import Path
3
3
 
4
4
 
5
- version = "1.1.124"
5
+ version = "1.1.125"
6
6
  this_directory = Path(__file__).parent
7
7
  long_description = (this_directory / "README.md").read_text()
8
8