ob-metaflow-extensions 1.1.122__py2.py3-none-any.whl → 1.1.123__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -1,8 +1,9 @@
1
1
  import json
2
2
  import os
3
+ import time
3
4
  import threading
4
- from urllib.parse import urlparse
5
5
  from urllib.request import HTTPError, Request, URLError, urlopen
6
+ from functools import wraps
6
7
 
7
8
  from metaflow import util
8
9
  from metaflow.mflog import (
@@ -12,7 +13,6 @@ from metaflow.mflog import (
12
13
  tail_logs,
13
14
  get_log_tailer,
14
15
  )
15
- import requests
16
16
  from .exceptions import NvcfJobFailedException, NvcfPollingConnectionError
17
17
 
18
18
  # Redirect structured logs to $PWD/.logs/
@@ -23,6 +23,35 @@ STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
23
23
  STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
24
24
 
25
25
 
26
# HTTP status codes that `retry_on_status` retries when no explicit
# `status_codes` argument is supplied.
RETRIABLE_STATUS_CODES = [500]


def retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=1):
    """Build a decorator that retries a call when it raises a retriable HTTPError.

    The decorated callable is invoked up to ``max_retries + 1`` times. After an
    ``HTTPError`` whose ``code`` is in ``status_codes``, a notice is printed, the
    wrapper sleeps for ``delay`` seconds and the call is repeated. Any other
    exception, an ``HTTPError`` with a different code, or a retriable
    ``HTTPError`` raised by the final attempt propagates unchanged.

    Args:
        status_codes: Iterable of HTTP status codes that should trigger a retry.
        max_retries: Number of retries allowed after the first attempt. Zero or
            a negative value means the callable is invoked exactly once.
        delay: Seconds to sleep before each retry.

    Returns:
        A decorator that wraps a callable with the retry behaviour above.
    """
    # Snapshot the codes when the decorator is built so that later mutation of
    # a shared list (such as the module-level default) cannot change the
    # behaviour of an already-decorated function.
    retriable_codes = tuple(status_codes)

    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(1, max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except HTTPError as e:
                    if e.code not in retriable_codes:
                        raise
                    print(
                        f"[@nvidia] Received {e.code} error, retrying ({attempt}/{max_retries})..."
                    )
                    time.sleep(delay)
            # Final attempt (or the only one when max_retries <= 0): nothing is
            # caught here, so whatever it raises reaches the caller.
            return func(*args, **kwargs)

        return wrapper

    return decorator
53
+
54
+
26
55
  class Nvcf(object):
27
56
  def __init__(self, metadata, datastore, environment, function_id, ngc_api_key):
28
57
  self.metadata = metadata
@@ -220,7 +249,11 @@ class Job(object):
220
249
  @property
221
250
  def status(self):
222
251
  if self._status not in [JobStatus.SUCCESSFUL, JobStatus.FAILED]:
223
- self._poll()
252
+ try:
253
+ self._poll()
254
+ except (HTTPError, URLError) as e:
255
+ self._status = JobStatus.FAILED
256
+ raise NvcfPollingConnectionError(e)
224
257
  return self._status
225
258
 
226
259
  @property
@@ -239,6 +272,7 @@ class Job(object):
239
272
  def result(self):
240
273
  return self._result
241
274
 
275
+ @retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=5)
242
276
  def _poll(self):
243
277
  try:
244
278
  headers = {
@@ -257,11 +291,16 @@ class Job(object):
257
291
  else:
258
292
  self._status = JobStatus.FAILED
259
293
  self._result = data
260
- elif response.getcode() in [400, 500]:
261
- self._status = JobStatus.FAILED
262
294
  elif response.getcode() != 202:
263
295
  print(
264
296
  f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
265
297
  )
266
- except (HTTPError, URLError) as e:
267
- raise NvcfPollingConnectionError(e)
298
+ self._status = JobStatus.FAILED
299
+ # 4xx and 5xx responses go in 'except' block
300
+ except HTTPError as e:
301
+ if e.code not in RETRIABLE_STATUS_CODES:
302
+ self._status = JobStatus.FAILED
303
+ raise
304
+ except URLError:
305
+ self._status = JobStatus.FAILED
306
+ raise
@@ -207,7 +207,7 @@ class RunningJob(object):
207
207
 
208
208
  @property
209
209
  def is_waiting(self):
210
- return self.status == "PENDING"
210
+ return self.status in ["PENDING", "UNKNOWN"]
211
211
 
212
212
  @property
213
213
  def is_running(self):
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.122
3
+ Version: 1.1.123
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
7
7
  Description-Content-Type: text/markdown
8
8
  Requires-Dist: boto3
9
9
  Requires-Dist: kubernetes
10
- Requires-Dist: ob-metaflow (==2.13.7.1)
10
+ Requires-Dist: ob-metaflow (==2.13.8.1)
11
11
 
12
12
  # Outerbounds platform package
13
13
 
@@ -17,7 +17,7 @@ metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TI
17
17
  metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=aGHdNw_hqBu8i0zWXcatQM6e769wUXox0l8g0f6fNZ8,146
18
18
  metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256=Wn5WvE_sY-L2jEz-iObMLii5Ds_HQJuE437ufadPFLk,3258
19
19
  metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=pOWwm8LFQBbtku0zNBBwCyXxLK8U-hhC4naQcmU69nE,6217
20
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=zOIDbN4PhRM2VMHczfoAHUeo1df2UrqWMgTwcppsTwc,8990
20
+ metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=sfWaKZyKuM02v5DujPdfLbm-WoecxHfGn8g432Roct4,10273
21
21
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=5pLEekiw3krlwpcjfjjfUL-URep6soZgmfTqtzLz4Vo,9362
22
22
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=yGv_6EmrBZNiQQP0rEWWE3akAL-KfI3Wd4ZFrcgl3VQ,8663
23
23
  metaflow_extensions/outerbounds/plugins/nvcf/utils.py,sha256=DxWSCayfa95e0HJkWacey1s1nxoTpaunGhrb_0Ayv28,133
@@ -33,7 +33,7 @@ metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py,sha256=ezJ2Jr8J
33
33
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py,sha256=JEW0EUxj_mNZXo9OFkJFmWfg-P7_CEgvNbgsMTCBTAE,4273
34
34
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py,sha256=a7LqSKULVh8IrR1StrVPbemHOLojR0nEqh-mMX-M1i4,9904
35
35
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py,sha256=FTfYlJu-sn9DkPOs2R1V1ChWb1vZthOgeq0BZdT1ucY,296
36
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py,sha256=d_5UhXqZ_12rCvatH1capPQZYGLx1FVqq_rtW65OXyk,6874
36
+ metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py,sha256=aQphxX6jqYgfa83w387pEWl0keuLm38V53I8P8UL2ck,6887
37
37
  metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py,sha256=AI_kcm1hZV3JRxJkookcH6twiGnAYjk9Dx-MeoYz60Y,8511
38
38
  metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py,sha256=9lUM4Cqi5RjrHBRfG6AQMRz8-R96eZC8Ih0KD2lv22Y,1858
39
39
  metaflow_extensions/outerbounds/profilers/__init__.py,sha256=wa_jhnCBr82TBxoS0e8b6_6sLyZX0fdHicuGJZNTqKw,29
@@ -44,7 +44,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2
44
44
  metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
45
45
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
46
46
  metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
47
- ob_metaflow_extensions-1.1.122.dist-info/METADATA,sha256=cSdliiwbdi-0Cr7GI4nuQfRQyGfmdrXuU3BPR3UJQf0,520
48
- ob_metaflow_extensions-1.1.122.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
49
- ob_metaflow_extensions-1.1.122.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
50
- ob_metaflow_extensions-1.1.122.dist-info/RECORD,,
47
+ ob_metaflow_extensions-1.1.123.dist-info/METADATA,sha256=JVisSFX7kwYfSyPmziBWAVYI87W1R7nmS5AJ2_AFtfQ,520
48
+ ob_metaflow_extensions-1.1.123.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
49
+ ob_metaflow_extensions-1.1.123.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
50
+ ob_metaflow_extensions-1.1.123.dist-info/RECORD,,