ob-metaflow-extensions 1.1.144__py2.py3-none-any.whl → 1.1.146__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic.
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +86 -56
- {ob_metaflow_extensions-1.1.144.dist-info → ob_metaflow_extensions-1.1.146.dist-info}/METADATA +1 -1
- {ob_metaflow_extensions-1.1.144.dist-info → ob_metaflow_extensions-1.1.146.dist-info}/RECORD +5 -5
- {ob_metaflow_extensions-1.1.144.dist-info → ob_metaflow_extensions-1.1.146.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.144.dist-info → ob_metaflow_extensions-1.1.146.dist-info}/top_level.txt +0 -0
metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py
CHANGED
@@ -153,7 +153,7 @@ class Nvcf(object):
 
     def wait(self, stdout_location, stderr_location, echo=None):
         def wait_for_launch(job):
-            status = job.
+            status = job._status
             echo(
                 "Task status: %s..." % status,
                 "stderr",
@@ -196,6 +196,8 @@ class JobStatus(object):
     DISAPPEARED = "DISAPPEARED"  # Job disappeared from NVCF but was previously polled (likely successful)
 
 
+terminal_states = [JobStatus.SUCCESSFUL, JobStatus.FAILED, JobStatus.DISAPPEARED]
+
 nvcf_url = "https://api.nvcf.nvidia.com"
 submit_endpoint = f"{nvcf_url}/v2/nvcf/pexec/functions"
 result_endpoint = f"{nvcf_url}/v2/nvcf/pexec/status"
@@ -218,7 +220,10 @@ class Job(object):
         # Initialize status and tracking variables
         self._status = JobStatus.CREATED
         self._last_poll_time = time.time()
-
+
+        # State tracking for long polling
+        self._long_polling_active = False
+        self._poll_response = None
 
         flow_name = task_spec.get("flow_name")
         run_id = task_spec.get("run_id")
@@ -279,62 +284,31 @@ class Job(object):
                 self._result = data
             elif response.getcode() == 202:
                 self._status = JobStatus.SUBMITTED
+                # Start long polling immediately after receiving 202
+                self._start_long_polling()
             else:
                 self._status = JobStatus.FAILED
         except URLError:
             self._status = JobStatus.FAILED
             raise
 
-
-
-
-
-
-
-
-
-        # If status is already terminal, don't poll again
-        if self._status in terminal_states:
-            return self._status
-
-        current_time = time.time()
-        if (
-            current_time - self._last_poll_time > self._force_poll_interval
-            or self._status not in terminal_states
-        ):
+    def _start_long_polling(self):
+        if not self._long_polling_active:
+            self._long_polling_active = True
+            polling_thread = threading.Thread(target=self._long_poll_loop, daemon=True)
+            polling_thread.start()
+
+    def _long_poll_loop(self):
+        while self._long_polling_active and self._status not in terminal_states:
             try:
                 self._poll()
-                self.
+                # No sleep needed - the request itself will block for up to self._poll_seconds
+            except Exception as e:
+                print(f"[@nvidia] Long polling error: {e}")
+                # Brief pause before retry on error
+                time.sleep(1)
 
-
-                if self._status == JobStatus.SUBMITTED:
-                    self._status = JobStatus.POLLED
-
-                if self._status == JobStatus.SUCCESSFUL:
-                    return self._status
-
-            except HTTPError as e:
-                if e.code == 404:
-                    # 404 interpretation depends on job lifecycle
-                    if self._status in [JobStatus.POLLED, JobStatus.SUBMITTED]:
-                        # We've submitted or successfully polled this job before,
-                        # so a 404 likely means it completed and was removed
-                        self._status = JobStatus.DISAPPEARED
-                        self._result = {"exit_code": 0}
-                    else:
-                        # Job was never successfully tracked
-                        print(
-                            f"[@nvidia] 404 received for job that was never successfully tracked - treating as failure"
-                        )
-                        self._status = JobStatus.FAILED
-                        raise NvcfPollingConnectionError(e)
-                else:
-                    self._status = JobStatus.FAILED
-                    raise NvcfPollingConnectionError(e)
-            except URLError as e:
-                self._status = JobStatus.FAILED
-                raise NvcfPollingConnectionError(e)
-        return self._status
+        self._long_polling_active = False
 
     @property
     def id(self):
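The heart of this change is the long-polling pattern: instead of callers re-invoking a status check on an interval, a daemon thread re-issues a blocking poll request until the job reaches a terminal state. A minimal standalone sketch of the pattern follows; it is illustrative only, not the package's code, and fetch_status is a hypothetical stand-in for the blocking HTTP poll.

import threading
import time

TERMINAL_STATES = {"SUCCESSFUL", "FAILED", "DISAPPEARED"}  # mirrors terminal_states above


class LongPollingJob:
    def __init__(self, fetch_status):
        # fetch_status stands in for the poll request; the server is expected
        # to hold the request open (block) for up to the long-poll window.
        self._fetch_status = fetch_status
        self._status = "SUBMITTED"
        self._polling_active = False

    def start_long_polling(self):
        # Idempotent: at most one background poller per job.
        if not self._polling_active:
            self._polling_active = True
            threading.Thread(target=self._loop, daemon=True).start()

    def _loop(self):
        while self._polling_active and self._status not in TERMINAL_STATES:
            try:
                # The request itself blocks, so no sleep is needed on success.
                self._status = self._fetch_status()
            except Exception as exc:
                print(f"poll error: {exc}")
                time.sleep(1)  # brief pause before retrying on error
        self._polling_active = False

Because the thread is a daemon it cannot keep the interpreter alive on its own, so the foreground wait() loop in Nvcf still governs process lifetime.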
@@ -343,11 +317,11 @@ class Job(object):
     @property
     def is_running(self):
         # Job is running if it's in SUBMITTED or POLLED state
-        return self.
+        return self._status in [JobStatus.SUBMITTED, JobStatus.POLLED]
 
     @property
     def has_failed(self):
-        return self.
+        return self._status == JobStatus.FAILED
 
     @property
     def result(self):
@@ -357,6 +331,14 @@ class Job(object):
     @retry_on_status(status_codes=[504])
     def _poll(self):
         try:
+            # Implement rate limiting to prevent more than 1 request per second
+            current_time = time.time()
+            if (
+                hasattr(self, "_last_poll_time")
+                and current_time - self._last_poll_time < 1
+            ):
+                time.sleep(1 - (current_time - self._last_poll_time))
+
             headers = {
                 "Authorization": f"Bearer {self._ngc_api_key}",
                 "Content-Type": "application/json",
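The added block is client-side rate limiting by timestamp arithmetic: if less than a second has elapsed since the previous poll, sleep for the remainder of the interval. The same logic, extracted into a standalone helper (names here are illustrative, not from the package):

import time


class MinIntervalLimiter:
    """Block so that successive calls are at least `interval` seconds apart."""

    def __init__(self, interval=1.0):
        self._interval = interval
        self._last_call = None

    def wait(self):
        now = time.time()
        if self._last_call is not None and now - self._last_call < self._interval:
            # Sleep only for the remaining fraction of the interval.
            time.sleep(self._interval - (now - self._last_call))
        # Record the time just before the guarded operation runs.
        self._last_call = time.time()


limiter = MinIntervalLimiter(interval=1.0)
for _ in range(3):
    limiter.wait()  # iterations are spaced at least one second apart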
@@ -366,22 +348,70 @@ class Job(object):
             request = Request(
                 f"{result_endpoint}/{self._invocation_id}", headers=headers
             )
+
+            # Record time before making the request
+            self._last_poll_time = time.time()
+
             response = urlopen(request)
             body = response.read()
-            print(f"[@nvidia] polling
+            print(f"[@nvidia] polling status code: {response.getcode()}")
+
             if response.getcode() == 200:
                 data = json.loads(body)
-                # TODO: Propagate the internal error forward
                 if data.get("exit_code") == 0:
                     self._status = JobStatus.SUCCESSFUL
                 else:
                     self._status = JobStatus.FAILED
                 self._result = data
-
+                self._long_polling_active = False  # Stop polling once job completes
+            elif response.getcode() == 202:
+                # Job is still running - status remains SUBMITTED or POLLED
+                if self._status == JobStatus.SUBMITTED:
+                    self._status = JobStatus.POLLED
+            elif response.getcode() == 302:
+                # Handle redirects for large responses or requests in different regions
+                redirect_location = response.headers.get("Location")
+                if redirect_location:
+                    redirect_request = Request(redirect_location, headers=headers)
+                    redirect_response = urlopen(redirect_request)
+                    if redirect_response.getcode() == 200:
+                        data = json.loads(redirect_response.read())
+                        if data.get("exit_code") == 0:
+                            self._status = JobStatus.SUCCESSFUL
+                        else:
+                            self._status = JobStatus.FAILED
+                        self._result = data
+                        self._long_polling_active = False
             else:
                 print(
                     f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
                 )
                 self._status = JobStatus.FAILED
-
+
+        except HTTPError as e:
+            if e.code == 404:
+                # 404 interpretation depends on job lifecycle
+                if self._status in [JobStatus.POLLED, JobStatus.SUBMITTED]:
+                    # We've submitted or successfully polled this job before,
+                    # so a 404 likely means it completed and was removed
+                    self._status = JobStatus.DISAPPEARED
+                    self._result = {"exit_code": 0}
+                    print(
+                        f"[@nvidia] 404 received for job that was previously tracked - assuming job completed"
+                    )
+                else:
+                    # Job was never successfully tracked
+                    print(
+                        f"[@nvidia] 404 received for job that was never successfully tracked - treating as failure"
+                    )
+                    self._status = JobStatus.FAILED
+                    raise NvcfPollingConnectionError(e)
+            elif e.code in [500, 504]:
+                # Don't set status to FAILED, just re-raise for retry decorator
+                raise
+            else:
+                self._status = JobStatus.FAILED
+                raise NvcfPollingConnectionError(e)
+        except URLError as e:
             self._status = JobStatus.FAILED
-            raise
+            raise NvcfPollingConnectionError(e)
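_poll is wrapped in @retry_on_status(status_codes=[504]), and the new HTTPError handler deliberately re-raises 500 and 504 without marking the job FAILED so that the decorator can retry them. The decorator's implementation is not part of this diff; the following is a hypothetical sketch of how such a decorator could look, not the package's actual code:

import time
from functools import wraps
from urllib.error import HTTPError


def retry_on_status(status_codes, retries=3, backoff=1.0):
    """Retry the wrapped call when it raises HTTPError with a listed status code.

    Hypothetical sketch; the real decorator in the package may differ.
    """
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(retries):
                try:
                    return func(*args, **kwargs)
                except HTTPError as e:
                    if e.code not in status_codes or attempt == retries - 1:
                        raise  # non-retryable code, or retries exhausted
                    time.sleep(backoff * (attempt + 1))  # linear backoff
        return wrapper
    return decorator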
{ob_metaflow_extensions-1.1.144.dist-info → ob_metaflow_extensions-1.1.146.dist-info}/RECORD
RENAMED
@@ -28,7 +28,7 @@ metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TI
 metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=aGHdNw_hqBu8i0zWXcatQM6e769wUXox0l8g0f6fNZ8,146
 metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256=-Pm9cOWUzpv94TvVUeq-FenAWdfLBJd5N7WPqIGZVqU,3671
 metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=pOWwm8LFQBbtku0zNBBwCyXxLK8U-hhC4naQcmU69nE,6217
-metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=
+metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=3ZFdYItVpFWnHMOeyV1nslUyelfvX5rknh2d2IWxVws,15591
 metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=3D-r5XO88Yh2k1EAZFJTe_PwdbhWp5qXflG8AgE4ZIU,9500
 metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=pxxNxW4bW3kbB6ybRam657GyKIhvIkMuidB94iFfCD8,9116
 metaflow_extensions/outerbounds/plugins/nvcf/utils.py,sha256=DxWSCayfa95e0HJkWacey1s1nxoTpaunGhrb_0Ayv28,133
@@ -58,7 +58,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3u
 metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
 metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2zwqkvlmFS6bcfYD_CX6CMko9DHQokMaH1iBshA,47
 metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
-ob_metaflow_extensions-1.1.
-ob_metaflow_extensions-1.1.
-ob_metaflow_extensions-1.1.
-ob_metaflow_extensions-1.1.
+ob_metaflow_extensions-1.1.146.dist-info/METADATA,sha256=Vtp-8yJaPeg8Hh33WXQYAerLoLlx5Ldya5M2U1lp5z0,520
+ob_metaflow_extensions-1.1.146.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
+ob_metaflow_extensions-1.1.146.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
+ob_metaflow_extensions-1.1.146.dist-info/RECORD,,
{ob_metaflow_extensions-1.1.144.dist-info → ob_metaflow_extensions-1.1.146.dist-info}/WHEEL
RENAMED
File without changes

{ob_metaflow_extensions-1.1.144.dist-info → ob_metaflow_extensions-1.1.146.dist-info}/top_level.txt
RENAMED
File without changes