ob-metaflow-extensions 1.1.144__py2.py3-none-any.whl → 1.1.146__py2.py3-none-any.whl

This diff shows the differences between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -153,7 +153,7 @@ class Nvcf(object):
153
153
 
154
154
  def wait(self, stdout_location, stderr_location, echo=None):
155
155
  def wait_for_launch(job):
156
- status = job.status
156
+ status = job._status
157
157
  echo(
158
158
  "Task status: %s..." % status,
159
159
  "stderr",
@@ -196,6 +196,8 @@ class JobStatus(object):
196
196
  DISAPPEARED = "DISAPPEARED" # Job disappeared from NVCF but was previously polled (likely successful)
197
197
 
198
198
 
199
+ terminal_states = [JobStatus.SUCCESSFUL, JobStatus.FAILED, JobStatus.DISAPPEARED]
200
+
199
201
  nvcf_url = "https://api.nvcf.nvidia.com"
200
202
  submit_endpoint = f"{nvcf_url}/v2/nvcf/pexec/functions"
201
203
  result_endpoint = f"{nvcf_url}/v2/nvcf/pexec/status"
@@ -218,7 +220,10 @@ class Job(object):
218
220
  # Initialize status and tracking variables
219
221
  self._status = JobStatus.CREATED
220
222
  self._last_poll_time = time.time()
221
- self._force_poll_interval = 30
223
+
224
+ # State tracking for long polling
225
+ self._long_polling_active = False
226
+ self._poll_response = None
222
227
 
223
228
  flow_name = task_spec.get("flow_name")
224
229
  run_id = task_spec.get("run_id")
@@ -279,62 +284,31 @@ class Job(object):
279
284
  self._result = data
280
285
  elif response.getcode() == 202:
281
286
  self._status = JobStatus.SUBMITTED
287
+ # Start long polling immediately after receiving 202
288
+ self._start_long_polling()
282
289
  else:
283
290
  self._status = JobStatus.FAILED
284
291
  except URLError:
285
292
  self._status = JobStatus.FAILED
286
293
  raise
287
294
 
288
- @property
289
- def status(self):
290
- terminal_states = [
291
- JobStatus.SUCCESSFUL,
292
- JobStatus.FAILED,
293
- JobStatus.DISAPPEARED,
294
- ]
295
-
296
- # If status is already terminal, don't poll again
297
- if self._status in terminal_states:
298
- return self._status
299
-
300
- current_time = time.time()
301
- if (
302
- current_time - self._last_poll_time > self._force_poll_interval
303
- or self._status not in terminal_states
304
- ):
295
+ def _start_long_polling(self):
296
+ if not self._long_polling_active:
297
+ self._long_polling_active = True
298
+ polling_thread = threading.Thread(target=self._long_poll_loop, daemon=True)
299
+ polling_thread.start()
300
+
301
+ def _long_poll_loop(self):
302
+ while self._long_polling_active and self._status not in terminal_states:
305
303
  try:
306
304
  self._poll()
307
- self._last_poll_time = current_time
305
+ # No sleep needed - the request itself will block for up to self._poll_seconds
306
+ except Exception as e:
307
+ print(f"[@nvidia] Long polling error: {e}")
308
+ # Brief pause before retry on error
309
+ time.sleep(1)
308
310
 
309
- # Update job status to POLLED if this is our first successful poll
310
- if self._status == JobStatus.SUBMITTED:
311
- self._status = JobStatus.POLLED
312
-
313
- if self._status == JobStatus.SUCCESSFUL:
314
- return self._status
315
-
316
- except HTTPError as e:
317
- if e.code == 404:
318
- # 404 interpretation depends on job lifecycle
319
- if self._status in [JobStatus.POLLED, JobStatus.SUBMITTED]:
320
- # We've submitted or successfully polled this job before,
321
- # so a 404 likely means it completed and was removed
322
- self._status = JobStatus.DISAPPEARED
323
- self._result = {"exit_code": 0}
324
- else:
325
- # Job was never successfully tracked
326
- print(
327
- f"[@nvidia] 404 received for job that was never successfully tracked - treating as failure"
328
- )
329
- self._status = JobStatus.FAILED
330
- raise NvcfPollingConnectionError(e)
331
- else:
332
- self._status = JobStatus.FAILED
333
- raise NvcfPollingConnectionError(e)
334
- except URLError as e:
335
- self._status = JobStatus.FAILED
336
- raise NvcfPollingConnectionError(e)
337
- return self._status
311
+ self._long_polling_active = False
338
312
 
339
313
  @property
340
314
  def id(self):
@@ -343,11 +317,11 @@ class Job(object):
343
317
  @property
344
318
  def is_running(self):
345
319
  # Job is running if it's in SUBMITTED or POLLED state
346
- return self.status in [JobStatus.SUBMITTED, JobStatus.POLLED]
320
+ return self._status in [JobStatus.SUBMITTED, JobStatus.POLLED]
347
321
 
348
322
  @property
349
323
  def has_failed(self):
350
- return self.status == JobStatus.FAILED
324
+ return self._status == JobStatus.FAILED
351
325
 
352
326
  @property
353
327
  def result(self):
@@ -357,6 +331,14 @@ class Job(object):
357
331
  @retry_on_status(status_codes=[504])
358
332
  def _poll(self):
359
333
  try:
334
+ # Implement rate limiting to prevent more than 1 request per second
335
+ current_time = time.time()
336
+ if (
337
+ hasattr(self, "_last_poll_time")
338
+ and current_time - self._last_poll_time < 1
339
+ ):
340
+ time.sleep(1 - (current_time - self._last_poll_time))
341
+
360
342
  headers = {
361
343
  "Authorization": f"Bearer {self._ngc_api_key}",
362
344
  "Content-Type": "application/json",
@@ -366,22 +348,70 @@ class Job(object):
366
348
  request = Request(
367
349
  f"{result_endpoint}/{self._invocation_id}", headers=headers
368
350
  )
351
+
352
+ # Record time before making the request
353
+ self._last_poll_time = time.time()
354
+
369
355
  response = urlopen(request)
370
356
  body = response.read()
371
- print(f"[@nvidia] polling response: {body}")
357
+ print(f"[@nvidia] polling status code: {response.getcode()}")
358
+
372
359
  if response.getcode() == 200:
373
360
  data = json.loads(body)
374
- # TODO: Propagate the internal error forward
375
361
  if data.get("exit_code") == 0:
376
362
  self._status = JobStatus.SUCCESSFUL
377
363
  else:
378
364
  self._status = JobStatus.FAILED
379
365
  self._result = data
380
- elif response.getcode() != 202:
366
+ self._long_polling_active = False # Stop polling once job completes
367
+ elif response.getcode() == 202:
368
+ # Job is still running - status remains SUBMITTED or POLLED
369
+ if self._status == JobStatus.SUBMITTED:
370
+ self._status = JobStatus.POLLED
371
+ elif response.getcode() == 302:
372
+ # Handle redirects for large responses or requests in different regions
373
+ redirect_location = response.headers.get("Location")
374
+ if redirect_location:
375
+ redirect_request = Request(redirect_location, headers=headers)
376
+ redirect_response = urlopen(redirect_request)
377
+ if redirect_response.getcode() == 200:
378
+ data = json.loads(redirect_response.read())
379
+ if data.get("exit_code") == 0:
380
+ self._status = JobStatus.SUCCESSFUL
381
+ else:
382
+ self._status = JobStatus.FAILED
383
+ self._result = data
384
+ self._long_polling_active = False
385
+ else:
381
386
  print(
382
387
  f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
383
388
  )
384
389
  self._status = JobStatus.FAILED
385
- except URLError:
390
+
391
+ except HTTPError as e:
392
+ if e.code == 404:
393
+ # 404 interpretation depends on job lifecycle
394
+ if self._status in [JobStatus.POLLED, JobStatus.SUBMITTED]:
395
+ # We've submitted or successfully polled this job before,
396
+ # so a 404 likely means it completed and was removed
397
+ self._status = JobStatus.DISAPPEARED
398
+ self._result = {"exit_code": 0}
399
+ print(
400
+ f"[@nvidia] 404 received for job that was previously tracked - assuming job completed"
401
+ )
402
+ else:
403
+ # Job was never successfully tracked
404
+ print(
405
+ f"[@nvidia] 404 received for job that was never successfully tracked - treating as failure"
406
+ )
407
+ self._status = JobStatus.FAILED
408
+ raise NvcfPollingConnectionError(e)
409
+ elif e.code in [500, 504]:
410
+ # Don't set status to FAILED, just re-raise for retry decorator
411
+ raise
412
+ else:
413
+ self._status = JobStatus.FAILED
414
+ raise NvcfPollingConnectionError(e)
415
+ except URLError as e:
386
416
  self._status = JobStatus.FAILED
387
- raise
417
+ raise NvcfPollingConnectionError(e)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.144
3
+ Version: 1.1.146
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
@@ -28,7 +28,7 @@ metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TI
28
28
  metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=aGHdNw_hqBu8i0zWXcatQM6e769wUXox0l8g0f6fNZ8,146
29
29
  metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256=-Pm9cOWUzpv94TvVUeq-FenAWdfLBJd5N7WPqIGZVqU,3671
30
30
  metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=pOWwm8LFQBbtku0zNBBwCyXxLK8U-hhC4naQcmU69nE,6217
31
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=pDnd30uGyBLgIMROI0ClN0wAaxvt2XrbBbc0QIZ9LmE,13872
31
+ metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=3ZFdYItVpFWnHMOeyV1nslUyelfvX5rknh2d2IWxVws,15591
32
32
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=3D-r5XO88Yh2k1EAZFJTe_PwdbhWp5qXflG8AgE4ZIU,9500
33
33
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=pxxNxW4bW3kbB6ybRam657GyKIhvIkMuidB94iFfCD8,9116
34
34
  metaflow_extensions/outerbounds/plugins/nvcf/utils.py,sha256=DxWSCayfa95e0HJkWacey1s1nxoTpaunGhrb_0Ayv28,133
@@ -58,7 +58,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3u
58
58
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
59
59
  metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2zwqkvlmFS6bcfYD_CX6CMko9DHQokMaH1iBshA,47
60
60
  metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
61
- ob_metaflow_extensions-1.1.144.dist-info/METADATA,sha256=zphrWhng37KADGpWv9zJHp4jZBfivvktH_a07HwU-i0,520
62
- ob_metaflow_extensions-1.1.144.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
63
- ob_metaflow_extensions-1.1.144.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
64
- ob_metaflow_extensions-1.1.144.dist-info/RECORD,,
61
+ ob_metaflow_extensions-1.1.146.dist-info/METADATA,sha256=Vtp-8yJaPeg8Hh33WXQYAerLoLlx5Ldya5M2U1lp5z0,520
62
+ ob_metaflow_extensions-1.1.146.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
63
+ ob_metaflow_extensions-1.1.146.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
64
+ ob_metaflow_extensions-1.1.146.dist-info/RECORD,,