ob-metaflow-extensions 1.1.130__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (105)
  1. metaflow_extensions/outerbounds/__init__.py +1 -1
  2. metaflow_extensions/outerbounds/plugins/__init__.py +34 -4
  3. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  4. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  6. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  7. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  35. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  36. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  37. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  38. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  39. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  40. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +1 -1
  41. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  42. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  43. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  44. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  45. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  46. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +43 -9
  47. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +12 -0
  48. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
  49. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  50. metaflow_extensions/outerbounds/plugins/nim/card.py +2 -16
  51. metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
  52. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
  53. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  54. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
  55. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +100 -19
  56. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +6 -1
  57. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  58. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  59. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  60. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  61. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  62. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  63. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  64. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  65. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  66. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  67. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  68. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  69. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  70. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  71. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  72. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  73. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  74. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  75. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  76. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  77. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  78. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  79. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +38 -2
  80. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +81 -11
  81. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +18 -8
  82. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +6 -0
  83. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +45 -18
  84. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +18 -9
  85. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +10 -4
  86. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  87. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  88. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  89. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  90. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  91. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  92. metaflow_extensions/outerbounds/remote_config.py +46 -9
  93. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +94 -2
  94. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  95. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  96. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  97. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  98. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  99. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  100. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  101. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  102. metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
  103. ob_metaflow_extensions-1.1.130.dist-info/RECORD +0 -56
  104. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  105. {ob_metaflow_extensions-1.1.130.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -153,7 +153,7 @@ class Nvcf(object):
153
153
 
154
154
  def wait(self, stdout_location, stderr_location, echo=None):
155
155
  def wait_for_launch(job):
156
- status = job.status
156
+ status = job._status
157
157
  echo(
158
158
  "Task status: %s..." % status,
159
159
  "stderr",
@@ -188,12 +188,16 @@ class Nvcf(object):
188
188
 
189
189
 
190
190
  class JobStatus(object):
191
- SUBMITTED = "SUBMITTED"
192
- RUNNING = "RUNNING"
193
- SUCCESSFUL = "SUCCESSFUL"
194
- FAILED = "FAILED"
191
+ CREATED = "CREATED" # Job object created but not submitted
192
+ SUBMITTED = "SUBMITTED" # Job submitted to NVCF
193
+ POLLED = "POLLED" # Job has been successfully polled at least once
194
+ SUCCESSFUL = "SUCCESSFUL" # Job completed successfully
195
+ FAILED = "FAILED" # Job failed
196
+ DISAPPEARED = "DISAPPEARED" # Job disappeared from NVCF but was previously polled (likely successful)
195
197
 
196
198
 
199
+ terminal_states = [JobStatus.SUCCESSFUL, JobStatus.FAILED, JobStatus.DISAPPEARED]
200
+
197
201
  nvcf_url = "https://api.nvcf.nvidia.com"
198
202
  submit_endpoint = f"{nvcf_url}/v2/nvcf/pexec/functions"
199
203
  result_endpoint = f"{nvcf_url}/v2/nvcf/pexec/status"
@@ -213,6 +217,14 @@ class Job(object):
213
217
  self._queue_timeout = queue_timeout
214
218
  self._poll_seconds = "3600"
215
219
 
220
+ # Initialize status and tracking variables
221
+ self._status = JobStatus.CREATED
222
+ self._last_poll_time = time.time()
223
+
224
+ # State tracking for long polling
225
+ self._long_polling_active = False
226
+ self._poll_response = None
227
+
216
228
  flow_name = task_spec.get("flow_name")
217
229
  run_id = task_spec.get("run_id")
218
230
  step_name = task_spec.get("step_name")
@@ -272,21 +284,31 @@ class Job(object):
272
284
  self._result = data
273
285
  elif response.getcode() == 202:
274
286
  self._status = JobStatus.SUBMITTED
287
+ # Start long polling immediately after receiving 202
288
+ self._start_long_polling()
275
289
  else:
276
290
  self._status = JobStatus.FAILED
277
291
  except URLError:
278
292
  self._status = JobStatus.FAILED
279
293
  raise
280
294
 
281
- @property
282
- def status(self):
283
- if self._status not in [JobStatus.SUCCESSFUL, JobStatus.FAILED]:
295
+ def _start_long_polling(self):
296
+ if not self._long_polling_active:
297
+ self._long_polling_active = True
298
+ polling_thread = threading.Thread(target=self._long_poll_loop, daemon=True)
299
+ polling_thread.start()
300
+
301
+ def _long_poll_loop(self):
302
+ while self._long_polling_active and self._status not in terminal_states:
284
303
  try:
285
304
  self._poll()
286
- except (HTTPError, URLError) as e:
287
- self._status = JobStatus.FAILED
288
- raise NvcfPollingConnectionError(e)
289
- return self._status
305
+ # No sleep needed - the request itself will block for up to self._poll_seconds
306
+ except Exception as e:
307
+ print(f"[@nvidia] Long polling error: {e}")
308
+ # Brief pause before retry on error
309
+ time.sleep(1)
310
+
311
+ self._long_polling_active = False
290
312
 
291
313
  @property
292
314
  def id(self):
@@ -294,11 +316,12 @@ class Job(object):
294
316
 
295
317
  @property
296
318
  def is_running(self):
297
- return self.status == JobStatus.SUBMITTED
319
+ # Job is running if it's in SUBMITTED or POLLED state
320
+ return self._status in [JobStatus.SUBMITTED, JobStatus.POLLED]
298
321
 
299
322
  @property
300
323
  def has_failed(self):
301
- return self.status == JobStatus.FAILED
324
+ return self._status == JobStatus.FAILED
302
325
 
303
326
  @property
304
327
  def result(self):
@@ -308,6 +331,14 @@ class Job(object):
308
331
  @retry_on_status(status_codes=[504])
309
332
  def _poll(self):
310
333
  try:
334
+ # Implement rate limiting to prevent more than 1 request per second
335
+ current_time = time.time()
336
+ if (
337
+ hasattr(self, "_last_poll_time")
338
+ and current_time - self._last_poll_time < 1
339
+ ):
340
+ time.sleep(1 - (current_time - self._last_poll_time))
341
+
311
342
  headers = {
312
343
  "Authorization": f"Bearer {self._ngc_api_key}",
313
344
  "Content-Type": "application/json",
@@ -317,20 +348,70 @@ class Job(object):
317
348
  request = Request(
318
349
  f"{result_endpoint}/{self._invocation_id}", headers=headers
319
350
  )
351
+
352
+ # Record time before making the request
353
+ self._last_poll_time = time.time()
354
+
320
355
  response = urlopen(request)
356
+ body = response.read()
357
+ print(f"[@nvidia] polling status code: {response.getcode()}")
358
+
321
359
  if response.getcode() == 200:
322
- data = json.loads(response.read())
323
- # TODO: Propagate the internal error forward
360
+ data = json.loads(body)
324
361
  if data.get("exit_code") == 0:
325
362
  self._status = JobStatus.SUCCESSFUL
326
363
  else:
327
364
  self._status = JobStatus.FAILED
328
365
  self._result = data
329
- elif response.getcode() != 202:
366
+ self._long_polling_active = False # Stop polling once job completes
367
+ elif response.getcode() == 202:
368
+ # Job is still running - status remains SUBMITTED or POLLED
369
+ if self._status == JobStatus.SUBMITTED:
370
+ self._status = JobStatus.POLLED
371
+ elif response.getcode() == 302:
372
+ # Handle redirects for large responses or requests in different regions
373
+ redirect_location = response.headers.get("Location")
374
+ if redirect_location:
375
+ redirect_request = Request(redirect_location, headers=headers)
376
+ redirect_response = urlopen(redirect_request)
377
+ if redirect_response.getcode() == 200:
378
+ data = json.loads(redirect_response.read())
379
+ if data.get("exit_code") == 0:
380
+ self._status = JobStatus.SUCCESSFUL
381
+ else:
382
+ self._status = JobStatus.FAILED
383
+ self._result = data
384
+ self._long_polling_active = False
385
+ else:
330
386
  print(
331
387
  f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
332
388
  )
333
389
  self._status = JobStatus.FAILED
334
- except URLError:
390
+
391
+ except HTTPError as e:
392
+ if e.code == 404:
393
+ # 404 interpretation depends on job lifecycle
394
+ if self._status in [JobStatus.POLLED, JobStatus.SUBMITTED]:
395
+ # We've submitted or successfully polled this job before,
396
+ # so a 404 likely means it completed and was removed
397
+ self._status = JobStatus.DISAPPEARED
398
+ self._result = {"exit_code": 0}
399
+ print(
400
+ f"[@nvidia] 404 received for job that was previously tracked - assuming job completed"
401
+ )
402
+ else:
403
+ # Job was never successfully tracked
404
+ print(
405
+ f"[@nvidia] 404 received for job that was never successfully tracked - treating as failure"
406
+ )
407
+ self._status = JobStatus.FAILED
408
+ raise NvcfPollingConnectionError(e)
409
+ elif e.code in [500, 504]:
410
+ # Don't set status to FAILED, just re-raise for retry decorator
411
+ raise
412
+ else:
413
+ self._status = JobStatus.FAILED
414
+ raise NvcfPollingConnectionError(e)
415
+ except URLError as e:
335
416
  self._status = JobStatus.FAILED
336
- raise
417
+ raise NvcfPollingConnectionError(e)
@@ -198,7 +198,12 @@ class NvcfDecorator(StepDecorator):
198
198
  meta["nvcf-nspectid"] = os.environ.get("NVCF_NSPECTID")
199
199
 
200
200
  entries = [
201
- MetaDatum(field=k, value=v, type=k, tags=[])
201
+ MetaDatum(
202
+ field=k,
203
+ value=v,
204
+ type=k,
205
+ tags=["attempt_id:{0}".format(retry_count)],
206
+ )
202
207
  for k, v in meta.items()
203
208
  if v is not None
204
209
  ]
@@ -0,0 +1,71 @@
1
+ from metaflow.exception import MetaflowException
2
+
3
+
4
class NvctExecutionException(MetaflowException):
    """Metaflow exception reporting that an Nvct task could not be executed."""

    # Short summary attached to the error; the detailed message comes from the raiser.
    headline = "Nvct task couldn't be executed"
6
+
7
+
8
class NvctTaskFailedException(MetaflowException):
    """Metaflow exception reporting that an Nvct task failed."""

    # Short summary attached to the error; the detailed message comes from the raiser.
    headline = "Nvct task failed"
10
+
11
+
12
class NvctKilledException(MetaflowException):
    """Metaflow exception reporting that an Nvct job was killed."""

    # Short summary attached to the error; the detailed message comes from the raiser.
    headline = "Nvct job killed"
14
+
15
+
16
class RequestedGPUTypeUnavailableException(MetaflowException):
    """Report that the ``gpu_type`` passed to ``@nvct`` is not available.

    Args:
        requested_gpu_type: The GPU type the user asked for.
        available_gpus: The supported GPU types; interpolated into the
            message exactly as given.
    """

    headline = "[@nvct RequestedGPUTypeUnavailableException] GPU type unavailable."

    def __init__(self, requested_gpu_type, available_gpus):
        # First sentence names the rejected value, second lists the alternatives.
        problem = f"The requested GPU type @nvct(..., gpu_type='{requested_gpu_type}') is not available. "
        remedy = f"Please choose from the following supported GPU types when using @nvct: {available_gpus}"
        super().__init__(problem + remedy)
25
+
26
+
27
class UnsupportedNvctConfigurationException(MetaflowException):
    """Report that a requested ``@nvct`` GPU count / GPU type pair is not offered.

    Args:
        n_gpu: Number of GPUs that was requested.
        gpu_type: GPU type that was requested.
        available_configurations: Sized iterable of mappings, each carrying an
            ``"n_gpus"`` entry, describing what can be requested for ``gpu_type``.
        step: Name of the step carrying the decorator.
    """

    headline = (
        "[@nvct UnsupportedNvctConfigurationException] Unsupported GPU configuration"
    )

    def __init__(self, n_gpu, gpu_type, available_configurations, step):
        pieces = [
            f"The requested configuration of @nvct(gpu={n_gpu}, gpu_type='{gpu_type}') for @step {step} is not available."
        ]
        if len(available_configurations) > 0:
            # Show the decorator spellings that would be accepted instead.
            pieces.append(
                f"\n\nAvailable configurations for your deployment with {gpu_type} include: \n\t- {self._display(gpu_type, available_configurations)}"
            )
            pieces.append(
                "\n\nPlease contact Outerbounds support if you wish to use a configuration not listed above."
            )
        else:
            pieces.append(
                "\n\nNo configurations are available in your Outerbounds deployment."
                " Please contact Outerbounds support if you wish to use @nvct."
            )
        super().__init__("".join(pieces))

    def _display(self, gpu_type, configs):
        """Render each configuration as an ``@nvct(...)`` call, one bullet per entry."""
        rendered = [
            f"@nvct(gpu={cfg['n_gpus']}, gpu_type='{gpu_type}')" for cfg in configs
        ]
        return "\n\t- ".join(rendered)
50
+
51
+
52
class UnsupportedNvctDatastoreException(MetaflowException):
    """Report that ``@nvct`` was used with a datastore other than s3, azure or gs.

    Args:
        ds_type: The datastore type currently configured; echoed in the message.
    """

    headline = "[@nvct UnsupportedNvctDatastoreException] Unsupported datastore"

    def __init__(self, ds_type):
        # The two literals are concatenated by the parser, so the separating
        # space must be written explicitly; without it the sentences run
        # together as "at the moment.Current datastore type".
        msg = (
            "The *@nvct* decorator requires --datastore=s3 or --datastore=azure or --datastore=gs at the moment. "
            f"Current datastore type: {ds_type}."
        )
        super(UnsupportedNvctDatastoreException, self).__init__(msg)
61
+
62
+
63
class NvctTimeoutTooShortException(MetaflowException):
    """Report that a step's timeout is below the 60-second minimum for ``@nvct``.

    Args:
        step: Name of the offending step; echoed in the message.
    """

    headline = "[@nvct NvctTimeoutTooShortException] Timeout too short"

    def __init__(self, step):
        text = f"The timeout for step *{step}* should be at least 60 seconds for execution with @nvct"
        super().__init__(text)
@@ -0,0 +1,131 @@
1
+ import requests
2
+ from requests.adapters import HTTPAdapter, Retry
3
+
4
# Root of the NGC v2 API for one hard-coded organisation; _url() appends
# request paths to it after stripping the trailing slash.
BASE_URL = "https://api.ngc.nvidia.com/v2/orgs/zhxkmsaasxhw/"
# NOTE(review): not referenced anywhere in this module — presumably a polling
# interval in seconds used by callers; confirm.
POLL_SEC = 1
6
+
7
+
8
def _session(api_key):
    """Create a ``requests.Session`` carrying the auth headers and a retry policy.

    Args:
        api_key: Token sent as ``Authorization: Bearer <api_key>`` on every request.

    Returns:
        A session whose ``https://`` adapter is configured with up to 5 retries,
        a backoff factor of 1.5, and 502/503/504 as retry-triggering statuses.
    """
    # NOTE(review): urllib3's Retry only re-issues idempotent methods by default,
    # so the status_forcelist may not apply to POST requests — confirm intent.
    retry_policy = Retry(total=5, backoff_factor=1.5, status_forcelist=[502, 503, 504])
    default_headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }

    session = requests.Session()
    session.headers.update(default_headers)
    session.mount("https://", HTTPAdapter(max_retries=retry_policy))
    return session
19
+
20
+
21
def _url(path):
    """Return the absolute API URL for ``path``.

    ``path`` is appended verbatim to ``BASE_URL`` minus its trailing slash, so
    it is expected to start with ``/``.
    """
    root = BASE_URL.rstrip("/")
    return root + path
23
+
24
+
25
class NVCTClient:
    """Minimal HTTP client for the ``/nvct/tasks`` endpoints.

    Every call uses a 30-second timeout and raises via ``raise_for_status()``
    when the server answers with an error status.
    """

    def __init__(self, api_key):
        # One shared session so headers and the retry adapter are reused.
        self.sess = _session(api_key)

    # TODO: Handle https://outerboundsco.slack.com/archives/C05QGNR4E06/p1745970955540289
    def create(self, spec):
        """POST ``spec`` as a new task and return its id (``None`` if the reply has none)."""
        response = self.sess.post(_url("/nvct/tasks"), json=spec, timeout=30)
        response.raise_for_status()
        task = response.json().get("task", {})
        return task.get("id")

    def get(self, task_id):
        """Fetch the task record for ``task_id``; an empty dict if the reply has no ``task``."""
        response = self.sess.get(_url(f"/nvct/tasks/{task_id}"), timeout=30)
        response.raise_for_status()
        payload = response.json()
        return payload.get("task", {})

    def cancel(self, task_id):
        """Ask the service to cancel ``task_id``; returns nothing."""
        response = self.sess.post(_url(f"/nvct/tasks/{task_id}/cancel"), timeout=30)
        response.raise_for_status()
43
+
44
+
45
class NVCTRequest(object):
    """Chainable builder for a task specification dictionary.

    Every setter stores one field in the underlying dict and returns ``self``
    so calls can be chained; ``to_dict()`` hands back that same dict.
    """

    def __init__(self, name):
        # Defaults: no result handling and a 10-minute termination grace period.
        self._spec = {
            "name": name,
            "gpuSpecification": {},
            "resultHandlingStrategy": "NONE",
            "terminationGracePeriodDuration": "PT10M",
        }

    def _assign(self, field, value):
        """Store ``value`` under ``field`` and return ``self`` for chaining."""
        self._spec[field] = value
        return self

    def container_image(self, image):
        """Set ``containerImage``."""
        return self._assign("containerImage", image)

    def container_args(self, args):
        """Set ``containerArgs``."""
        return self._assign("containerArgs", args)

    def env(self, key, value):
        """Append one ``{"key": ..., "value": ...}`` entry to ``containerEnvironment``."""
        entry = {"key": key, "value": value}
        self._spec.setdefault("containerEnvironment", []).append(entry)
        return self

    def gpu(self, gpu, instance_type, backend):
        """Fill the ``gpuSpecification`` sub-dict in place."""
        self._spec["gpuSpecification"].update(
            gpu=gpu, instanceType=instance_type, backend=backend
        )
        return self

    def max_runtime(self, iso_duration):
        """Set ``maxRuntimeDuration``."""
        return self._assign("maxRuntimeDuration", iso_duration)

    def max_queued(self, iso_duration="PT72H"):
        """Set ``maxQueuedDuration`` (default ``PT72H``)."""
        return self._assign("maxQueuedDuration", iso_duration)

    def termination_grace(self, iso_duration="PT10M"):
        """Set ``terminationGracePeriodDuration`` (default ``PT10M``)."""
        return self._assign("terminationGracePeriodDuration", iso_duration)

    def extra(self, key, value):
        """Set an arbitrary top-level field of the spec."""
        return self._assign(key, value)

    def to_dict(self):
        """Return the underlying spec dict (the live object, not a copy)."""
        return self._spec
91
+
92
+
93
class NVCTTask:
    """Handle on one NVCT task: submit it, poll its status, cancel it.

    Every read of ``status`` (and of the boolean properties built on it)
    performs a fresh ``client.get`` call and caches the reply in ``record``.

    Args:
        client: Object exposing ``create(spec)``, ``get(task_id)`` and
            ``cancel(task_id)``.
        spec: Task specification passed unchanged to ``client.create``.
    """

    # Statuses after which the task will not change state any more.
    _FAILED_STATES = frozenset({"ERRORED", "CANCELED"})
    _FINISHED_STATES = _FAILED_STATES | {"COMPLETED"}

    def __init__(self, client: "NVCTClient", spec):
        self.client = client
        self.spec = spec
        self.id = None  # set by submit()
        self.record = None  # last task record returned by the service

    def submit(self):
        """Create the task on the service and return its id."""
        self.id = self.client.create(self.spec)
        return self.id

    def cancel(self):
        """Cancel the task unless it was never submitted or has already finished."""
        # Without an id there is nothing to cancel; polling would only hit
        # a bogus ``/nvct/tasks/None`` URL.
        if self.id is None:
            return
        if not self.has_finished:
            self.client.cancel(self.id)

    @property
    def status(self):
        """Fetch the task record and return its ``status`` field."""
        self.record = self.client.get(self.id)
        return self.record["status"]

    @property
    def is_waiting(self):
        """True while the task is ``QUEUED``."""
        return self.status == "QUEUED"

    @property
    def is_running(self):
        """True while the task is ``RUNNING`` or ``LAUNCHED``."""
        return self.status in {"RUNNING", "LAUNCHED"}

    @property
    def has_failed(self):
        """True once the task is ``ERRORED`` or ``CANCELED``."""
        return self.status in self._FAILED_STATES

    @property
    def has_succeeded(self):
        """True once the task is ``COMPLETED``."""
        return self.status == "COMPLETED"

    @property
    def has_finished(self):
        """True once the task has succeeded or failed.

        The status is fetched once and tested against all terminal states,
        instead of polling the service separately for success and failure.
        """
        return self.status in self._FINISHED_STATES