ob-metaflow-extensions 1.1.142__py2.py3-none-any.whl → 1.4.33__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow_extensions/outerbounds/__init__.py +1 -1
- metaflow_extensions/outerbounds/plugins/__init__.py +26 -5
- metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/app_deploy_decorator.py +146 -0
- metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +10 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_cli.py +1200 -0
- metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +146 -0
- metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
- metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
- metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
- metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +12 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +161 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +868 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +288 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +139 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +398 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1088 -0
- metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
- metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
- metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +303 -0
- metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
- metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
- metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
- metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
- metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
- metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
- metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +78 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
- metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +17 -3
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +1 -0
- metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +18 -44
- metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
- metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
- metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +294 -233
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/nvcf/constants.py +2 -2
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +100 -19
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +6 -1
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- metaflow_extensions/outerbounds/plugins/ollama/__init__.py +171 -16
- metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
- metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1710 -114
- metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
- metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
- metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
- metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +38 -2
- metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +44 -4
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +6 -3
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +13 -7
- metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +8 -2
- metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
- metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
- metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
- metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
- metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
- metaflow_extensions/outerbounds/remote_config.py +27 -3
- metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +87 -2
- metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
- metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
- metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
- {ob_metaflow_extensions-1.1.142.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/METADATA +2 -2
- ob_metaflow_extensions-1.4.33.dist-info/RECORD +134 -0
- metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
- ob_metaflow_extensions-1.1.142.dist-info/RECORD +0 -64
- {ob_metaflow_extensions-1.1.142.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.142.dist-info → ob_metaflow_extensions-1.4.33.dist-info}/top_level.txt +0 -0
|
@@ -153,7 +153,7 @@ class Nvcf(object):
|
|
|
153
153
|
|
|
154
154
|
def wait(self, stdout_location, stderr_location, echo=None):
|
|
155
155
|
def wait_for_launch(job):
|
|
156
|
-
status = job.
|
|
156
|
+
status = job._status
|
|
157
157
|
echo(
|
|
158
158
|
"Task status: %s..." % status,
|
|
159
159
|
"stderr",
|
|
@@ -188,12 +188,16 @@ class Nvcf(object):
|
|
|
188
188
|
|
|
189
189
|
|
|
190
190
|
class JobStatus(object):
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
191
|
+
CREATED = "CREATED" # Job object created but not submitted
|
|
192
|
+
SUBMITTED = "SUBMITTED" # Job submitted to NVCF
|
|
193
|
+
POLLED = "POLLED" # Job has been successfully polled at least once
|
|
194
|
+
SUCCESSFUL = "SUCCESSFUL" # Job completed successfully
|
|
195
|
+
FAILED = "FAILED" # Job failed
|
|
196
|
+
DISAPPEARED = "DISAPPEARED" # Job disappeared from NVCF but was previously polled (likely successful)
|
|
195
197
|
|
|
196
198
|
|
|
199
|
+
terminal_states = [JobStatus.SUCCESSFUL, JobStatus.FAILED, JobStatus.DISAPPEARED]
|
|
200
|
+
|
|
197
201
|
nvcf_url = "https://api.nvcf.nvidia.com"
|
|
198
202
|
submit_endpoint = f"{nvcf_url}/v2/nvcf/pexec/functions"
|
|
199
203
|
result_endpoint = f"{nvcf_url}/v2/nvcf/pexec/status"
|
|
@@ -213,6 +217,14 @@ class Job(object):
|
|
|
213
217
|
self._queue_timeout = queue_timeout
|
|
214
218
|
self._poll_seconds = "3600"
|
|
215
219
|
|
|
220
|
+
# Initialize status and tracking variables
|
|
221
|
+
self._status = JobStatus.CREATED
|
|
222
|
+
self._last_poll_time = time.time()
|
|
223
|
+
|
|
224
|
+
# State tracking for long polling
|
|
225
|
+
self._long_polling_active = False
|
|
226
|
+
self._poll_response = None
|
|
227
|
+
|
|
216
228
|
flow_name = task_spec.get("flow_name")
|
|
217
229
|
run_id = task_spec.get("run_id")
|
|
218
230
|
step_name = task_spec.get("step_name")
|
|
@@ -272,21 +284,31 @@ class Job(object):
|
|
|
272
284
|
self._result = data
|
|
273
285
|
elif response.getcode() == 202:
|
|
274
286
|
self._status = JobStatus.SUBMITTED
|
|
287
|
+
# Start long polling immediately after receiving 202
|
|
288
|
+
self._start_long_polling()
|
|
275
289
|
else:
|
|
276
290
|
self._status = JobStatus.FAILED
|
|
277
291
|
except URLError:
|
|
278
292
|
self._status = JobStatus.FAILED
|
|
279
293
|
raise
|
|
280
294
|
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
295
|
+
def _start_long_polling(self):
|
|
296
|
+
if not self._long_polling_active:
|
|
297
|
+
self._long_polling_active = True
|
|
298
|
+
polling_thread = threading.Thread(target=self._long_poll_loop, daemon=True)
|
|
299
|
+
polling_thread.start()
|
|
300
|
+
|
|
301
|
+
def _long_poll_loop(self):
|
|
302
|
+
while self._long_polling_active and self._status not in terminal_states:
|
|
284
303
|
try:
|
|
285
304
|
self._poll()
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
305
|
+
# No sleep needed - the request itself will block for up to self._poll_seconds
|
|
306
|
+
except Exception as e:
|
|
307
|
+
print(f"[@nvidia] Long polling error: {e}")
|
|
308
|
+
# Brief pause before retry on error
|
|
309
|
+
time.sleep(1)
|
|
310
|
+
|
|
311
|
+
self._long_polling_active = False
|
|
290
312
|
|
|
291
313
|
@property
|
|
292
314
|
def id(self):
|
|
@@ -294,11 +316,12 @@ class Job(object):
|
|
|
294
316
|
|
|
295
317
|
@property
|
|
296
318
|
def is_running(self):
|
|
297
|
-
|
|
319
|
+
# Job is running if it's in SUBMITTED or POLLED state
|
|
320
|
+
return self._status in [JobStatus.SUBMITTED, JobStatus.POLLED]
|
|
298
321
|
|
|
299
322
|
@property
|
|
300
323
|
def has_failed(self):
|
|
301
|
-
return self.
|
|
324
|
+
return self._status == JobStatus.FAILED
|
|
302
325
|
|
|
303
326
|
@property
|
|
304
327
|
def result(self):
|
|
@@ -308,6 +331,14 @@ class Job(object):
|
|
|
308
331
|
@retry_on_status(status_codes=[504])
|
|
309
332
|
def _poll(self):
|
|
310
333
|
try:
|
|
334
|
+
# Implement rate limiting to prevent more than 1 request per second
|
|
335
|
+
current_time = time.time()
|
|
336
|
+
if (
|
|
337
|
+
hasattr(self, "_last_poll_time")
|
|
338
|
+
and current_time - self._last_poll_time < 1
|
|
339
|
+
):
|
|
340
|
+
time.sleep(1 - (current_time - self._last_poll_time))
|
|
341
|
+
|
|
311
342
|
headers = {
|
|
312
343
|
"Authorization": f"Bearer {self._ngc_api_key}",
|
|
313
344
|
"Content-Type": "application/json",
|
|
@@ -317,20 +348,70 @@ class Job(object):
|
|
|
317
348
|
request = Request(
|
|
318
349
|
f"{result_endpoint}/{self._invocation_id}", headers=headers
|
|
319
350
|
)
|
|
351
|
+
|
|
352
|
+
# Record time before making the request
|
|
353
|
+
self._last_poll_time = time.time()
|
|
354
|
+
|
|
320
355
|
response = urlopen(request)
|
|
356
|
+
body = response.read()
|
|
357
|
+
print(f"[@nvidia] polling status code: {response.getcode()}")
|
|
358
|
+
|
|
321
359
|
if response.getcode() == 200:
|
|
322
|
-
data = json.loads(
|
|
323
|
-
# TODO: Propagate the internal error forward
|
|
360
|
+
data = json.loads(body)
|
|
324
361
|
if data.get("exit_code") == 0:
|
|
325
362
|
self._status = JobStatus.SUCCESSFUL
|
|
326
363
|
else:
|
|
327
364
|
self._status = JobStatus.FAILED
|
|
328
365
|
self._result = data
|
|
329
|
-
|
|
366
|
+
self._long_polling_active = False # Stop polling once job completes
|
|
367
|
+
elif response.getcode() == 202:
|
|
368
|
+
# Job is still running - status remains SUBMITTED or POLLED
|
|
369
|
+
if self._status == JobStatus.SUBMITTED:
|
|
370
|
+
self._status = JobStatus.POLLED
|
|
371
|
+
elif response.getcode() == 302:
|
|
372
|
+
# Handle redirects for large responses or requests in different regions
|
|
373
|
+
redirect_location = response.headers.get("Location")
|
|
374
|
+
if redirect_location:
|
|
375
|
+
redirect_request = Request(redirect_location, headers=headers)
|
|
376
|
+
redirect_response = urlopen(redirect_request)
|
|
377
|
+
if redirect_response.getcode() == 200:
|
|
378
|
+
data = json.loads(redirect_response.read())
|
|
379
|
+
if data.get("exit_code") == 0:
|
|
380
|
+
self._status = JobStatus.SUCCESSFUL
|
|
381
|
+
else:
|
|
382
|
+
self._status = JobStatus.FAILED
|
|
383
|
+
self._result = data
|
|
384
|
+
self._long_polling_active = False
|
|
385
|
+
else:
|
|
330
386
|
print(
|
|
331
387
|
f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
|
|
332
388
|
)
|
|
333
389
|
self._status = JobStatus.FAILED
|
|
334
|
-
|
|
390
|
+
|
|
391
|
+
except HTTPError as e:
|
|
392
|
+
if e.code == 404:
|
|
393
|
+
# 404 interpretation depends on job lifecycle
|
|
394
|
+
if self._status in [JobStatus.POLLED, JobStatus.SUBMITTED]:
|
|
395
|
+
# We've submitted or successfully polled this job before,
|
|
396
|
+
# so a 404 likely means it completed and was removed
|
|
397
|
+
self._status = JobStatus.DISAPPEARED
|
|
398
|
+
self._result = {"exit_code": 0}
|
|
399
|
+
print(
|
|
400
|
+
f"[@nvidia] 404 received for job that was previously tracked - assuming job completed"
|
|
401
|
+
)
|
|
402
|
+
else:
|
|
403
|
+
# Job was never successfully tracked
|
|
404
|
+
print(
|
|
405
|
+
f"[@nvidia] 404 received for job that was never successfully tracked - treating as failure"
|
|
406
|
+
)
|
|
407
|
+
self._status = JobStatus.FAILED
|
|
408
|
+
raise NvcfPollingConnectionError(e)
|
|
409
|
+
elif e.code in [500, 504]:
|
|
410
|
+
# Don't set status to FAILED, just re-raise for retry decorator
|
|
411
|
+
raise
|
|
412
|
+
else:
|
|
413
|
+
self._status = JobStatus.FAILED
|
|
414
|
+
raise NvcfPollingConnectionError(e)
|
|
415
|
+
except URLError as e:
|
|
335
416
|
self._status = JobStatus.FAILED
|
|
336
|
-
raise
|
|
417
|
+
raise NvcfPollingConnectionError(e)
|
|
@@ -198,7 +198,12 @@ class NvcfDecorator(StepDecorator):
|
|
|
198
198
|
meta["nvcf-nspectid"] = os.environ.get("NVCF_NSPECTID")
|
|
199
199
|
|
|
200
200
|
entries = [
|
|
201
|
-
MetaDatum(
|
|
201
|
+
MetaDatum(
|
|
202
|
+
field=k,
|
|
203
|
+
value=v,
|
|
204
|
+
type=k,
|
|
205
|
+
tags=["attempt_id:{0}".format(retry_count)],
|
|
206
|
+
)
|
|
202
207
|
for k, v in meta.items()
|
|
203
208
|
if v is not None
|
|
204
209
|
]
|
|
File without changes
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from metaflow.exception import MetaflowException
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class NvctExecutionException(MetaflowException):
    """Metaflow exception whose headline reports that an Nvct task couldn't be executed."""

    # Short summary line attached to the exception class.
    headline = "Nvct task couldn't be executed"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NvctTaskFailedException(MetaflowException):
    """Metaflow exception whose headline reports that an Nvct task failed."""

    # Short summary line attached to the exception class.
    headline = "Nvct task failed"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class NvctKilledException(MetaflowException):
    """Metaflow exception whose headline reports that an Nvct job was killed."""

    # Short summary line attached to the exception class.
    headline = "Nvct job killed"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RequestedGPUTypeUnavailableException(MetaflowException):
    """Error for an @nvct ``gpu_type`` that is not offered.

    The message names the rejected GPU type and lists the types that can be
    chosen instead.
    """

    headline = "[@nvct RequestedGPUTypeUnavailableException] GPU type unavailable."

    def __init__(self, requested_gpu_type, available_gpus):
        """Compose the error text.

        Args:
            requested_gpu_type: Value that was passed as ``gpu_type``.
            available_gpus: Supported GPU types; interpolated into the
                message as-is.
        """
        rejected = f"The requested GPU type @nvct(..., gpu_type='{requested_gpu_type}') is not available. "
        alternatives = f"Please choose from the following supported GPU types when using @nvct: {available_gpus}"
        super().__init__(rejected + alternatives)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class UnsupportedNvctConfigurationException(MetaflowException):
    """Error for an @nvct GPU count / GPU type combination that is not available.

    The message either lists the configurations that are available for the
    requested GPU type, or states that none are available at all.
    """

    headline = (
        "[@nvct UnsupportedNvctConfigurationException] Unsupported GPU configuration"
    )

    def __init__(self, n_gpu, gpu_type, available_configurations, step):
        """Build the error message.

        Args:
            n_gpu: Requested number of GPUs.
            gpu_type: Requested GPU type.
            available_configurations: Iterable of mappings, each with an
                ``"n_gpus"`` key. ``None`` or an empty collection means that
                no configuration is available.
            step: Name of the step carrying the decorator.
        """
        msg = f"The requested configuration of @nvct(gpu={n_gpu}, gpu_type='{gpu_type}') for @step {step} is not available."
        # Truthiness instead of len(): also covers None, which would
        # otherwise raise TypeError while constructing the error itself.
        if not available_configurations:
            msg += (
                "\n\nNo configurations are available in your Outerbounds deployment."
                " Please contact Outerbounds support if you wish to use @nvct."
            )
        else:
            msg += f"\n\nAvailable configurations for your deployment with {gpu_type} include: \n\t- {self._display(gpu_type, available_configurations)}"
            msg += "\n\nPlease contact Outerbounds support if you wish to use a configuration not listed above."
        super(UnsupportedNvctConfigurationException, self).__init__(msg)

    def _display(self, gpu_type, configs):
        """Render each configuration as an ``@nvct(...)`` line of a bullet list."""
        return "\n\t- ".join(
            f"@nvct(gpu={cfg['n_gpus']}, gpu_type='{gpu_type}')" for cfg in configs
        )
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class UnsupportedNvctDatastoreException(MetaflowException):
    """Error for using @nvct with a datastore type other than s3, azure or gs."""

    headline = "[@nvct UnsupportedNvctDatastoreException] Unsupported datastore"

    def __init__(self, ds_type):
        """Build the error message.

        Args:
            ds_type: The datastore type currently in use; interpolated into
                the message.
        """
        # The adjacent literals are concatenated, so the first one must end
        # with a space to keep the two sentences apart.
        msg = (
            "The *@nvct* decorator requires --datastore=s3 or --datastore=azure or --datastore=gs at the moment. "
            f"Current datastore type: {ds_type}."
        )
        super(UnsupportedNvctDatastoreException, self).__init__(msg)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class NvctTimeoutTooShortException(MetaflowException):
    """Error for a step whose timeout is below the 60 seconds @nvct requires."""

    headline = "[@nvct NvctTimeoutTooShortException] Timeout too short"

    def __init__(self, step):
        """Build the error message.

        Args:
            step: Name of the offending step; substituted into the message.
        """
        template = (
            "The timeout for step *{step}* should be at least 60 seconds for "
            "execution with @nvct"
        )
        super().__init__(template.format(step=step))
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from requests.adapters import HTTPAdapter, Retry
|
|
3
|
+
|
|
4
|
+
# Root URL that every request path is appended to; _url() strips the
# trailing slash before joining.
BASE_URL = "https://api.ngc.nvidia.com/v2/orgs/zhxkmsaasxhw/"
# NOTE(review): presumably a polling interval in seconds; it is not referenced
# anywhere in the visible part of this module, so confirm against callers.
POLL_SEC = 1
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _session(api_key):
    """Create a ``requests.Session`` preconfigured for the API.

    The session sends a bearer ``Authorization`` header built from *api_key*
    and a JSON ``Content-Type`` header. An adapter with a retry policy
    (total=5, backoff factor 1.5, forced on status 502/503/504) is mounted
    for ``https://`` URLs.

    Args:
        api_key: Token placed in the ``Authorization`` header.

    Returns:
        The configured session.
    """
    # NOTE(review): urllib3's Retry applies status_forcelist only to its
    # allowed methods; confirm whether POST requests are meant to be retried.
    session = requests.Session()
    session.headers["Authorization"] = f"Bearer {api_key}"
    session.headers["Content-Type"] = "application/json"
    retry_policy = Retry(total=5, backoff_factor=1.5, status_forcelist=[502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry_policy)
    session.mount("https://", adapter)
    return session
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _url(path):
    """Return the absolute URL for *path*.

    The trailing slash of ``BASE_URL`` is removed and *path* is appended
    verbatim, so *path* is expected to start with ``/``.
    """
    root = BASE_URL.rstrip("/")
    return root + path
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NVCTClient:
    """Thin HTTP client for the ``/nvct/tasks`` endpoints.

    Every call uses a 30 second timeout and raises through
    ``raise_for_status()`` on an HTTP error response.
    """

    def __init__(self, api_key):
        """Open a session authenticated with *api_key*."""
        self.sess = _session(api_key)

    # TODO: Handle https://outerboundsco.slack.com/archives/C05QGNR4E06/p1745970955540289
    def create(self, spec):
        """POST *spec* as JSON to create a task.

        Returns:
            The ``id`` found under the ``task`` key of the response, or
            ``None`` when either key is absent.
        """
        response = self.sess.post(_url("/nvct/tasks"), json=spec, timeout=30)
        response.raise_for_status()
        task = response.json().get("task", {})
        return task.get("id")

    def get(self, task_id):
        """Fetch the task record for *task_id*.

        Returns:
            The value under the ``task`` key of the response, or an empty
            dict when the key is absent.
        """
        endpoint = _url(f"/nvct/tasks/{task_id}")
        response = self.sess.get(endpoint, timeout=30)
        response.raise_for_status()
        payload = response.json()
        return payload.get("task", {})

    def cancel(self, task_id):
        """POST to the cancel endpoint of *task_id*; returns nothing."""
        endpoint = _url(f"/nvct/tasks/{task_id}/cancel")
        response = self.sess.post(endpoint, timeout=30)
        response.raise_for_status()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class NVCTRequest(object):
    """Fluent builder for a task specification dictionary.

    Every setter stores its value in the underlying spec and returns the
    builder itself so calls can be chained; ``to_dict()`` hands back the spec.
    """

    def __init__(self, name):
        """Start a spec for the task called *name* with the default fields."""
        self._spec = {
            "name": name,
            "gpuSpecification": {},
            "resultHandlingStrategy": "NONE",
            "terminationGracePeriodDuration": "PT10M",
        }

    def container_image(self, image):
        """Set ``containerImage``."""
        return self.extra("containerImage", image)

    def container_args(self, args):
        """Set ``containerArgs``."""
        return self.extra("containerArgs", args)

    def env(self, key, value):
        """Append one ``{"key": ..., "value": ...}`` entry to ``containerEnvironment``."""
        entry = {"key": key, "value": value}
        self._spec.setdefault("containerEnvironment", []).append(entry)
        return self

    def gpu(self, gpu, instance_type, backend):
        """Fill the ``gpuSpecification`` sub-dictionary."""
        self._spec["gpuSpecification"].update(
            gpu=gpu,
            instanceType=instance_type,
            backend=backend,
        )
        return self

    def max_runtime(self, iso_duration):
        """Set ``maxRuntimeDuration``."""
        return self.extra("maxRuntimeDuration", iso_duration)

    def max_queued(self, iso_duration="PT72H"):
        """Set ``maxQueuedDuration`` (default ``"PT72H"``)."""
        return self.extra("maxQueuedDuration", iso_duration)

    def termination_grace(self, iso_duration="PT10M"):
        """Set ``terminationGracePeriodDuration`` (default ``"PT10M"``)."""
        return self.extra("terminationGracePeriodDuration", iso_duration)

    def extra(self, key, value):
        """Set an arbitrary top-level *key* of the spec to *value*."""
        self._spec[key] = value
        return self

    def to_dict(self):
        """Return the underlying spec dictionary itself, not a copy."""
        return self._spec
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class NVCTTask:
    """Handle for one task: submit it, query its status, cancel it.

    ``status`` and every predicate built on it query the client on each
    access, so each read reflects the latest record returned by the client.
    """

    # Status strings grouped by the predicate that reports them.
    _RUNNING_STATES = frozenset({"RUNNING", "LAUNCHED"})
    _FAILED_STATES = frozenset({"ERRORED", "CANCELED"})

    def __init__(self, client: "NVCTClient", spec):
        """Bind the task to *client* and remember *spec* for submission.

        Args:
            client: Object providing ``create(spec)``, ``get(task_id)`` and
                ``cancel(task_id)``.
            spec: Task specification passed to ``client.create``.
        """
        self.client = client
        self.spec = spec
        self.id = None  # set by submit()
        self.record = None  # last task record fetched by ``status``

    def submit(self):
        """Create the task through the client; store and return its id."""
        self.id = self.client.create(self.spec)
        return self.id

    def cancel(self):
        """Cancel the task unless it was never submitted or already finished."""
        if self.id is None:
            # Nothing was created, so there is nothing to query or cancel.
            return
        if not self.has_finished:
            self.client.cancel(self.id)

    @property
    def status(self):
        """Fetch the task record from the client and return its ``status``.

        Raises:
            KeyError: if the fetched record has no ``"status"`` key.
        """
        self.record = self.client.get(self.id)
        return self.record["status"]

    @property
    def is_waiting(self):
        """True while the task status is ``QUEUED``."""
        return self.status == "QUEUED"

    @property
    def is_running(self):
        """True while the task status is ``RUNNING`` or ``LAUNCHED``."""
        return self.status in self._RUNNING_STATES

    @property
    def has_failed(self):
        """True when the task status is ``ERRORED`` or ``CANCELED``."""
        return self.status in self._FAILED_STATES

    @property
    def has_succeeded(self):
        """True when the task status is ``COMPLETED``."""
        return self.status == "COMPLETED"

    @property
    def has_finished(self):
        """True when the task succeeded or failed.

        The status is fetched exactly once so both checks see the same
        snapshot, rather than two separate requests that may disagree.
        """
        current = self.status
        return current == "COMPLETED" or current in self._FAILED_STATES
|