ob-metaflow-extensions 1.1.124__py2.py3-none-any.whl → 1.1.125__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +11 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +60 -30
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +5 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +13 -1
- {ob_metaflow_extensions-1.1.124.dist-info → ob_metaflow_extensions-1.1.125.dist-info}/METADATA +1 -1
- {ob_metaflow_extensions-1.1.124.dist-info → ob_metaflow_extensions-1.1.125.dist-info}/RECORD +8 -8
- {ob_metaflow_extensions-1.1.124.dist-info → ob_metaflow_extensions-1.1.125.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.124.dist-info → ob_metaflow_extensions-1.1.125.dist-info}/top_level.txt +0 -0
|
@@ -79,5 +79,16 @@ class NvcfTimeoutTooShortException(MetaflowException):
|
|
|
79
79
|
super(NvcfTimeoutTooShortException, self).__init__(msg)
|
|
80
80
|
|
|
81
81
|
|
|
82
|
+
class NvcfQueueTimeoutTooShortException(MetaflowException):
|
|
83
|
+
headline = "[@nvidia NvcfQueueTimeoutTooShortException] Queue Timeout too short"
|
|
84
|
+
|
|
85
|
+
def __init__(self, step):
|
|
86
|
+
msg = (
|
|
87
|
+
"The queue timeout for step *{step}* should be at least 60 seconds for "
|
|
88
|
+
"execution with @nvidia".format(step=step)
|
|
89
|
+
)
|
|
90
|
+
super(NvcfQueueTimeoutTooShortException, self).__init__(msg)
|
|
91
|
+
|
|
92
|
+
|
|
82
93
|
class NvcfKilledException(MetaflowException):
|
|
83
94
|
headline = "Nvidia job killed"
|
|
@@ -23,29 +23,53 @@ STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
|
|
|
23
23
|
STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
def retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=1):
|
|
26
|
+
def retry_on_status(status_codes=[500], max_retries=3, delay=1):
|
|
30
27
|
def decorator(func):
|
|
31
28
|
@wraps(func)
|
|
32
|
-
def wrapper(*args, **kwargs):
|
|
29
|
+
def wrapper(instance, *args, **kwargs):
|
|
33
30
|
retries = 0
|
|
34
|
-
|
|
31
|
+
|
|
32
|
+
# Determine retry limit upfront
|
|
33
|
+
use_queue_timeout = 504 in status_codes
|
|
34
|
+
if use_queue_timeout:
|
|
35
|
+
poll_seconds = int(instance._poll_seconds)
|
|
36
|
+
retry_limit = (
|
|
37
|
+
instance._queue_timeout + (poll_seconds - 1)
|
|
38
|
+
) // poll_seconds
|
|
39
|
+
remainder = instance._queue_timeout % poll_seconds
|
|
40
|
+
last_timeout = remainder if remainder != 0 else poll_seconds
|
|
41
|
+
else:
|
|
42
|
+
retry_limit = max_retries
|
|
43
|
+
|
|
44
|
+
while retries < retry_limit:
|
|
35
45
|
try:
|
|
36
|
-
return func(*args, **kwargs)
|
|
46
|
+
return func(instance, *args, **kwargs)
|
|
37
47
|
except HTTPError as e:
|
|
38
|
-
if e.code in status_codes
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
48
|
+
if e.code not in status_codes or retries >= retry_limit:
|
|
49
|
+
instance._status = JobStatus.FAILED
|
|
50
|
+
if e.code == 504 and retries >= retry_limit:
|
|
51
|
+
raise NvcfPollingConnectionError(
|
|
52
|
+
"Request timed out after all retries"
|
|
53
|
+
)
|
|
54
|
+
raise
|
|
55
|
+
|
|
56
|
+
if e.code == 504 and retries == retry_limit - 1:
|
|
57
|
+
instance._poll_seconds = str(last_timeout)
|
|
58
|
+
|
|
59
|
+
print(
|
|
60
|
+
f"[@nvidia] {'Queue timeout' if e.code == 504 else f'Received {e.code}'}, "
|
|
61
|
+
f"retrying ({retries + 1}/{retry_limit})... with poll seconds as {instance._poll_seconds}"
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
if e.code != 504:
|
|
43
65
|
time.sleep(delay)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
except
|
|
66
|
+
|
|
67
|
+
retries += 1
|
|
68
|
+
except URLError as e:
|
|
69
|
+
instance._status = JobStatus.FAILED
|
|
47
70
|
raise
|
|
48
|
-
|
|
71
|
+
# final attempt
|
|
72
|
+
return func(instance, *args, **kwargs)
|
|
49
73
|
|
|
50
74
|
return wrapper
|
|
51
75
|
|
|
@@ -53,12 +77,15 @@ def retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=1)
|
|
|
53
77
|
|
|
54
78
|
|
|
55
79
|
class Nvcf(object):
|
|
56
|
-
def __init__(
|
|
80
|
+
def __init__(
|
|
81
|
+
self, metadata, datastore, environment, function_id, ngc_api_key, queue_timeout
|
|
82
|
+
):
|
|
57
83
|
self.metadata = metadata
|
|
58
84
|
self.datastore = datastore
|
|
59
85
|
self.environment = environment
|
|
60
86
|
self._function_id = function_id
|
|
61
87
|
self._ngc_api_key = ngc_api_key
|
|
88
|
+
self._queue_timeout = queue_timeout
|
|
62
89
|
|
|
63
90
|
def launch_job(
|
|
64
91
|
self,
|
|
@@ -120,6 +147,7 @@ class Nvcf(object):
|
|
|
120
147
|
self.datastore._storage_impl,
|
|
121
148
|
self._function_id,
|
|
122
149
|
self._ngc_api_key,
|
|
150
|
+
self._queue_timeout,
|
|
123
151
|
)
|
|
124
152
|
self.job.submit()
|
|
125
153
|
|
|
@@ -172,7 +200,9 @@ result_endpoint = f"{nvcf_url}/v2/nvcf/pexec/status"
|
|
|
172
200
|
|
|
173
201
|
|
|
174
202
|
class Job(object):
|
|
175
|
-
def __init__(
|
|
203
|
+
def __init__(
|
|
204
|
+
self, command, env, task_spec, backend, function_id, ngc_api_key, queue_timeout
|
|
205
|
+
):
|
|
176
206
|
self._payload = {
|
|
177
207
|
"command": command,
|
|
178
208
|
"env": {k: v for k, v in env.items() if v is not None},
|
|
@@ -180,6 +210,8 @@ class Job(object):
|
|
|
180
210
|
self._result = {}
|
|
181
211
|
self._function_id = function_id
|
|
182
212
|
self._ngc_api_key = ngc_api_key
|
|
213
|
+
self._queue_timeout = queue_timeout
|
|
214
|
+
self._poll_seconds = "3600"
|
|
183
215
|
|
|
184
216
|
flow_name = task_spec.get("flow_name")
|
|
185
217
|
run_id = task_spec.get("run_id")
|
|
@@ -214,11 +246,14 @@ class Job(object):
|
|
|
214
246
|
)
|
|
215
247
|
self.heartbeat_thread.start()
|
|
216
248
|
|
|
249
|
+
@retry_on_status(status_codes=[504])
|
|
217
250
|
def submit(self):
|
|
218
251
|
try:
|
|
219
252
|
headers = {
|
|
220
253
|
"Authorization": f"Bearer {self._ngc_api_key}",
|
|
221
254
|
"Content-Type": "application/json",
|
|
255
|
+
"nvcf-feature-enable-gateway-timeout": "true",
|
|
256
|
+
"NVCF-POLL-SECONDS": self._poll_seconds,
|
|
222
257
|
}
|
|
223
258
|
request_data = json.dumps(self._payload).encode()
|
|
224
259
|
request = Request(
|
|
@@ -239,12 +274,9 @@ class Job(object):
|
|
|
239
274
|
self._status = JobStatus.SUBMITTED
|
|
240
275
|
else:
|
|
241
276
|
self._status = JobStatus.FAILED
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
# without that, branching over concurrent requests causes error.
|
|
246
|
-
self._state = JobStatus.FAILED
|
|
247
|
-
raise e
|
|
277
|
+
except URLError:
|
|
278
|
+
self._status = JobStatus.FAILED
|
|
279
|
+
raise
|
|
248
280
|
|
|
249
281
|
@property
|
|
250
282
|
def status(self):
|
|
@@ -272,12 +304,15 @@ class Job(object):
|
|
|
272
304
|
def result(self):
|
|
273
305
|
return self._result
|
|
274
306
|
|
|
275
|
-
@retry_on_status(status_codes=
|
|
307
|
+
@retry_on_status(status_codes=[500], max_retries=3, delay=5)
|
|
308
|
+
@retry_on_status(status_codes=[504])
|
|
276
309
|
def _poll(self):
|
|
277
310
|
try:
|
|
278
311
|
headers = {
|
|
279
312
|
"Authorization": f"Bearer {self._ngc_api_key}",
|
|
280
313
|
"Content-Type": "application/json",
|
|
314
|
+
"nvcf-feature-enable-gateway-timeout": "true",
|
|
315
|
+
"NVCF-POLL-SECONDS": self._poll_seconds,
|
|
281
316
|
}
|
|
282
317
|
request = Request(
|
|
283
318
|
f"{result_endpoint}/{self._invocation_id}", headers=headers
|
|
@@ -296,11 +331,6 @@ class Job(object):
|
|
|
296
331
|
f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
|
|
297
332
|
)
|
|
298
333
|
self._status = JobStatus.FAILED
|
|
299
|
-
# 4xx and 5xx responses go in 'except' block
|
|
300
|
-
except HTTPError as e:
|
|
301
|
-
if e.code not in RETRIABLE_STATUS_CODES:
|
|
302
|
-
self._status = JobStatus.FAILED
|
|
303
|
-
raise
|
|
304
334
|
except URLError:
|
|
305
335
|
self._status = JobStatus.FAILED
|
|
306
336
|
raise
|
|
@@ -111,6 +111,9 @@ def kill(ctx, run_id):
|
|
|
111
111
|
@click.argument("code-package-url")
|
|
112
112
|
@click.option("--function-id", help="NVCF function id.")
|
|
113
113
|
@click.option("--ngc-api-key", help="NGC API key.")
|
|
114
|
+
@click.option(
|
|
115
|
+
"--queue-timeout", default=5 * 24 * 3600, help="Queue timeout in seconds."
|
|
116
|
+
)
|
|
114
117
|
@click.option("--run-id", help="Passed to the top-level 'step'.")
|
|
115
118
|
@click.option("--task-id", help="Passed to the top-level 'step'.")
|
|
116
119
|
@click.option("--input-paths", help="Passed to the top-level 'step'.")
|
|
@@ -133,6 +136,7 @@ def step(
|
|
|
133
136
|
code_package_url,
|
|
134
137
|
function_id,
|
|
135
138
|
ngc_api_key,
|
|
139
|
+
queue_timeout,
|
|
136
140
|
**kwargs,
|
|
137
141
|
):
|
|
138
142
|
def echo(msg, stream="stderr", _id=None, **kwargs):
|
|
@@ -249,6 +253,7 @@ def step(
|
|
|
249
253
|
ctx.obj.environment,
|
|
250
254
|
function_id,
|
|
251
255
|
ngc_api_key,
|
|
256
|
+
queue_timeout,
|
|
252
257
|
)
|
|
253
258
|
try:
|
|
254
259
|
with ctx.obj.monitor.measure("metaflow.nvcf.launch_job"):
|
|
@@ -20,6 +20,7 @@ from .exceptions import (
|
|
|
20
20
|
UnsupportedNvcfConfigurationException,
|
|
21
21
|
UnsupportedNvcfDatastoreException,
|
|
22
22
|
NvcfTimeoutTooShortException,
|
|
23
|
+
NvcfQueueTimeoutTooShortException,
|
|
23
24
|
)
|
|
24
25
|
|
|
25
26
|
from metaflow.metaflow_config import SERVICE_URL
|
|
@@ -36,10 +37,16 @@ class NvcfDecorator(StepDecorator):
|
|
|
36
37
|
Number of GPUs to use.
|
|
37
38
|
gpu_type : str
|
|
38
39
|
Type of Nvidia GPU to use.
|
|
40
|
+
queue_timeout : int
|
|
41
|
+
Time to keep the job in NVCF's queue.
|
|
39
42
|
"""
|
|
40
43
|
|
|
41
44
|
name = "nvidia"
|
|
42
|
-
defaults = {
|
|
45
|
+
defaults = {
|
|
46
|
+
"gpu": 1,
|
|
47
|
+
"gpu_type": None,
|
|
48
|
+
"queue_timeout": 5 * 24 * 3600, # Default 5 days in seconds
|
|
49
|
+
}
|
|
43
50
|
|
|
44
51
|
package_url = None
|
|
45
52
|
package_sha = None
|
|
@@ -118,6 +125,10 @@ class NvcfDecorator(StepDecorator):
|
|
|
118
125
|
)
|
|
119
126
|
self.attributes["function_id"] = available_configurations[desired_configuration]
|
|
120
127
|
|
|
128
|
+
queue_timeout = self.attributes["queue_timeout"]
|
|
129
|
+
if not isinstance(queue_timeout, int) or queue_timeout < 60:
|
|
130
|
+
raise NvcfQueueTimeoutTooShortException(step)
|
|
131
|
+
|
|
121
132
|
def runtime_init(self, flow, graph, package, run_id):
|
|
122
133
|
# Set some more internal state.
|
|
123
134
|
self.flow = flow
|
|
@@ -144,6 +155,7 @@ class NvcfDecorator(StepDecorator):
|
|
|
144
155
|
cli_options = {
|
|
145
156
|
"function_id": self.attributes["function_id"],
|
|
146
157
|
"ngc_api_key": self.attributes["ngc_api_key"],
|
|
158
|
+
"queue_timeout": self.attributes["queue_timeout"],
|
|
147
159
|
}
|
|
148
160
|
cli_args.command_options.update(cli_options)
|
|
149
161
|
cli_args.entrypoint[0] = sys.executable
|
{ob_metaflow_extensions-1.1.124.dist-info → ob_metaflow_extensions-1.1.125.dist-info}/RECORD
RENAMED
|
@@ -15,11 +15,11 @@ metaflow_extensions/outerbounds/plugins/nim/__init__.py,sha256=GVnvSTjqYVj5oG2yh
|
|
|
15
15
|
metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=SWieODDxtIaeZwdMYtObDi57Kjyfw2DUuE6pJtU750w,9206
|
|
16
16
|
metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=aGHdNw_hqBu8i0zWXcatQM6e769wUXox0l8g0f6fNZ8,146
|
|
18
|
-
metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256
|
|
18
|
+
metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256=-Pm9cOWUzpv94TvVUeq-FenAWdfLBJd5N7WPqIGZVqU,3671
|
|
19
19
|
metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=pOWwm8LFQBbtku0zNBBwCyXxLK8U-hhC4naQcmU69nE,6217
|
|
20
|
-
metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=
|
|
21
|
-
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=
|
|
22
|
-
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=
|
|
20
|
+
metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=zhHgoE76WOpCJnoMvSNekliy0p4YVzcwcoIKlXXXzlE,11529
|
|
21
|
+
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=3D-r5XO88Yh2k1EAZFJTe_PwdbhWp5qXflG8AgE4ZIU,9500
|
|
22
|
+
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=pxxNxW4bW3kbB6ybRam657GyKIhvIkMuidB94iFfCD8,9116
|
|
23
23
|
metaflow_extensions/outerbounds/plugins/nvcf/utils.py,sha256=DxWSCayfa95e0HJkWacey1s1nxoTpaunGhrb_0Ayv28,133
|
|
24
24
|
metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py,sha256=oI_C3c64XBm7n88FILqHwn-Nnc5DeT_68I67lM9rXaI,2434
|
|
25
25
|
metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py,sha256=gDHQ2sMIp4NuZSzUspbSd8RGdFAoO5mgZAyFcZ2a51Y,2619
|
|
@@ -44,7 +44,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2
|
|
|
44
44
|
metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
|
|
45
45
|
metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
46
46
|
metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
|
|
47
|
-
ob_metaflow_extensions-1.1.
|
|
48
|
-
ob_metaflow_extensions-1.1.
|
|
49
|
-
ob_metaflow_extensions-1.1.
|
|
50
|
-
ob_metaflow_extensions-1.1.
|
|
47
|
+
ob_metaflow_extensions-1.1.125.dist-info/METADATA,sha256=0tew8M6hNw_p64Uc8XX8FroXNthGs7lmcIFPzrn1_Lc,520
|
|
48
|
+
ob_metaflow_extensions-1.1.125.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
|
|
49
|
+
ob_metaflow_extensions-1.1.125.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
|
|
50
|
+
ob_metaflow_extensions-1.1.125.dist-info/RECORD,,
|
|
File without changes
|
{ob_metaflow_extensions-1.1.124.dist-info → ob_metaflow_extensions-1.1.125.dist-info}/top_level.txt
RENAMED
|
File without changes
|