ob-metaflow-extensions 1.1.123rc3__tar.gz → 1.1.125__tar.gz
This diff shows the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only and reflects the changes between those versions.
Potentially problematic release: this version of ob-metaflow-extensions might be problematic.
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -1
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +11 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +60 -30
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +5 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +13 -1
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -4
- ob-metaflow-extensions-1.1.125/ob_metaflow_extensions.egg-info/requires.txt +3 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/setup.py +2 -2
- ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/apps/app_utils.py +0 -170
- ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +0 -154
- ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +0 -239
- ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
- ob-metaflow-extensions-1.1.123rc3/ob_metaflow_extensions.egg-info/requires.txt +0 -3
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/README.md +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/auth_server.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/apps → ob-metaflow-extensions-1.1.125/metaflow_extensions/outerbounds/plugins/fast_bakery}/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/fast_bakery → ob-metaflow-extensions-1.1.125/metaflow_extensions/outerbounds/plugins/nvcf}/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/constants.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/nvcf/utils.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/nvcf → ob-metaflow-extensions-1.1.125/metaflow_extensions/outerbounds/plugins/secrets}/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/secrets/secrets.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/secrets → ob-metaflow-extensions-1.1.125/metaflow_extensions/outerbounds/plugins/snowpark}/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/remote_config.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
- {ob-metaflow-extensions-1.1.123rc3 → ob-metaflow-extensions-1.1.125}/setup.cfg +0 -0
metaflow_extensions/outerbounds/plugins/__init__.py
@@ -319,7 +319,6 @@ STEP_DECORATORS_DESC = [
     ("snowpark", ".snowpark.snowpark_decorator.SnowparkDecorator"),
     ("tensorboard", ".tensorboard.TensorboardDecorator"),
     ("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
-    ("app_deploy", ".apps.deploy_decorator.WorkstationAppDeployDecorator"),
 ]
 FLOW_DECORATORS_DESC = [("nim", ".nim.NimDecorator")]
 TOGGLE_STEP_DECORATOR = [
metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py
@@ -79,5 +79,16 @@ class NvcfTimeoutTooShortException(MetaflowException):
         super(NvcfTimeoutTooShortException, self).__init__(msg)


+class NvcfQueueTimeoutTooShortException(MetaflowException):
+    headline = "[@nvidia NvcfQueueTimeoutTooShortException] Queue Timeout too short"
+
+    def __init__(self, step):
+        msg = (
+            "The queue timeout for step *{step}* should be at least 60 seconds for "
+            "execution with @nvidia".format(step=step)
+        )
+        super(NvcfQueueTimeoutTooShortException, self).__init__(msg)
+
+
 class NvcfKilledException(MetaflowException):
     headline = "Nvidia job killed"
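For context, a tiny sketch (not part of the package) of the message the new exception produces, using a hypothetical step name `train`:

```python
# Hypothetical step name, used only to show the rendered message.
step = "train"
msg = (
    "The queue timeout for step *{step}* should be at least 60 seconds for "
    "execution with @nvidia".format(step=step)
)
print(msg)
# The queue timeout for step *train* should be at least 60 seconds for execution with @nvidia
```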
metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py
@@ -23,29 +23,53 @@ STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
 STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)


-
-
-
-def retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=1):
+def retry_on_status(status_codes=[500], max_retries=3, delay=1):
     def decorator(func):
         @wraps(func)
-        def wrapper(*args, **kwargs):
+        def wrapper(instance, *args, **kwargs):
             retries = 0
-
+
+            # Determine retry limit upfront
+            use_queue_timeout = 504 in status_codes
+            if use_queue_timeout:
+                poll_seconds = int(instance._poll_seconds)
+                retry_limit = (
+                    instance._queue_timeout + (poll_seconds - 1)
+                ) // poll_seconds
+                remainder = instance._queue_timeout % poll_seconds
+                last_timeout = remainder if remainder != 0 else poll_seconds
+            else:
+                retry_limit = max_retries
+
+            while retries < retry_limit:
                 try:
-                    return func(*args, **kwargs)
+                    return func(instance, *args, **kwargs)
                 except HTTPError as e:
-                    if e.code in status_codes
-
-
-
-
+                    if e.code not in status_codes or retries >= retry_limit:
+                        instance._status = JobStatus.FAILED
+                        if e.code == 504 and retries >= retry_limit:
+                            raise NvcfPollingConnectionError(
+                                "Request timed out after all retries"
+                            )
+                        raise
+
+                    if e.code == 504 and retries == retry_limit - 1:
+                        instance._poll_seconds = str(last_timeout)
+
+                    print(
+                        f"[@nvidia] {'Queue timeout' if e.code == 504 else f'Received {e.code}'}, "
+                        f"retrying ({retries + 1}/{retry_limit})... with poll seconds as {instance._poll_seconds}"
+                    )
+
+                    if e.code != 504:
                         time.sleep(delay)
-
-
-                except
+
+                    retries += 1
+                except URLError as e:
+                    instance._status = JobStatus.FAILED
                     raise
-
+            # final attempt
+            return func(instance, *args, **kwargs)

         return wrapper

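To make the new retry budget concrete, here is a small standalone sketch of the arithmetic that `retry_on_status` now performs for 504s. The numbers are the defaults visible elsewhere in this diff (a 5-day queue timeout and a 3600-second poll window); nothing beyond that arithmetic is implied:

```python
# Sketch of the 504 retry-limit math from retry_on_status, using the diff's defaults.
queue_timeout = 5 * 24 * 3600   # 432000 seconds (the new default queue timeout)
poll_seconds = 3600             # Job._poll_seconds starts at "3600"

# Ceiling division: how many long-poll windows fit into the queue timeout.
retry_limit = (queue_timeout + (poll_seconds - 1)) // poll_seconds
# The final attempt shrinks its poll window to whatever time is left over,
# or keeps a full window when the timeout divides evenly.
remainder = queue_timeout % poll_seconds
last_timeout = remainder if remainder != 0 else poll_seconds

print(retry_limit, last_timeout)  # 120 3600
```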
@@ -53,12 +77,15 @@ def retry_on_status(status_codes=RETRIABLE_STATUS_CODES, max_retries=3, delay=1)


 class Nvcf(object):
-    def __init__(
+    def __init__(
+        self, metadata, datastore, environment, function_id, ngc_api_key, queue_timeout
+    ):
         self.metadata = metadata
         self.datastore = datastore
         self.environment = environment
         self._function_id = function_id
         self._ngc_api_key = ngc_api_key
+        self._queue_timeout = queue_timeout

     def launch_job(
         self,
@@ -120,6 +147,7 @@ class Nvcf(object):
             self.datastore._storage_impl,
             self._function_id,
             self._ngc_api_key,
+            self._queue_timeout,
         )
         self.job.submit()

@@ -172,7 +200,9 @@ result_endpoint = f"{nvcf_url}/v2/nvcf/pexec/status"


 class Job(object):
-    def __init__(
+    def __init__(
+        self, command, env, task_spec, backend, function_id, ngc_api_key, queue_timeout
+    ):
         self._payload = {
             "command": command,
             "env": {k: v for k, v in env.items() if v is not None},
@@ -180,6 +210,8 @@ class Job(object):
         self._result = {}
         self._function_id = function_id
         self._ngc_api_key = ngc_api_key
+        self._queue_timeout = queue_timeout
+        self._poll_seconds = "3600"

         flow_name = task_spec.get("flow_name")
         run_id = task_spec.get("run_id")
@@ -214,11 +246,14 @@ class Job(object):
         )
         self.heartbeat_thread.start()

+    @retry_on_status(status_codes=[504])
     def submit(self):
         try:
             headers = {
                 "Authorization": f"Bearer {self._ngc_api_key}",
                 "Content-Type": "application/json",
+                "nvcf-feature-enable-gateway-timeout": "true",
+                "NVCF-POLL-SECONDS": self._poll_seconds,
             }
             request_data = json.dumps(self._payload).encode()
             request = Request(
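For orientation, a minimal sketch of the long-poll request shape implied by the two headers added to `submit()`. The endpoint, API key, and payload below are placeholders; the real method builds the payload from the task spec and wraps the call in the retry and error handling shown above:

```python
import json
from urllib.request import Request

# Placeholder values; the real ones live on the Job instance.
ngc_api_key = "NGC_API_KEY_PLACEHOLDER"
poll_seconds = "3600"
payload = {"command": ["echo", "hello"], "env": {}}

headers = {
    "Authorization": f"Bearer {ngc_api_key}",
    "Content-Type": "application/json",
    # New in this release: opt in to gateway timeouts and bound each poll window.
    "nvcf-feature-enable-gateway-timeout": "true",
    "NVCF-POLL-SECONDS": poll_seconds,
}
request = Request(
    "https://example.invalid/v2/nvcf/pexec/functions/FUNCTION_ID",  # placeholder URL
    data=json.dumps(payload).encode(),
    headers=headers,
)
print(request.headers)  # inspect the constructed headers; nothing is sent
```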
@@ -239,12 +274,9 @@ class Job(object):
                 self._status = JobStatus.SUBMITTED
             else:
                 self._status = JobStatus.FAILED
-
-
-
-            # without that, branching over concurrent requests causes error.
-            self._state = JobStatus.FAILED
-            raise e
+        except URLError:
+            self._status = JobStatus.FAILED
+            raise

     @property
     def status(self):
@@ -272,12 +304,15 @@ class Job(object):
     def result(self):
         return self._result

-    @retry_on_status(status_codes=
+    @retry_on_status(status_codes=[500], max_retries=3, delay=5)
+    @retry_on_status(status_codes=[504])
     def _poll(self):
         try:
             headers = {
                 "Authorization": f"Bearer {self._ngc_api_key}",
                 "Content-Type": "application/json",
+                "nvcf-feature-enable-gateway-timeout": "true",
+                "NVCF-POLL-SECONDS": self._poll_seconds,
             }
             request = Request(
                 f"{result_endpoint}/{self._invocation_id}", headers=headers
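A brief note on the stacked decorators above: the decorator written closest to `def _poll` wraps first, so 504s are handled by the queue-timeout variant before the outer `[500]` handler ever sees an exception. A toy sketch of that ordering (generic Python semantics, not the package's code):

```python
from functools import wraps


def tag(label):
    # Toy decorator that prints which wrapper runs first.
    def decorator(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            print(f"enter {label}")
            return func(*args, **kwargs)
        return wrapper
    return decorator


@tag("outer: retries 500s")
@tag("inner: retries 504s")
def poll():
    print("poll body")


poll()
# enter outer: retries 500s
# enter inner: retries 504s
# poll body
```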
@@ -296,11 +331,6 @@ class Job(object):
                     f"[@nvidia] Unexpected response code: {response.getcode()}. Please notify an Outerbounds support engineer if this error persists."
                 )
                 self._status = JobStatus.FAILED
-        # 4xx and 5xx responses go in 'except' block
-        except HTTPError as e:
-            if e.code not in RETRIABLE_STATUS_CODES:
-                self._status = JobStatus.FAILED
-            raise
         except URLError:
             self._status = JobStatus.FAILED
             raise
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py
@@ -111,6 +111,9 @@ def kill(ctx, run_id):
 @click.argument("code-package-url")
 @click.option("--function-id", help="NVCF function id.")
 @click.option("--ngc-api-key", help="NGC API key.")
+@click.option(
+    "--queue-timeout", default=5 * 24 * 3600, help="Queue timeout in seconds."
+)
 @click.option("--run-id", help="Passed to the top-level 'step'.")
 @click.option("--task-id", help="Passed to the top-level 'step'.")
 @click.option("--input-paths", help="Passed to the top-level 'step'.")
@@ -133,6 +136,7 @@ def step(
     code_package_url,
     function_id,
     ngc_api_key,
+    queue_timeout,
     **kwargs,
 ):
     def echo(msg, stream="stderr", _id=None, **kwargs):
@@ -249,6 +253,7 @@ def step(
         ctx.obj.environment,
         function_id,
         ngc_api_key,
+        queue_timeout,
     )
     try:
         with ctx.obj.monitor.measure("metaflow.nvcf.launch_job"):
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py
@@ -20,6 +20,7 @@ from .exceptions import (
     UnsupportedNvcfConfigurationException,
     UnsupportedNvcfDatastoreException,
     NvcfTimeoutTooShortException,
+    NvcfQueueTimeoutTooShortException,
 )

 from metaflow.metaflow_config import SERVICE_URL
@@ -36,10 +37,16 @@ class NvcfDecorator(StepDecorator):
         Number of GPUs to use.
     gpu_type : str
         Type of Nvidia GPU to use.
+    queue_timeout : int
+        Time to keep the job in NVCF's queue.
     """

     name = "nvidia"
-    defaults = {
+    defaults = {
+        "gpu": 1,
+        "gpu_type": None,
+        "queue_timeout": 5 * 24 * 3600,  # Default 5 days in seconds
+    }

     package_url = None
     package_sha = None
@@ -118,6 +125,10 @@ class NvcfDecorator(StepDecorator):
             )
         self.attributes["function_id"] = available_configurations[desired_configuration]

+        queue_timeout = self.attributes["queue_timeout"]
+        if not isinstance(queue_timeout, int) or queue_timeout < 60:
+            raise NvcfQueueTimeoutTooShortException(step)
+
     def runtime_init(self, flow, graph, package, run_id):
         # Set some more internal state.
         self.flow = flow
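For orientation, a hedged sketch of what the new attribute looks like from a flow author's point of view. The flow itself is illustrative, and the top-level import of `nvidia` is assumed to be provided by the extension's toplevel aliases; the decorator name, the `queue_timeout` attribute, its 5-day default, and the 60-second minimum enforced above are taken from this diff:

```python
# Assumes the extension exposes the decorator as `from metaflow import nvidia`.
from metaflow import FlowSpec, step, nvidia


class HelloNvcfFlow(FlowSpec):
    # 2 hours in the NVCF queue; omitting queue_timeout uses the 5-day default,
    # and values under 60 seconds raise NvcfQueueTimeoutTooShortException.
    @nvidia(gpu=1, queue_timeout=2 * 3600)
    @step
    def start(self):
        print("running on NVCF")
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    HelloNvcfFlow()
```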
@@ -144,6 +155,7 @@ class NvcfDecorator(StepDecorator):
         cli_options = {
             "function_id": self.attributes["function_id"],
             "ngc_api_key": self.attributes["ngc_api_key"],
+            "queue_timeout": self.attributes["queue_timeout"],
         }
         cli_args.command_options.update(cli_options)
         cli_args.entrypoint[0] = sys.executable
ob_metaflow_extensions.egg-info/SOURCES.txt
@@ -6,10 +6,6 @@ metaflow_extensions/outerbounds/config/__init__.py
 metaflow_extensions/outerbounds/plugins/__init__.py
 metaflow_extensions/outerbounds/plugins/auth_server.py
 metaflow_extensions/outerbounds/plugins/perimeters.py
-metaflow_extensions/outerbounds/plugins/apps/__init__.py
-metaflow_extensions/outerbounds/plugins/apps/app_utils.py
-metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py
-metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py
 metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py
 metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py
 metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py
setup.py
@@ -2,7 +2,7 @@ from setuptools import setup, find_namespace_packages
 from pathlib import Path


-version = "1.1.123rc3"
+version = "1.1.125"
 this_directory = Path(__file__).parent
 long_description = (this_directory / "README.md").read_text()

@@ -18,6 +18,6 @@ setup(
     install_requires=[
         "boto3",
         "kubernetes",
-        "ob-metaflow == 2.13.
+        "ob-metaflow == 2.13.9.1",
     ],
 )
ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/apps/app_utils.py DELETED
@@ -1,170 +0,0 @@
-from metaflow.exception import MetaflowException
-import os
-from metaflow.metaflow_config_funcs import init_config
-import requests
-import time
-import random
-
-# IMPORTANT: Currently contents of this file are mostly duplicated from the outerbounds package.
-# This is purely due to the time rush of having to deliver this feature. As a fast forward, we
-# will reorganize things in a way that the amount of duplication in minimum.
-
-
-APP_READY_POLL_TIMEOUT_SECONDS = 300
-# Even after our backend validates that the app routes are ready, it takes a few seconds for
-# the app to be accessible via the browser. Till we hunt down this delay, add an extra buffer.
-APP_READY_EXTRA_BUFFER_SECONDS = 30
-
-
-def start_app(port=-1, name=""):
-    if len(name) == 0 or len(name) >= 20:
-        raise MetaflowException("App name should not be more than 20 characters long.")
-    elif not name.isalnum() or not name.islower():
-        raise MetaflowException(
-            "App name can only contain lowercase alphanumeric characters."
-        )
-
-    if "WORKSTATION_ID" not in os.environ:
-        raise MetaflowException(
-            "All outerbounds app commands can only be run from a workstation."
-        )
-
-    workstation_id = os.environ["WORKSTATION_ID"]
-
-    try:
-        try:
-            conf = init_config()
-            metaflow_token = conf["METAFLOW_SERVICE_AUTH_KEY"]
-            api_url = conf["OBP_API_SERVER"]
-
-            workstations_response = requests.get(
-                f"https://{api_url}/v1/workstations",
-                headers={"x-api-key": metaflow_token},
-            )
-            workstations_response.raise_for_status()
-        except:
-            raise MetaflowException("Failed to list workstations!")
-
-        workstations_json = workstations_response.json()["workstations"]
-        for workstation in workstations_json:
-            if workstation["instance_id"] == os.environ["WORKSTATION_ID"]:
-                if "named_ports" in workstation["spec"]:
-                    try:
-                        ensure_app_start_request_is_valid(
-                            workstation["spec"]["named_ports"], port, name
-                        )
-                    except ValueError as e:
-                        raise MetaflowException(str(e))
-
-                    for named_port in workstation["spec"]["named_ports"]:
-                        if int(named_port["port"]) == port:
-                            if named_port["enabled"] and named_port["name"] == name:
-                                print(f"App {name} started on port {port}!")
-                                print(
-                                    f"Browser URL: https://{api_url.replace('api', 'ui')}/apps/{os.environ['WORKSTATION_ID']}/{name}/"
-                                )
-                                print(
-                                    f"API URL: https://{api_url}/apps/{os.environ['WORKSTATION_ID']}/{name}/"
-                                )
-                                return
-                            else:
-                                try:
-                                    response = requests.put(
-                                        f"https://{api_url}/v1/workstations/update/{workstation_id}/namedports",
-                                        headers={"x-api-key": metaflow_token},
-                                        json={
-                                            "port": port,
-                                            "name": name,
-                                            "enabled": True,
-                                        },
-                                    )
-
-                                    response.raise_for_status()
-                                    poll_success = wait_for_app_port_to_be_accessible(
-                                        api_url,
-                                        metaflow_token,
-                                        workstation_id,
-                                        name,
-                                        APP_READY_POLL_TIMEOUT_SECONDS,
-                                    )
-                                    if poll_success:
-                                        print(f"App {name} started on port {port}!")
-                                        print(
-                                            f"Browser URL: https://{api_url.replace('api', 'ui')}/apps/{os.environ['WORKSTATION_ID']}/{name}/"
-                                        )
-                                        print(
-                                            f"API URL: https://{api_url}/apps/{os.environ['WORKSTATION_ID']}/{name}/"
-                                        )
-                                    else:
-                                        raise MetaflowException(
-                                            f"The app could not be deployed in {APP_READY_POLL_TIMEOUT_SECONDS / 60} minutes. Please try again later."
-                                        )
-                                except Exception:
-                                    raise MetaflowException(
-                                        f"Failed to start app {name} on port {port}!"
-                                    )
-    except Exception as e:
-        raise MetaflowException(f"Failed to start app {name} on port {port}!")
-
-
-def ensure_app_start_request_is_valid(existing_named_ports, port: int, name: str):
-    existing_apps_by_port = {np["port"]: np for np in existing_named_ports}
-
-    if port not in existing_apps_by_port:
-        raise MetaflowException(f"Port {port} not found on workstation")
-
-    for existing_named_port in existing_named_ports:
-        if (
-            name == existing_named_port["name"]
-            and existing_named_port["port"] != port
-            and existing_named_port["enabled"]
-        ):
-            raise MetaflowException(
-                f"App with name '{name}' is already deployed on port {existing_named_port['port']}"
-            )
-
-
-def wait_for_app_port_to_be_accessible(
-    api_url, metaflow_token, workstation_id, app_name, poll_timeout_seconds
-) -> bool:
-    num_retries_per_request = 3
-    start_time = time.time()
-    retry_delay = 1.0
-    poll_interval = 10
-    wait_message = f"App {app_name} is currently being deployed..."
-    while time.time() - start_time < poll_timeout_seconds:
-        for _ in range(num_retries_per_request):
-            try:
-                workstations_response = requests.get(
-                    f"https://{api_url}/v1/workstations",
-                    headers={"x-api-key": metaflow_token},
-                )
-                workstations_response.raise_for_status()
-                if is_app_ready(workstations_response.json(), workstation_id, app_name):
-                    print(wait_message)
-                    time.sleep(APP_READY_EXTRA_BUFFER_SECONDS)
-                    return True
-                else:
-                    print(wait_message)
-                    time.sleep(poll_interval)
-            except (
-                requests.exceptions.ConnectionError,
-                requests.exceptions.ReadTimeout,
-            ):
-                time.sleep(retry_delay)
-                retry_delay *= 2  # Double the delay for the next attempt
-                retry_delay += random.uniform(0, 1)  # Add jitter
-                retry_delay = min(retry_delay, 10)
-    return False
-
-
-def is_app_ready(response_json: dict, workstation_id: str, app_name: str) -> bool:
-    """Checks if the app is ready in the given workstation's response."""
-    workstations = response_json.get("workstations", [])
-    for workstation in workstations:
-        if workstation.get("instance_id") == workstation_id:
-            hosted_apps = workstation.get("status", {}).get("hosted_apps", [])
-            for hosted_app in hosted_apps:
-                if hosted_app.get("name") == app_name:
-                    return bool(hosted_app.get("ready"))
-    return False
ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py DELETED
@@ -1,154 +0,0 @@
-from metaflow.exception import MetaflowException
-from metaflow.decorators import StepDecorator
-from metaflow import current
-from .app_utils import start_app
-from .supervisord_utils import SupervisorClient, SupervisorClientException
-import os
-import random
-import string
-import tempfile
-import sys
-
-DEFAULT_WAIT_TIME_SECONDS_FOR_PROCESS_TO_START = 10
-BASE_DIR_FOR_APP_ASSETS = "/home/ob-workspace/.appdaemon/apps/"
-
-
-class WorkstationAppDeployDecorator(StepDecorator):
-    """
-    Specifies that this step is used to deploy an instance of the app.
-    Requires that self.app_name, self.app_port, self.entrypoint and self.deployDir is set.
-
-    Parameters
-    ----------
-    app_port : int
-        Number of GPUs to use.
-    app_name : str
-        Name of the app to deploy.
-    """
-
-    name = "app_deploy"
-    defaults = {"app_port": 8080, "app_name": "app"}
-
-    def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
-        if any([deco.name == "kubernetes" for deco in decos]):
-            raise MetaflowException(
-                "@app_deploy decorator is only supported locally and does not work with remote execution environments like @kubernetes, @nvidia."
-            )
-
-        # We always need to have some environment defined through the flow to deploy and app.
-        # Which means either step decorators like @pypi / @conda must be defined.
-        # or flow level decorators like @conda_base / @pypi_base.
-        if not any([deco.name == "pypi" or deco.name == "conda" for deco in decos]):
-            flow_decorators = flow._flow_decorators.keys()
-            if (
-                "conda_base" not in flow_decorators
-                and "pypi_base" not in flow_decorators
-            ):
-                raise MetaflowException(
-                    "@app_deploy requires either step decorators like @pypi / @conda or flow level decorators like @conda_base / @pypi_base to be defined."
-                )
-
-        app_port = self.attributes["app_port"]
-        app_name = self.attributes["app_name"]
-
-        # Currently this decorator is expected to only execute on workstation.
-        if app_port is None or app_port < 6000 or app_port > 6002:
-            raise MetaflowException(
-                "AppDeployDecorator requires app_port to be between 6000 and 6002."
-            )
-
-        if app_name is None:
-            raise MetaflowException("AppDeployDecorator requires app_name to be set.")
-
-    def task_pre_step(
-        self,
-        step_name,
-        task_datastore,
-        metadata,
-        run_id,
-        task_id,
-        flow,
-        graph,
-        retry_count,
-        max_user_code_retries,
-        ubf_context,
-        inputs,
-    ):
-        os.makedirs(BASE_DIR_FOR_APP_ASSETS, exist_ok=True)
-        # First we want to create a directory where the user's app directory and artifacts can be stored.
-        with tempfile.TemporaryDirectory(
-            prefix=BASE_DIR_FOR_APP_ASSETS, delete=False
-        ) as temp_dir:
-            launch_temp_dir = temp_dir
-
-        # Expose this to the user, so that they can use it write their artifacts.
-        setattr(flow, "deploy_dir", launch_temp_dir)
-
-        # Make sure to record deploy_dir so that the user cannot accidentally override it.
-        self._deploy_dir = launch_temp_dir
-
-    def task_post_step(
-        self, step_name, flow, graph, retry_count, max_user_code_retries
-    ):
-        deploy_dir = self._deploy_dir
-
-        # By default we assume that the user has a __main__.py file in their app directory.
-        # They can always override this behavior.
-        user_provided_entrypoint = getattr(flow, "entrypoint", None)
-
-        if user_provided_entrypoint is not None and not isinstance(
-            user_provided_entrypoint, str
-        ):
-            raise MetaflowException(
-                f"@app_deploy requires entrypoint to be set to a string. The current value of entrypoint {user_provided_entrypoint} is not valid."
-            )
-
-        flow_directory = os.path.dirname(os.path.abspath(sys.argv[0]))
-
-        # By default, we assume that the layout of the flow directory is:
-        # flow_dir/
-        #   - deployer_flow.py
-        #   - my_custom_app/
-        #     - __main__.py
-        #     - other_files
-        #     - other_dirs/
-        # This can be overridden by the user by setting the app_dir attribute.
-        # None of this matters if the user provides a custom entrypoint, since in that case we don't copy
-        # anything anywhere.
-        app_location = getattr(
-            flow, "app_dir", os.path.join(flow_directory, self.attributes["app_name"])
-        )
-
-        if user_provided_entrypoint is None and not os.path.exists(app_location):
-            raise MetaflowException(f"App directory {app_location} does not exist.")
-
-        wait_time_for_app_start = getattr(
-            flow,
-            "wait_time_for_app_start",
-            DEFAULT_WAIT_TIME_SECONDS_FOR_PROCESS_TO_START,
-        )
-
-        try:
-            supervisor_client = SupervisorClient(
-                wait_time_seconds_for_app_start=wait_time_for_app_start
-            )
-
-            # First, let's deploy the app.
-            start_app(
-                port=self.attributes["app_port"], name=self.attributes["app_name"]
-            )
-
-            # Now, let's add the app to supervisor.
-            supervisor_client.start_process_with_supervisord(
-                self.attributes["app_name"],
-                self.attributes["app_port"],
-                user_provided_entrypoint,
-                deploy_dir,
-                app_location,
-            )
-        except SupervisorClientException as e:
-            raise MetaflowException(str(e))
-        except Exception as e:
-            raise MetaflowException(
-                f"Failed to start {self.attributes['app_name']}! Cause: {str(e)}"
-            ) from e
ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py DELETED
@@ -1,239 +0,0 @@
-import argparse
-import os
-import configparser
-import tempfile
-import sys
-import subprocess
-from pathlib import Path
-import shutil
-from enum import Enum
-import time
-
-
-class SupervisorClientException(Exception):
-    pass
-
-
-class SupervisorClient:
-    class SupervisodProcessCodes(Enum):
-        STOPPED = 0
-        STARTING = 10
-        RUNNING = 20
-        BACKOFF = 30
-        STOPPING = 40
-        EXITED = 100
-        FATAL = 200
-        UNKNOWN = 1000
-
-    def __init__(self, wait_time_seconds_for_app_start: int):
-        self.supervisor_conf_loc = os.environ.get("SUPERVISOR_CONF_PATH")
-
-        self.wait_time_seconds_for_app_start = wait_time_seconds_for_app_start
-        if self.supervisor_conf_loc is None or not os.path.exists(
-            self.supervisor_conf_loc
-        ):
-            raise SupervisorClientException(
-                "This workstation does not support deploying apps! Please reach out to Outerbounds for support."
-            )
-
-        self.metaflow_envs_persistent_path = os.environ.get(
-            "SUPERVISOR_PYTHON_ENVS_PATH"
-        )
-        if self.metaflow_envs_persistent_path is None:
-            raise SupervisorClientException(
-                "This workstation does not support deploying apps! Please reach out to Outerbounds for support."
-            )
-
-        # Check if supervisorctl is installed
-        if not shutil.which("supervisorctl"):
-            raise SupervisorClientException(
-                "This workstation does not support deploying apps! Please reach out to Outerbounds for support."
-            )
-
-    def _stop_existing_app_at_port(self, app_port):
-        supervisor_config = configparser.ConfigParser()
-        supervisor_config.read(self.supervisor_conf_loc)
-
-        for program in supervisor_config.sections():
-            if "obp_app_port" in supervisor_config[program]:
-                if supervisor_config[program]["obp_app_port"].strip() == str(app_port):
-                    res = subprocess.run(
-                        ["supervisorctl", "stop", program],
-                        stdout=subprocess.DEVNULL,
-                        stderr=subprocess.DEVNULL,
-                    )
-
-                    del supervisor_config[program]
-
-        with tempfile.NamedTemporaryFile(
-            "w", dir=os.path.dirname(self.supervisor_conf_loc), delete=False
-        ) as f:
-            supervisor_config.write(f)
-            tmp_file = f.name
-
-        os.rename(tmp_file, self.supervisor_conf_loc)
-
-    def start_process_with_supervisord(
-        self,
-        app_name,
-        app_port,
-        user_provided_entrypoint,
-        deploy_dir=None,
-        app_dir=None,
-    ):
-        """
-        Add a new program entry to supervisor configuration.
-
-        Args:
-            app_name: The name of the app to start.
-            entrypoint: The entrypoint to start the app with.
-            directory: The directory to run the app in.
-            deploy_dir: The directory to copy the app to and deploy from.
-            app_dir: The directory to copy the app from.
-        """
-
-        entrypoint = user_provided_entrypoint
-        deploy_dir_for_port = "/home/ob-workspace/.appdaemon/apps/6000"
-        launch_directory = (
-            "/home/ob-workspace/.appdaemon/apps"
-            if entrypoint is None
-            else "/home/ob-workspace/.appdaemon"
-        )
-
-        # Step 1: Stop any existing apps that are running on the same port.
-        self._stop_existing_app_at_port(app_port)
-
-        if user_provided_entrypoint is None:
-            # Step 2: Copy the app_dir to the deploy_dir.
-            recursive_copy(app_dir, deploy_dir)
-
-            # Step 3: Copy the entire deploy_dir to the port specific directory.
-            if os.path.exists(deploy_dir_for_port):
-                shutil.rmtree(deploy_dir_for_port)
-
-            os.makedirs(deploy_dir_for_port)
-            recursive_copy(deploy_dir, deploy_dir_for_port)
-
-            # Apply default value
-            entrypoint = f"-m {str(app_port)}"
-
-        shutil.rmtree(deploy_dir)
-
-        persistent_path_for_executable = (
-            self.persist_metaflow_generated_python_environment()
-        )
-
-        command = f"{persistent_path_for_executable} {entrypoint}"
-
-        entry = {
-            "command": command,
-            "directory": launch_directory,
-            "autostart": "true",
-            "autorestart": "true",
-            "obp_app_port": app_port,  # Record the app port for internal reference. This is not used by supervisor.
-        }
-
-        supervisor_config = configparser.ConfigParser()
-        supervisor_config.read(self.supervisor_conf_loc)
-
-        supervisor_config[f"program:{app_name}"] = entry
-
-        with tempfile.NamedTemporaryFile(
-            "w", dir=os.path.dirname(self.supervisor_conf_loc), delete=False
-        ) as f:
-            supervisor_config.write(f)
-            tmp_file = f.name
-
-        os.rename(tmp_file, self.supervisor_conf_loc)
-
-        # Execute supervisorctl reload
-        # Capture the exit code
-        exit_code = subprocess.run(
-            ["supervisorctl", "reload"],
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.DEVNULL,
-        ).returncode
-        if exit_code != 0:
-            print("Failed to reload supervisor configuration!", file=sys.stderr)
-            return
-
-        print(
-            f"Waiting for {self.wait_time_seconds_for_app_start} seconds for {app_name} to start..."
-        )
-        time.sleep(self.wait_time_seconds_for_app_start)
-        status = self._get_launched_prcoess_status(app_name)
-
-        if status not in [
-            self.SupervisodProcessCodes.RUNNING,
-            self.SupervisodProcessCodes.STARTING,
-        ]:
-            raise SupervisorClientException(
-                f"Failed to start {app_name}! Try running {command} manually to debug."
-            )
-
-    def _get_launched_prcoess_status(self, app_name):
-        status_cmd_output = subprocess.run(
-            ["supervisorctl", "status", app_name],
-            stdout=subprocess.PIPE,
-            stderr=subprocess.PIPE,
-        ).stdout.decode("utf-8")
-
-        status_cmd_output_parts = [
-            x.strip() for x in status_cmd_output.split(" ") if x.strip()
-        ]
-
-        status_str = status_cmd_output_parts[1]
-
-        if status_str == "RUNNING":
-            return self.SupervisodProcessCodes.RUNNING
-        elif status_str == "STOPPED":
-            return self.SupervisodProcessCodes.STOPPED
-        elif status_str == "STARTING":
-            return self.SupervisodProcessCodes.STARTING
-        elif status_str == "BACKOFF":
-            return self.SupervisodProcessCodes.BACKOFF
-        elif status_str == "STOPPING":
-            return self.SupervisodProcessCodes.STOPPING
-        elif status_str == "EXITED":
-            return self.SupervisodProcessCodes.EXITED
-        elif status_str == "FATAL":
-            return self.SupervisodProcessCodes.FATAL
-        else:
-            return self.SupervisodProcessCodes.UNKNOWN
-
-    # By default, an environment generated by metaflow will end up in a path like: /root/micromamba/envs/metaflow/linux-64/02699a4d2d50cfc/bin/python
-    # However, on a workstation these environments are not persisted, so we need to copy them over to /home/ob-workspace
-    def persist_metaflow_generated_python_environment(self):
-        current_executable = sys.executable
-        environment_path = Path(current_executable).parent.parent
-
-        persistent_path_for_this_environment = os.path.join(
-            self.metaflow_envs_persistent_path,
-            environment_path.parent.name,
-            environment_path.name,
-        )
-
-        final_executable_path = os.path.join(
-            persistent_path_for_this_environment,
-            Path(current_executable).parent.name,
-            Path(current_executable).name,
-        )
-
-        if os.path.exists(final_executable_path):
-            return final_executable_path
-
-        os.makedirs(persistent_path_for_this_environment, exist_ok=True)
-
-        recursive_copy(environment_path, persistent_path_for_this_environment)
-
-        return final_executable_path
-
-
-def recursive_copy(src, dst):
-    for item in os.listdir(src):
-        s = os.path.join(src, item)
-        d = os.path.join(dst, item)
-        if os.path.isdir(s):
-            shutil.copytree(s, d, dirs_exist_ok=True)
-        else:
-            shutil.copy2(s, d)
ob-metaflow-extensions-1.1.123rc3/metaflow_extensions/outerbounds/plugins/snowpark/__init__.py DELETED (empty file; no content to show)