ob-metaflow-extensions 1.1.150__py2.py3-none-any.whl → 1.1.151__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- metaflow_extensions/outerbounds/plugins/__init__.py +2 -0
- metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +262 -0
- metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
- metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
- {ob_metaflow_extensions-1.1.150.dist-info → ob_metaflow_extensions-1.1.151.dist-info}/METADATA +1 -1
- {ob_metaflow_extensions-1.1.150.dist-info → ob_metaflow_extensions-1.1.151.dist-info}/RECORD +12 -5
- {ob_metaflow_extensions-1.1.150.dist-info → ob_metaflow_extensions-1.1.151.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.150.dist-info → ob_metaflow_extensions-1.1.151.dist-info}/top_level.txt +0 -0
|
@@ -312,11 +312,13 @@ class ObpGcpAuthProvider(object):
|
|
|
312
312
|
GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
|
|
313
313
|
CLIS_DESC = [
|
|
314
314
|
("nvidia", ".nvcf.nvcf_cli.cli"),
|
|
315
|
+
("nvct", ".nvct.nvct_cli.cli"),
|
|
315
316
|
("fast-bakery", ".fast_bakery.fast_bakery_cli.cli"),
|
|
316
317
|
("snowpark", ".snowpark.snowpark_cli.cli"),
|
|
317
318
|
]
|
|
318
319
|
STEP_DECORATORS_DESC = [
|
|
319
320
|
("nvidia", ".nvcf.nvcf_decorator.NvcfDecorator"),
|
|
321
|
+
("nvct", ".nvct.nvct_decorator.NvctDecorator"),
|
|
320
322
|
(
|
|
321
323
|
"fast_bakery_internal",
|
|
322
324
|
".fast_bakery.fast_bakery_decorator.InternalFastBakeryDecorator",
|
|
File without changes
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
from metaflow.exception import MetaflowException
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
class NvctExecutionException(MetaflowException):
|
|
5
|
+
headline = "Nvct task couldn't be executed"
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
class NvctTaskFailedException(MetaflowException):
|
|
9
|
+
headline = "Nvct task failed"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class NvctKilledException(MetaflowException):
|
|
13
|
+
headline = "Nvct job killed"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RequestedGPUTypeUnavailableException(MetaflowException):
|
|
17
|
+
headline = "[@nvct RequestedGPUTypeUnavailableException] GPU type unavailable."
|
|
18
|
+
|
|
19
|
+
def __init__(self, requested_gpu_type, available_gpus):
|
|
20
|
+
msg = (
|
|
21
|
+
f"The requested GPU type @nvct(..., gpu_type='{requested_gpu_type}') is not available. "
|
|
22
|
+
f"Please choose from the following supported GPU types when using @nvct: {available_gpus}"
|
|
23
|
+
)
|
|
24
|
+
super(RequestedGPUTypeUnavailableException, self).__init__(msg)
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class UnsupportedNvctConfigurationException(MetaflowException):
|
|
28
|
+
headline = (
|
|
29
|
+
"[@nvct UnsupportedNvctConfigurationException] Unsupported GPU configuration"
|
|
30
|
+
)
|
|
31
|
+
|
|
32
|
+
def __init__(self, n_gpu, gpu_type, available_configurations, step):
|
|
33
|
+
msg = f"The requested configuration of @nvct(gpu={n_gpu}, gpu_type='{gpu_type}') for @step {step} is not available."
|
|
34
|
+
if len(available_configurations) == 0:
|
|
35
|
+
msg += (
|
|
36
|
+
"\n\nNo configurations are available in your Outerbounds deployment."
|
|
37
|
+
" Please contact Outerbounds support if you wish to use @nvct."
|
|
38
|
+
)
|
|
39
|
+
else:
|
|
40
|
+
msg += f"\n\nAvailable configurations for your deployment with {gpu_type} include: \n\t- {self._display(gpu_type, available_configurations)}"
|
|
41
|
+
msg += "\n\nPlease contact Outerbounds support if you wish to use a configuration not listed above."
|
|
42
|
+
super(UnsupportedNvctConfigurationException, self).__init__(msg)
|
|
43
|
+
|
|
44
|
+
def _display(self, gpu_type, configs):
|
|
45
|
+
_available_decos = []
|
|
46
|
+
for cfg in configs:
|
|
47
|
+
n_gpu = cfg["n_gpus"]
|
|
48
|
+
_available_decos.append(f"@nvct(gpu={n_gpu}, gpu_type='{gpu_type}')")
|
|
49
|
+
return "\n\t- ".join(_available_decos)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class UnsupportedNvctDatastoreException(MetaflowException):
|
|
53
|
+
headline = "[@nvct UnsupportedNvctDatastoreException] Unsupported datastore"
|
|
54
|
+
|
|
55
|
+
def __init__(self, ds_type):
|
|
56
|
+
msg = (
|
|
57
|
+
"The *@nvct* decorator requires --datastore=s3 or --datastore=azure or --datastore=gs at the moment."
|
|
58
|
+
f"Current datastore type: {ds_type}."
|
|
59
|
+
)
|
|
60
|
+
super(UnsupportedNvctDatastoreException, self).__init__(msg)
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class NvctTimeoutTooShortException(MetaflowException):
|
|
64
|
+
headline = "[@nvct NvctTimeoutTooShortException] Timeout too short"
|
|
65
|
+
|
|
66
|
+
def __init__(self, step):
|
|
67
|
+
msg = (
|
|
68
|
+
"The timeout for step *{step}* should be at least 60 seconds for "
|
|
69
|
+
"execution with @nvct".format(step=step)
|
|
70
|
+
)
|
|
71
|
+
super(NvctTimeoutTooShortException, self).__init__(msg)
|
|
@@ -0,0 +1,131 @@
|
|
|
1
|
+
import requests
|
|
2
|
+
from requests.adapters import HTTPAdapter, Retry
|
|
3
|
+
|
|
4
|
+
BASE_URL = "https://api.ngc.nvidia.com/v2/orgs/zhxkmsaasxhw/"
|
|
5
|
+
POLL_SEC = 1
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def _session(api_key):
|
|
9
|
+
s = requests.Session()
|
|
10
|
+
s.headers.update(
|
|
11
|
+
{
|
|
12
|
+
"Authorization": f"Bearer {api_key}",
|
|
13
|
+
"Content-Type": "application/json",
|
|
14
|
+
}
|
|
15
|
+
)
|
|
16
|
+
retry = Retry(total=5, backoff_factor=1.5, status_forcelist=[502, 503, 504])
|
|
17
|
+
s.mount("https://", HTTPAdapter(max_retries=retry))
|
|
18
|
+
return s
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _url(path):
|
|
22
|
+
return BASE_URL.rstrip("/") + path
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class NVCTClient:
|
|
26
|
+
def __init__(self, api_key):
|
|
27
|
+
self.sess = _session(api_key)
|
|
28
|
+
|
|
29
|
+
# TODO: Handle https://outerboundsco.slack.com/archives/C05QGNR4E06/p1745970955540289
|
|
30
|
+
def create(self, spec):
|
|
31
|
+
r = self.sess.post(_url("/nvct/tasks"), json=spec, timeout=30)
|
|
32
|
+
r.raise_for_status()
|
|
33
|
+
return r.json().get("task", {}).get("id")
|
|
34
|
+
|
|
35
|
+
def get(self, task_id):
|
|
36
|
+
r = self.sess.get(_url(f"/nvct/tasks/{task_id}"), timeout=30)
|
|
37
|
+
r.raise_for_status()
|
|
38
|
+
return r.json().get("task", {})
|
|
39
|
+
|
|
40
|
+
def cancel(self, task_id):
|
|
41
|
+
r = self.sess.post(_url(f"/nvct/tasks/{task_id}/cancel"), timeout=30)
|
|
42
|
+
r.raise_for_status()
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class NVCTRequest(object):
|
|
46
|
+
def __init__(self, name):
|
|
47
|
+
self._spec = {}
|
|
48
|
+
self._spec["name"] = name
|
|
49
|
+
self._spec["gpuSpecification"] = {}
|
|
50
|
+
self._spec["resultHandlingStrategy"] = "NONE"
|
|
51
|
+
self._spec["terminationGracePeriodDuration"] = "PT10M"
|
|
52
|
+
|
|
53
|
+
def container_image(self, image):
|
|
54
|
+
self._spec["containerImage"] = image
|
|
55
|
+
return self
|
|
56
|
+
|
|
57
|
+
def container_args(self, args):
|
|
58
|
+
self._spec["containerArgs"] = args
|
|
59
|
+
return self
|
|
60
|
+
|
|
61
|
+
def env(self, key, value):
|
|
62
|
+
env_list = self._spec.setdefault("containerEnvironment", [])
|
|
63
|
+
env_list.append({"key": key, "value": value})
|
|
64
|
+
return self
|
|
65
|
+
|
|
66
|
+
def gpu(self, gpu, instance_type, backend):
|
|
67
|
+
gpu_spec = self._spec["gpuSpecification"]
|
|
68
|
+
gpu_spec["gpu"] = gpu
|
|
69
|
+
gpu_spec["instanceType"] = instance_type
|
|
70
|
+
gpu_spec["backend"] = backend
|
|
71
|
+
return self
|
|
72
|
+
|
|
73
|
+
def max_runtime(self, iso_duration):
|
|
74
|
+
self._spec["maxRuntimeDuration"] = iso_duration
|
|
75
|
+
return self
|
|
76
|
+
|
|
77
|
+
def max_queued(self, iso_duration="PT72H"):
|
|
78
|
+
self._spec["maxQueuedDuration"] = iso_duration
|
|
79
|
+
return self
|
|
80
|
+
|
|
81
|
+
def termination_grace(self, iso_duration="PT10M"):
|
|
82
|
+
self._spec["terminationGracePeriodDuration"] = iso_duration
|
|
83
|
+
return self
|
|
84
|
+
|
|
85
|
+
def extra(self, key, value):
|
|
86
|
+
self._spec[key] = value
|
|
87
|
+
return self
|
|
88
|
+
|
|
89
|
+
def to_dict(self):
|
|
90
|
+
return self._spec
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class NVCTTask:
|
|
94
|
+
def __init__(self, client: NVCTClient, spec):
|
|
95
|
+
self.client = client
|
|
96
|
+
self.spec = spec
|
|
97
|
+
self.id = None
|
|
98
|
+
self.record = None
|
|
99
|
+
|
|
100
|
+
def submit(self):
|
|
101
|
+
self.id = self.client.create(self.spec)
|
|
102
|
+
return self.id
|
|
103
|
+
|
|
104
|
+
def cancel(self):
|
|
105
|
+
if not self.has_finished:
|
|
106
|
+
self.client.cancel(self.id)
|
|
107
|
+
|
|
108
|
+
@property
|
|
109
|
+
def status(self):
|
|
110
|
+
self.record = self.client.get(self.id)
|
|
111
|
+
return self.record["status"]
|
|
112
|
+
|
|
113
|
+
@property
|
|
114
|
+
def is_waiting(self):
|
|
115
|
+
return self.status == "QUEUED"
|
|
116
|
+
|
|
117
|
+
@property
|
|
118
|
+
def is_running(self):
|
|
119
|
+
return self.status in {"RUNNING", "LAUNCHED"}
|
|
120
|
+
|
|
121
|
+
@property
|
|
122
|
+
def has_failed(self):
|
|
123
|
+
return self.status in {"ERRORED", "CANCELED"}
|
|
124
|
+
|
|
125
|
+
@property
|
|
126
|
+
def has_succeeded(self):
|
|
127
|
+
return self.status == "COMPLETED"
|
|
128
|
+
|
|
129
|
+
@property
|
|
130
|
+
def has_finished(self):
|
|
131
|
+
return self.has_succeeded or self.has_failed
|
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
import traceback
|
|
6
|
+
|
|
7
|
+
from metaflow import util, Run
|
|
8
|
+
from metaflow._vendor import click
|
|
9
|
+
from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY
|
|
10
|
+
from metaflow.metadata_provider.util import sync_local_metadata_from_datastore
|
|
11
|
+
from metaflow.metaflow_config import (
|
|
12
|
+
CARD_S3ROOT,
|
|
13
|
+
DATASTORE_LOCAL_DIR,
|
|
14
|
+
DATASTORE_SYSROOT_S3,
|
|
15
|
+
DATATOOLS_S3ROOT,
|
|
16
|
+
DEFAULT_METADATA,
|
|
17
|
+
SERVICE_HEADERS,
|
|
18
|
+
SERVICE_URL,
|
|
19
|
+
DEFAULT_SECRETS_BACKEND_TYPE,
|
|
20
|
+
DEFAULT_AWS_CLIENT_PROVIDER,
|
|
21
|
+
AWS_SECRETS_MANAGER_DEFAULT_REGION,
|
|
22
|
+
S3_ENDPOINT_URL,
|
|
23
|
+
AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
|
|
24
|
+
DATASTORE_SYSROOT_AZURE,
|
|
25
|
+
CARD_AZUREROOT,
|
|
26
|
+
DATASTORE_SYSROOT_GS,
|
|
27
|
+
CARD_GSROOT,
|
|
28
|
+
KUBERNETES_SANDBOX_INIT_SCRIPT,
|
|
29
|
+
OTEL_ENDPOINT,
|
|
30
|
+
)
|
|
31
|
+
from metaflow.mflog import TASK_LOG_SOURCE
|
|
32
|
+
from .nvct_runner import NvctRunner
|
|
33
|
+
from .nvct import NVCTClient
|
|
34
|
+
from .utils import get_ngc_api_key
|
|
35
|
+
from .exceptions import NvctKilledException
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@click.group()
|
|
39
|
+
def cli():
|
|
40
|
+
pass
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
@cli.group(help="Commands related to nvct.")
|
|
44
|
+
def nvct():
|
|
45
|
+
pass
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
@nvct.command(help="List steps / tasks running as an nvct job.")
|
|
49
|
+
@click.option(
|
|
50
|
+
"--run-id",
|
|
51
|
+
default=None,
|
|
52
|
+
required=True,
|
|
53
|
+
help="List unfinished and running tasks corresponding to the run id.",
|
|
54
|
+
)
|
|
55
|
+
@click.pass_context
|
|
56
|
+
def list(ctx, run_id):
|
|
57
|
+
flow_name = ctx.obj.flow.name
|
|
58
|
+
run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
|
|
59
|
+
running_invocations = []
|
|
60
|
+
|
|
61
|
+
for each_step in run_obj:
|
|
62
|
+
for each_task in each_step:
|
|
63
|
+
if not each_task.finished and "nvct-task-id" in each_task.metadata_dict:
|
|
64
|
+
task_pathspec = each_task.pathspec
|
|
65
|
+
attempt = each_task.metadata_dict.get("attempt")
|
|
66
|
+
flow_name, run_id, step_name, task_id = task_pathspec.split("/")
|
|
67
|
+
running_invocations.append(
|
|
68
|
+
f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
|
|
69
|
+
)
|
|
70
|
+
|
|
71
|
+
if running_invocations:
|
|
72
|
+
for each_invocation in running_invocations:
|
|
73
|
+
ctx.obj.echo(each_invocation)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
@nvct.command(help="Cancel steps / tasks running as an nvct job.")
|
|
77
|
+
@click.option(
|
|
78
|
+
"--run-id",
|
|
79
|
+
default=None,
|
|
80
|
+
required=True,
|
|
81
|
+
help="Terminate unfinished tasks corresponding to the run id.",
|
|
82
|
+
)
|
|
83
|
+
@click.pass_context
|
|
84
|
+
def kill(ctx, run_id):
|
|
85
|
+
ngc_api_key = get_ngc_api_key()
|
|
86
|
+
nvct_client = NVCTClient(api_key=ngc_api_key)
|
|
87
|
+
|
|
88
|
+
flow_name = ctx.obj.flow.name
|
|
89
|
+
run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
|
|
90
|
+
tasks_cancelled = []
|
|
91
|
+
|
|
92
|
+
for each_step in run_obj:
|
|
93
|
+
for each_task in each_step:
|
|
94
|
+
if not each_task.finished and "nvct-task-id" in each_task.metadata_dict:
|
|
95
|
+
task_pathspec = each_task.pathspec
|
|
96
|
+
attempt = each_task.metadata_dict.get("attempt")
|
|
97
|
+
_, _, step_name, task_id = task_pathspec.split("/")
|
|
98
|
+
|
|
99
|
+
nvct_task_id = each_task.metadata_dict.get("nvct-task-id")
|
|
100
|
+
nvct_client.cancel(nvct_task_id)
|
|
101
|
+
|
|
102
|
+
tasks_cancelled.append(
|
|
103
|
+
f"[{nvct_task_id}] -- Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt} is cancelled."
|
|
104
|
+
)
|
|
105
|
+
|
|
106
|
+
if tasks_cancelled:
|
|
107
|
+
for each_cancelled_task in tasks_cancelled:
|
|
108
|
+
ctx.obj.echo(each_cancelled_task)
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
@nvct.command(
|
|
112
|
+
help="Execute a single task using @nvct. This command calls the "
|
|
113
|
+
"top-level step command inside an nvct job with the given options. "
|
|
114
|
+
"Typically you do not call this command directly; it is used internally by "
|
|
115
|
+
"Metaflow."
|
|
116
|
+
)
|
|
117
|
+
@click.argument("step-name")
|
|
118
|
+
@click.argument("code-package-sha")
|
|
119
|
+
@click.argument("code-package-url")
|
|
120
|
+
@click.option("--gpu-type", help="Type of Nvidia GPU to use.")
|
|
121
|
+
@click.option("--instance-type", help="Instance type to use.")
|
|
122
|
+
@click.option("--backend", help="Backend to use.")
|
|
123
|
+
@click.option("--ngc-api-key", help="NGC API key.")
|
|
124
|
+
@click.option("--run-id", help="Passed to the top-level 'step'.")
|
|
125
|
+
@click.option("--task-id", help="Passed to the top-level 'step'.")
|
|
126
|
+
@click.option("--input-paths", help="Passed to the top-level 'step'.")
|
|
127
|
+
@click.option("--split-index", help="Passed to the top-level 'step'.")
|
|
128
|
+
@click.option("--clone-path", help="Passed to the top-level 'step'.")
|
|
129
|
+
@click.option("--clone-run-id", help="Passed to the top-level 'step'.")
|
|
130
|
+
@click.option(
|
|
131
|
+
"--tag", multiple=True, default=None, help="Passed to the top-level 'step'."
|
|
132
|
+
)
|
|
133
|
+
@click.option("--namespace", default=None, help="Passed to the top-level 'step'.")
|
|
134
|
+
@click.option("--retry-count", default=0, help="Passed to the top-level 'step'.")
|
|
135
|
+
@click.option(
|
|
136
|
+
"--max-user-code-retries", default=0, help="Passed to the top-level 'step'."
|
|
137
|
+
)
|
|
138
|
+
@click.pass_context
|
|
139
|
+
def step(
|
|
140
|
+
ctx,
|
|
141
|
+
step_name,
|
|
142
|
+
code_package_sha,
|
|
143
|
+
code_package_url,
|
|
144
|
+
gpu_type,
|
|
145
|
+
instance_type,
|
|
146
|
+
backend,
|
|
147
|
+
ngc_api_key,
|
|
148
|
+
**kwargs,
|
|
149
|
+
):
|
|
150
|
+
def echo(msg, stream="stderr", _id=None, **kwargs):
|
|
151
|
+
msg = util.to_unicode(msg)
|
|
152
|
+
if _id:
|
|
153
|
+
msg = "[%s] %s" % (_id, msg)
|
|
154
|
+
ctx.obj.echo_always(msg, err=(stream == sys.stderr), **kwargs)
|
|
155
|
+
|
|
156
|
+
executable = ctx.obj.environment.executable(step_name)
|
|
157
|
+
entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))
|
|
158
|
+
|
|
159
|
+
top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))
|
|
160
|
+
|
|
161
|
+
input_paths = kwargs.get("input_paths")
|
|
162
|
+
split_vars = None
|
|
163
|
+
if input_paths:
|
|
164
|
+
max_size = 30 * 1024
|
|
165
|
+
split_vars = {
|
|
166
|
+
"METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
|
|
167
|
+
for i in range(0, len(input_paths), max_size)
|
|
168
|
+
}
|
|
169
|
+
kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())
|
|
170
|
+
|
|
171
|
+
step_args = " ".join(util.dict_to_cli_options(kwargs))
|
|
172
|
+
step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
|
|
173
|
+
entrypoint=entrypoint,
|
|
174
|
+
top_args=top_args,
|
|
175
|
+
step=step_name,
|
|
176
|
+
step_args=step_args,
|
|
177
|
+
)
|
|
178
|
+
node = ctx.obj.graph[step_name]
|
|
179
|
+
|
|
180
|
+
# Get retry information
|
|
181
|
+
retry_count = kwargs.get("retry_count", 0)
|
|
182
|
+
retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
|
|
183
|
+
minutes_between_retries = None
|
|
184
|
+
if retry_deco:
|
|
185
|
+
minutes_between_retries = int(
|
|
186
|
+
retry_deco[0].attributes.get("minutes_between_retries", 1)
|
|
187
|
+
)
|
|
188
|
+
|
|
189
|
+
task_spec = {
|
|
190
|
+
"flow_name": ctx.obj.flow.name,
|
|
191
|
+
"step_name": step_name,
|
|
192
|
+
"run_id": kwargs["run_id"],
|
|
193
|
+
"task_id": kwargs["task_id"],
|
|
194
|
+
"retry_count": str(retry_count),
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
env = {
|
|
198
|
+
"METAFLOW_CODE_SHA": code_package_sha,
|
|
199
|
+
"METAFLOW_CODE_URL": code_package_url,
|
|
200
|
+
"METAFLOW_CODE_DS": ctx.obj.flow_datastore.TYPE,
|
|
201
|
+
"METAFLOW_SERVICE_URL": SERVICE_URL,
|
|
202
|
+
"METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
|
|
203
|
+
"METAFLOW_DATASTORE_SYSROOT_S3": DATASTORE_SYSROOT_S3,
|
|
204
|
+
"METAFLOW_DATATOOLS_S3ROOT": DATATOOLS_S3ROOT,
|
|
205
|
+
"METAFLOW_DEFAULT_DATASTORE": ctx.obj.flow_datastore.TYPE,
|
|
206
|
+
"METAFLOW_USER": util.get_username(),
|
|
207
|
+
"METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
|
|
208
|
+
"METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
|
|
209
|
+
"METAFLOW_RUNTIME_ENVIRONMENT": "nvct",
|
|
210
|
+
"METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE": DEFAULT_SECRETS_BACKEND_TYPE,
|
|
211
|
+
"METAFLOW_DEFAULT_AWS_CLIENT_PROVIDER": DEFAULT_AWS_CLIENT_PROVIDER,
|
|
212
|
+
"METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION": AWS_SECRETS_MANAGER_DEFAULT_REGION,
|
|
213
|
+
"METAFLOW_S3_ENDPOINT_URL": S3_ENDPOINT_URL,
|
|
214
|
+
"METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT": AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
|
|
215
|
+
"METAFLOW_DATASTORE_SYSROOT_AZURE": DATASTORE_SYSROOT_AZURE,
|
|
216
|
+
"METAFLOW_CARD_AZUREROOT": CARD_AZUREROOT,
|
|
217
|
+
"METAFLOW_DATASTORE_SYSROOT_GS": DATASTORE_SYSROOT_GS,
|
|
218
|
+
"METAFLOW_CARD_GSROOT": CARD_GSROOT,
|
|
219
|
+
"METAFLOW_INIT_SCRIPT": KUBERNETES_SANDBOX_INIT_SCRIPT,
|
|
220
|
+
"METAFLOW_OTEL_ENDPOINT": OTEL_ENDPOINT,
|
|
221
|
+
"NVCT_CONTEXT": "1",
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
env_deco = [deco for deco in node.decorators if deco.name == "environment"]
|
|
225
|
+
if env_deco:
|
|
226
|
+
env.update(env_deco[0].attributes["vars"])
|
|
227
|
+
|
|
228
|
+
# Add the environment variables related to the input-paths argument
|
|
229
|
+
if split_vars:
|
|
230
|
+
env.update(split_vars)
|
|
231
|
+
|
|
232
|
+
if retry_count:
|
|
233
|
+
ctx.obj.echo_always(
|
|
234
|
+
"Sleeping %d minutes before the next retry" % minutes_between_retries
|
|
235
|
+
)
|
|
236
|
+
time.sleep(minutes_between_retries * 60)
|
|
237
|
+
|
|
238
|
+
# this information is needed for log tailing
|
|
239
|
+
ds = ctx.obj.flow_datastore.get_task_datastore(
|
|
240
|
+
mode="w",
|
|
241
|
+
run_id=kwargs["run_id"],
|
|
242
|
+
step_name=step_name,
|
|
243
|
+
task_id=kwargs["task_id"],
|
|
244
|
+
attempt=int(retry_count),
|
|
245
|
+
)
|
|
246
|
+
stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
|
|
247
|
+
stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")
|
|
248
|
+
|
|
249
|
+
def _sync_metadata():
|
|
250
|
+
if ctx.obj.metadata.TYPE == "local":
|
|
251
|
+
sync_local_metadata_from_datastore(
|
|
252
|
+
DATASTORE_LOCAL_DIR,
|
|
253
|
+
ctx.obj.flow_datastore.get_task_datastore(
|
|
254
|
+
kwargs["run_id"], step_name, kwargs["task_id"]
|
|
255
|
+
),
|
|
256
|
+
)
|
|
257
|
+
|
|
258
|
+
nvct = NvctRunner(
|
|
259
|
+
ctx.obj.metadata,
|
|
260
|
+
ctx.obj.flow_datastore,
|
|
261
|
+
ctx.obj.environment,
|
|
262
|
+
gpu_type,
|
|
263
|
+
instance_type,
|
|
264
|
+
backend,
|
|
265
|
+
ngc_api_key,
|
|
266
|
+
)
|
|
267
|
+
try:
|
|
268
|
+
with ctx.obj.monitor.measure("metaflow.nvct.launch_task"):
|
|
269
|
+
nvct.launch_task(
|
|
270
|
+
step_name,
|
|
271
|
+
step_cli,
|
|
272
|
+
task_spec,
|
|
273
|
+
code_package_sha,
|
|
274
|
+
code_package_url,
|
|
275
|
+
ctx.obj.flow_datastore.TYPE,
|
|
276
|
+
env=env,
|
|
277
|
+
)
|
|
278
|
+
except Exception:
|
|
279
|
+
traceback.print_exc()
|
|
280
|
+
_sync_metadata()
|
|
281
|
+
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
|
|
282
|
+
try:
|
|
283
|
+
nvct.wait_for_completion(stdout_location, stderr_location, echo=echo)
|
|
284
|
+
except NvctKilledException:
|
|
285
|
+
# don't retry killed tasks
|
|
286
|
+
traceback.print_exc()
|
|
287
|
+
sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
|
|
288
|
+
finally:
|
|
289
|
+
_sync_metadata()
|
|
@@ -0,0 +1,262 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from metaflow.exception import MetaflowException
|
|
5
|
+
from metaflow.decorators import StepDecorator
|
|
6
|
+
from metaflow.plugins.parallel_decorator import ParallelDecorator
|
|
7
|
+
from metaflow.metadata_provider.util import sync_local_metadata_to_datastore
|
|
8
|
+
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
|
|
9
|
+
from metaflow.sidecar import Sidecar
|
|
10
|
+
from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
|
|
11
|
+
from metaflow.metadata_provider import MetaDatum
|
|
12
|
+
|
|
13
|
+
from .utils import get_ngc_api_key
|
|
14
|
+
from .exceptions import (
|
|
15
|
+
UnsupportedNvctDatastoreException,
|
|
16
|
+
NvctTimeoutTooShortException,
|
|
17
|
+
RequestedGPUTypeUnavailableException,
|
|
18
|
+
UnsupportedNvctConfigurationException,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
DEFAULT_GPU_TYPE = "H100"
|
|
23
|
+
|
|
24
|
+
SUPPORTABLE_GPU_TYPES = {
|
|
25
|
+
"L40": [
|
|
26
|
+
{
|
|
27
|
+
"n_gpus": 1,
|
|
28
|
+
"instance_type": "gl40_1.br20_2xlarge",
|
|
29
|
+
"backend": "GFN",
|
|
30
|
+
},
|
|
31
|
+
],
|
|
32
|
+
"L40S": [
|
|
33
|
+
{
|
|
34
|
+
"n_gpus": 1,
|
|
35
|
+
"instance_type": "gl40s_1.br25_2xlarge",
|
|
36
|
+
"backend": "GFN",
|
|
37
|
+
},
|
|
38
|
+
],
|
|
39
|
+
"L40G": [
|
|
40
|
+
{
|
|
41
|
+
"n_gpus": 1,
|
|
42
|
+
"instance_type": "gl40g_1.br25_2xlarge",
|
|
43
|
+
"backend": "GFN",
|
|
44
|
+
},
|
|
45
|
+
],
|
|
46
|
+
"H100": [
|
|
47
|
+
{
|
|
48
|
+
"n_gpus": 1,
|
|
49
|
+
"instance_type": "GCP.GPU.H100_1x",
|
|
50
|
+
"backend": "gcp-asia-se-1a",
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"n_gpus": 2,
|
|
54
|
+
"instance_type": "GCP.GPU.H100_2x",
|
|
55
|
+
"backend": "gcp-asia-se-1a",
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
"n_gpus": 4,
|
|
59
|
+
"instance_type": "GCP.GPU.H100_4x",
|
|
60
|
+
"backend": "gcp-asia-se-1a",
|
|
61
|
+
},
|
|
62
|
+
{
|
|
63
|
+
"n_gpus": 8,
|
|
64
|
+
"instance_type": "GCP.GPU.H100_8x",
|
|
65
|
+
"backend": "gcp-asia-se-1a",
|
|
66
|
+
},
|
|
67
|
+
],
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
class NvctDecorator(StepDecorator):
|
|
72
|
+
|
|
73
|
+
"""
|
|
74
|
+
Specifies that this step should execute on DGX cloud.
|
|
75
|
+
|
|
76
|
+
Parameters
|
|
77
|
+
----------
|
|
78
|
+
gpu : int
|
|
79
|
+
Number of GPUs to use.
|
|
80
|
+
gpu_type : str
|
|
81
|
+
Type of Nvidia GPU to use.
|
|
82
|
+
"""
|
|
83
|
+
|
|
84
|
+
name = "nvct"
|
|
85
|
+
defaults = {
|
|
86
|
+
"gpu": 1,
|
|
87
|
+
"gpu_type": None,
|
|
88
|
+
"ngc_api_key": None,
|
|
89
|
+
"instance_type": None,
|
|
90
|
+
"backend": None,
|
|
91
|
+
}
|
|
92
|
+
|
|
93
|
+
package_url = None
|
|
94
|
+
package_sha = None
|
|
95
|
+
|
|
96
|
+
# Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
|
|
97
|
+
# to understand where these functions are invoked in the lifecycle of a
|
|
98
|
+
# Metaflow flow.
|
|
99
|
+
def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
|
|
100
|
+
# Executing NVCT functions requires a non-local datastore.
|
|
101
|
+
if flow_datastore.TYPE not in ("s3", "azure", "gs"):
|
|
102
|
+
raise UnsupportedNvctDatastoreException(flow_datastore.TYPE)
|
|
103
|
+
|
|
104
|
+
# Set internal state.
|
|
105
|
+
self.logger = logger
|
|
106
|
+
self.environment = environment
|
|
107
|
+
self.step = step
|
|
108
|
+
self.flow_datastore = flow_datastore
|
|
109
|
+
|
|
110
|
+
if any([deco.name == "kubernetes" for deco in decos]):
|
|
111
|
+
raise MetaflowException(
|
|
112
|
+
"Step *{step}* is marked for execution both on Kubernetes and "
|
|
113
|
+
"Nvct. Please use one or the other.".format(step=step)
|
|
114
|
+
)
|
|
115
|
+
if any([isinstance(deco, ParallelDecorator) for deco in decos]):
|
|
116
|
+
raise MetaflowException(
|
|
117
|
+
"Step *{step}* contains a @parallel decorator "
|
|
118
|
+
"with the @nvct decorator. @parallel decorators are not currently supported with @nvct.".format(
|
|
119
|
+
step=step
|
|
120
|
+
)
|
|
121
|
+
)
|
|
122
|
+
|
|
123
|
+
# Set run time limit for NVCT.
|
|
124
|
+
self.run_time_limit = get_run_time_limit_for_task(decos)
|
|
125
|
+
if self.run_time_limit < 60:
|
|
126
|
+
raise NvctTimeoutTooShortException(step)
|
|
127
|
+
|
|
128
|
+
self.attributes["ngc_api_key"] = get_ngc_api_key()
|
|
129
|
+
|
|
130
|
+
requested_gpu_type = self.attributes["gpu_type"]
|
|
131
|
+
requested_n_gpus = self.attributes["gpu"]
|
|
132
|
+
|
|
133
|
+
if requested_gpu_type is None:
|
|
134
|
+
requested_gpu_type = DEFAULT_GPU_TYPE
|
|
135
|
+
if requested_gpu_type not in SUPPORTABLE_GPU_TYPES:
|
|
136
|
+
raise RequestedGPUTypeUnavailableException(
|
|
137
|
+
requested_gpu_type, list(SUPPORTABLE_GPU_TYPES.keys())
|
|
138
|
+
)
|
|
139
|
+
|
|
140
|
+
valid_config = None
|
|
141
|
+
available_configurations = SUPPORTABLE_GPU_TYPES[requested_gpu_type]
|
|
142
|
+
for each_config in available_configurations:
|
|
143
|
+
if each_config["n_gpus"] == requested_n_gpus:
|
|
144
|
+
valid_config = each_config
|
|
145
|
+
break
|
|
146
|
+
|
|
147
|
+
if valid_config is None:
|
|
148
|
+
raise UnsupportedNvctConfigurationException(
|
|
149
|
+
requested_n_gpus,
|
|
150
|
+
requested_gpu_type,
|
|
151
|
+
available_configurations,
|
|
152
|
+
step,
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
self.attributes["instance_type"] = valid_config["instance_type"]
|
|
156
|
+
self.attributes["gpu_type"] = requested_gpu_type
|
|
157
|
+
self.attributes["backend"] = valid_config["backend"]
|
|
158
|
+
|
|
159
|
+
def runtime_init(self, flow, graph, package, run_id):
|
|
160
|
+
# Set some more internal state.
|
|
161
|
+
self.flow = flow
|
|
162
|
+
self.graph = graph
|
|
163
|
+
self.package = package
|
|
164
|
+
self.run_id = run_id
|
|
165
|
+
|
|
166
|
+
def runtime_task_created(
|
|
167
|
+
self, task_datastore, task_id, split_index, input_paths, is_cloned, ubf_context
|
|
168
|
+
):
|
|
169
|
+
if not is_cloned:
|
|
170
|
+
self._save_package_once(self.flow_datastore, self.package)
|
|
171
|
+
|
|
172
|
+
def runtime_step_cli(
|
|
173
|
+
self, cli_args, retry_count, max_user_code_retries, ubf_context
|
|
174
|
+
):
|
|
175
|
+
if retry_count <= max_user_code_retries:
|
|
176
|
+
# after all attempts to run the user code have failed, we don't need
|
|
177
|
+
# to execute on NVCF anymore. We can execute possible fallback
|
|
178
|
+
# code locally.
|
|
179
|
+
cli_args.commands = ["nvct", "step"]
|
|
180
|
+
cli_args.command_args.append(self.package_sha)
|
|
181
|
+
cli_args.command_args.append(self.package_url)
|
|
182
|
+
cli_options = {
|
|
183
|
+
"gpu_type": self.attributes["gpu_type"],
|
|
184
|
+
"instance_type": self.attributes["instance_type"],
|
|
185
|
+
"backend": self.attributes["backend"],
|
|
186
|
+
"ngc_api_key": self.attributes["ngc_api_key"],
|
|
187
|
+
}
|
|
188
|
+
cli_args.command_options.update(cli_options)
|
|
189
|
+
cli_args.entrypoint[0] = sys.executable
|
|
190
|
+
|
|
191
|
+
def task_pre_step(
|
|
192
|
+
self,
|
|
193
|
+
step_name,
|
|
194
|
+
task_datastore,
|
|
195
|
+
metadata,
|
|
196
|
+
run_id,
|
|
197
|
+
task_id,
|
|
198
|
+
flow,
|
|
199
|
+
graph,
|
|
200
|
+
retry_count,
|
|
201
|
+
max_retries,
|
|
202
|
+
ubf_context,
|
|
203
|
+
inputs,
|
|
204
|
+
):
|
|
205
|
+
self.metadata = metadata
|
|
206
|
+
self.task_datastore = task_datastore
|
|
207
|
+
|
|
208
|
+
# task_pre_step may run locally if fallback is activated for @catch
|
|
209
|
+
# decorator.
|
|
210
|
+
|
|
211
|
+
if "NVCT_CONTEXT" in os.environ:
|
|
212
|
+
meta = {}
|
|
213
|
+
|
|
214
|
+
meta["nvct-task-id"] = os.environ.get("NVCT_TASK_ID")
|
|
215
|
+
meta["nvct-task-name"] = os.environ.get("NVCT_TASK_NAME")
|
|
216
|
+
meta["nvct-ncaid"] = os.environ.get("NVCT_NCA_ID")
|
|
217
|
+
meta["nvct-progress-file-path"] = os.environ.get("NVCT_PROGRESS_FILE_PATH")
|
|
218
|
+
meta["nvct-results-dir"] = os.environ.get("NVCT_RESULTS_DIR")
|
|
219
|
+
|
|
220
|
+
entries = [
|
|
221
|
+
MetaDatum(
|
|
222
|
+
field=k,
|
|
223
|
+
value=v,
|
|
224
|
+
type=k,
|
|
225
|
+
tags=["attempt_id:{0}".format(retry_count)],
|
|
226
|
+
)
|
|
227
|
+
for k, v in meta.items()
|
|
228
|
+
if v is not None
|
|
229
|
+
]
|
|
230
|
+
# Register book-keeping metadata for debugging.
|
|
231
|
+
metadata.register_metadata(run_id, step_name, task_id, entries)
|
|
232
|
+
|
|
233
|
+
self._save_logs_sidecar = Sidecar("save_logs_periodically")
|
|
234
|
+
self._save_logs_sidecar.start()
|
|
235
|
+
|
|
236
|
+
def task_finished(
|
|
237
|
+
self, step_name, flow, graph, is_task_ok, retry_count, max_retries
|
|
238
|
+
):
|
|
239
|
+
# task_finished may run locally if fallback is activated for @catch
|
|
240
|
+
# decorator.
|
|
241
|
+
if "NVCT_CONTEXT" in os.environ:
|
|
242
|
+
# If `local` metadata is configured, we would need to copy task
|
|
243
|
+
# execution metadata from the NVCT container to user's
|
|
244
|
+
# local file system after the user code has finished execution.
|
|
245
|
+
# This happens via datastore as a communication bridge.
|
|
246
|
+
if hasattr(self, "metadata") and self.metadata.TYPE == "local":
|
|
247
|
+
sync_local_metadata_to_datastore(
|
|
248
|
+
DATASTORE_LOCAL_DIR, self.task_datastore
|
|
249
|
+
)
|
|
250
|
+
|
|
251
|
+
try:
|
|
252
|
+
self._save_logs_sidecar.terminate()
|
|
253
|
+
except:
|
|
254
|
+
# Best effort kill
|
|
255
|
+
pass
|
|
256
|
+
|
|
257
|
+
@classmethod
|
|
258
|
+
def _save_package_once(cls, flow_datastore, package):
|
|
259
|
+
if cls.package_url is None:
|
|
260
|
+
cls.package_url, cls.package_sha = flow_datastore.save_data(
|
|
261
|
+
[package.blob], len_hint=1
|
|
262
|
+
)[0]
|
|
@@ -0,0 +1,218 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import re
|
|
3
|
+
import time
|
|
4
|
+
import math
|
|
5
|
+
import shlex
|
|
6
|
+
import atexit
|
|
7
|
+
|
|
8
|
+
from metaflow import util
|
|
9
|
+
from metaflow.mflog import (
|
|
10
|
+
BASH_SAVE_LOGS,
|
|
11
|
+
bash_capture_logs,
|
|
12
|
+
export_mflog_env_vars,
|
|
13
|
+
tail_logs,
|
|
14
|
+
get_log_tailer,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
from .nvct import NVCTClient, NVCTTask, NVCTRequest
|
|
18
|
+
from .exceptions import (
|
|
19
|
+
NvctKilledException,
|
|
20
|
+
NvctExecutionException,
|
|
21
|
+
NvctTaskFailedException,
|
|
22
|
+
)
|
|
23
|
+
|
|
24
|
+
# Constants for Metaflow logs
# Log files live under the task's working directory inside the NVCT container;
# "$PWD" is expanded by the shell command built in NvctRunner.launch_task.
LOGS_DIR = "$PWD/.logs"
STDOUT_FILE = "mflog_stdout"
STDERR_FILE = "mflog_stderr"
STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
# Entry-point wrapper expected to be present in the NVCT base container image.
NVCT_WRAPPER = "/usr/local/bin/nvct-wrapper.sh"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class NvctRunner:
    """Launches a Metaflow step as an NVIDIA Cloud Functions (NVCT) task
    and tails its logs until completion."""

    def __init__(
        self,
        metadata,
        datastore,
        environment,
        gpu_type,
        instance_type,
        backend,
        ngc_api_key,
    ):
        self.metadata = metadata
        self.datastore = datastore
        self.environment = environment
        self.gpu_type = gpu_type
        self.instance_type = instance_type
        self.backend = backend
        self._ngc_api_key = ngc_api_key
        self.client = None
        self.task = None
        # Best-effort cancellation of the remote task at interpreter exit.
        # Guard on `self.task is not None` rather than hasattr: `self.task`
        # always exists after __init__ (set to None above), so the previous
        # hasattr check could never prevent calling `.cancel()` on None when
        # no task was ever launched, raising AttributeError at exit.
        atexit.register(lambda: self.task.cancel() if self.task is not None else None)

    def launch_task(
        self,
        step_name,
        step_cli,
        task_spec,
        code_package_sha,
        code_package_url,
        code_package_ds,
        env=None,
        max_runtime="PT7H",  # <8H allowed for GFN backend
        max_queued="PT120H",  # 5 days
    ):
        """Build the container command for one Metaflow task, submit it as
        an NVCT task and return the task id.

        `env` maps extra environment variable names to values (None values
        are skipped). Defaults to an empty mapping; a None sentinel is used
        instead of a mutable `{}` default.
        """
        env = {} if env is None else env
        mflog_expr = export_mflog_env_vars(
            datastore_type=code_package_ds,
            stdout_path=STDOUT_PATH,
            stderr_path=STDERR_PATH,
            **task_spec,
        )
        init_cmds = self.environment.get_package_commands(
            code_package_url, code_package_ds
        )
        init_expr = " && ".join(init_cmds)
        step_expr = bash_capture_logs(
            " && ".join(
                self.environment.bootstrap_commands(step_name, code_package_ds)
                + [step_cli]
            )
        )
        # Make the log dir, export mflog env vars, bootstrap the code
        # package, run the step; always flush logs and propagate the
        # step's exit code.
        cmd_str = "mkdir -p %s && %s && %s && %s; c=$?; %s; exit $c" % (
            LOGS_DIR,
            mflog_expr,
            init_expr,
            step_expr,
            BASH_SAVE_LOGS,
        )

        # Add optional initialization script execution
        cmd_str = (
            '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"} && %s'
            % cmd_str
        )

        cmd_str = shlex.split('bash -c "%s"' % cmd_str)[-1]

        def modify_python_c(match):
            content = match.group(1)
            # Escape double quotes within the python -c command
            content = content.replace('"', r"\"")
            # Replace outermost double quotes with single quotes
            return 'python -c "%s"' % content

        # Convert python -c single quotes to double quotes
        cmd_str = re.sub(r"python -c '(.*?)'", modify_python_c, cmd_str)
        cmd_str = cmd_str.replace("'", '"')
        # Create the final command with outer single quotes to pass to NVCT wrapper
        nvct_cmd = f"{NVCT_WRAPPER} bash -c '{cmd_str}'"

        flow_name = task_spec.get("flow_name")
        run_id = task_spec.get("run_id")
        task_id = task_spec.get("task_id")
        retry_count = task_spec.get("retry_count")
        task_name = f"{flow_name}-{run_id}-{step_name}-{task_id}-{retry_count}"

        if self.backend != "GFN":
            # if maxRuntimeDuration exceeds 8 hours for a Task on the GFN backend,
            # the request will be rejected.
            # (https://docs.nvidia.com/cloud-functions/user-guide/latest/cloud-function/tasks.html#create-task)
            ## thus, if it is non GFN backend, we increase it to 3 days
            max_runtime = "PT72H"

        request = (
            NVCTRequest(task_name)
            .container_image("nvcr.io/zhxkmsaasxhw/nvct-base:1.0-jovyan")
            .container_args(nvct_cmd)
            .gpu(
                gpu=self.gpu_type,
                instance_type=self.instance_type,
                backend=self.backend,
            )
            .max_runtime(max_runtime)
            .max_queued(max_queued)
        )

        for k, v in env.items():
            if v is not None:
                request.env(k, str(v))

        self.client = NVCTClient(self._ngc_api_key)
        self.task = NVCTTask(self.client, request.to_dict())

        self.task.submit()
        return self.task.id

    def wait_for_completion(self, stdout_location, stderr_location, echo=None):
        """Block until the launched task finishes, tailing its logs.

        Raises NvctExecutionException if no task was launched,
        NvctTaskFailedException when the task fails, and NvctKilledException
        if the task is somehow still running after log tailing ends.
        """
        if not self.task:
            raise NvctExecutionException("No task has been launched")

        def update_delay(secs_since_start):
            # this sigmoid function reaches
            # - 0.1 after 11 minutes
            # - 0.5 after 15 minutes
            # - 1.0 after 23 minutes
            # in other words, the user will see very frequent updates
            # during the first 10 minutes
            sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
            return 0.5 + sigmoid * 30.0

        def wait_for_launch(task):
            status = task.status
            echo(
                "Task is starting (%s)..." % status,
                "stderr",
                _id=task.id,
            )

            t = time.time()
            start_time = time.time()
            while task.is_waiting:
                new_status = task.status
                # Re-echo when the status changes or at least every 30 seconds.
                if status != new_status or (time.time() - t) > 30:
                    status = new_status
                    echo(
                        "Task is starting (%s)..." % status,
                        "stderr",
                        _id=task.id,
                    )
                    t = time.time()
                time.sleep(update_delay(time.time() - start_time))

        _make_prefix = lambda: b"[%s] " % util.to_bytes(self.task.id)
        stdout_tail = get_log_tailer(stdout_location, self.datastore.TYPE)
        stderr_tail = get_log_tailer(stderr_location, self.datastore.TYPE)

        # 1) Loop until the job has started
        wait_for_launch(self.task)

        echo(
            "Task is starting (%s)..." % self.task.status,
            "stderr",
            _id=self.task.id,
        )

        # 2) Tail logs until the job has finished
        tail_logs(
            prefix=_make_prefix(),
            stdout_tail=stdout_tail,
            stderr_tail=stderr_tail,
            echo=echo,
            has_log_updates=lambda: self.task.is_running,
        )

        if self.task.has_failed:
            raise NvctTaskFailedException(
                f"Task failed with status: {self.task.status}. This could be a transient error. Use @retry to retry."
            )
        else:
            if self.task.is_running:
                # Kill the job if it is still running by throwing an exception.
                raise NvctKilledException("Task failed!")
            echo(
                f"Task finished with status: {self.task.status}",
                "stderr",
                _id=self.task.id,
            )
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import requests
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
from metaflow.metaflow_config import SERVICE_URL
|
|
6
|
+
from metaflow.metaflow_config_funcs import init_config
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def get_ngc_api_key():
    """Retrieve the NVCF API key for the current user from the
    Outerbounds auth server."""
    conf = init_config()
    try:
        auth_host = conf["OBP_AUTH_SERVER"]
    except KeyError:
        # Derive the auth host from the Metaflow service URL
        # (swap the first hostname label for "auth").
        auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]

    # NOTE: reusing the same auth_host as the one used in NimMetadata,
    # however, user should not need to use nim container to use @nvct.
    # May want to refactor this to a common endpoint.
    nim_info_url = f"https://{auth_host}/generate/nim"

    if "METAFLOW_SERVICE_AUTH_KEY" in conf:
        headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
    else:
        headers = json.loads(os.environ.get("METAFLOW_SERVICE_HEADERS"))

    # Both auth flavors issue the same request; only the headers differ.
    res = requests.get(nim_info_url, headers=headers)
    res.raise_for_status()
    return res.json()["nvcf"]["api_key"]
|
{ob_metaflow_extensions-1.1.150.dist-info → ob_metaflow_extensions-1.1.151.dist-info}/RECORD
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
metaflow_extensions/outerbounds/__init__.py,sha256=TRGvIUMjkfneWtYUFSWoubu_Kf2ekAL4WLbV3IxOj9k,499
|
|
2
2
|
metaflow_extensions/outerbounds/remote_config.py,sha256=Zpfpjgz68_ZgxlXezjzlsDLo4840rkWuZgwDB_5H57U,4059
|
|
3
3
|
metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
|
|
4
|
-
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=
|
|
4
|
+
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=eHcM_t2Mzlge7B9Dv3VGVM5x8qNZYdLyqBOAC6uRxec,13228
|
|
5
5
|
metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=_Q9_2EL0Xy77bCRphkwT1aSu8gQXRDOH-Z-RxTUO8N4,2202
|
|
6
6
|
metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
|
|
7
7
|
metaflow_extensions/outerbounds/plugins/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -35,6 +35,13 @@ metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=3ZFdYItVpFWnHMOeyV1n
|
|
|
35
35
|
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=3D-r5XO88Yh2k1EAZFJTe_PwdbhWp5qXflG8AgE4ZIU,9500
|
|
36
36
|
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=pSWKaPyGXBEfxG35QA1FQljio8ADjwf-DnPgEsqXoUM,9251
|
|
37
37
|
metaflow_extensions/outerbounds/plugins/nvcf/utils.py,sha256=DxWSCayfa95e0HJkWacey1s1nxoTpaunGhrb_0Ayv28,133
|
|
38
|
+
metaflow_extensions/outerbounds/plugins/nvct/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
39
|
+
metaflow_extensions/outerbounds/plugins/nvct/exceptions.py,sha256=1PiV6FdH36CvkmHh5jtsfrsoe3Q_Fo1NomHw5wvgoDM,2886
|
|
40
|
+
metaflow_extensions/outerbounds/plugins/nvct/nvct.py,sha256=Z2ZPWGuHe58au_d6GfHiw6Nl5d8INdLDI5exlsPEOSA,3564
|
|
41
|
+
metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py,sha256=bB9AURhRep9PV_-b-qLHpgw_GPG_xFoq1PeHEgFP1mQ,10104
|
|
42
|
+
metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py,sha256=LaJ_Tk-vNjvrglzSTR-U6pk8f9MtQRKObU9m7vBYtkI,8695
|
|
43
|
+
metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py,sha256=D2sEtVFVWXBqaQEsminZYI_WesR2kADLmwv4lsxVBhk,7091
|
|
44
|
+
metaflow_extensions/outerbounds/plugins/nvct/utils.py,sha256=U4_Fu8H94j_Bbox7mmMhNnlRhlYHqnK28R5w_TMWEFM,1029
|
|
38
45
|
metaflow_extensions/outerbounds/plugins/ollama/__init__.py,sha256=HEsI5U4ckQby7K2NsGBOdizhPY3WWqXSnXx_IHL7_No,2307
|
|
39
46
|
metaflow_extensions/outerbounds/plugins/ollama/ollama.py,sha256=KlP8_EmnUoi8-PidyU0IDuENYxKjQaHFC33yGsvaeic,13320
|
|
40
47
|
metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py,sha256=oI_C3c64XBm7n88FILqHwn-Nnc5DeT_68I67lM9rXaI,2434
|
|
@@ -61,7 +68,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3u
|
|
|
61
68
|
metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
62
69
|
metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2zwqkvlmFS6bcfYD_CX6CMko9DHQokMaH1iBshA,47
|
|
63
70
|
metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
|
|
64
|
-
ob_metaflow_extensions-1.1.
|
|
65
|
-
ob_metaflow_extensions-1.1.
|
|
66
|
-
ob_metaflow_extensions-1.1.
|
|
67
|
-
ob_metaflow_extensions-1.1.
|
|
71
|
+
ob_metaflow_extensions-1.1.151.dist-info/METADATA,sha256=x5PLR9aAaWhuhLDERP5mSl44te5I0ZNmNxmjBsRgrzg,521
|
|
72
|
+
ob_metaflow_extensions-1.1.151.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
|
|
73
|
+
ob_metaflow_extensions-1.1.151.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
|
|
74
|
+
ob_metaflow_extensions-1.1.151.dist-info/RECORD,,
|
|
File without changes
|
{ob_metaflow_extensions-1.1.150.dist-info → ob_metaflow_extensions-1.1.151.dist-info}/top_level.txt
RENAMED
|
File without changes
|