ob-metaflow-extensions 1.1.69__py2.py3-none-any.whl → 1.1.70__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/plugins/__init__.py +2 -1
- metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +247 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +202 -0
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +156 -0
- {ob_metaflow_extensions-1.1.69.dist-info → ob_metaflow_extensions-1.1.70.dist-info}/METADATA +1 -1
- {ob_metaflow_extensions-1.1.69.dist-info → ob_metaflow_extensions-1.1.70.dist-info}/RECORD +9 -5
- {ob_metaflow_extensions-1.1.69.dist-info → ob_metaflow_extensions-1.1.70.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.69.dist-info → ob_metaflow_extensions-1.1.70.dist-info}/top_level.txt +0 -0
|
@@ -240,5 +240,6 @@ class ObpGcpAuthProvider(object):
|
|
|
240
240
|
|
|
241
241
|
|
|
242
242
|
GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
|
|
243
|
-
|
|
243
|
+
CLIS_DESC = [("nvcf", ".nvcf.nvcf_cli.cli")]
|
|
244
|
+
STEP_DECORATORS_DESC = [("nvidia", ".nvcf.nvcf_decorator.NvcfDecorator")]
|
|
244
245
|
FLOW_DECORATORS_DESC = [("nim", ".nim.NimDecorator")]
|
|
File without changes
|
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import time
|
|
4
|
+
from urllib.request import HTTPError, Request, URLError, urlopen
|
|
5
|
+
|
|
6
|
+
from metaflow import util
|
|
7
|
+
from metaflow.exception import MetaflowException
|
|
8
|
+
from metaflow.mflog import (
|
|
9
|
+
BASH_SAVE_LOGS,
|
|
10
|
+
bash_capture_logs,
|
|
11
|
+
export_mflog_env_vars,
|
|
12
|
+
tail_logs,
|
|
13
|
+
get_log_tailer,
|
|
14
|
+
)
|
|
15
|
+
import requests
|
|
16
|
+
from metaflow.metaflow_config_funcs import init_config
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class NvcfException(MetaflowException):
    """Raised for generic failures while executing a task on NVCF."""

    headline = "Nvidia error"


class NvcfKilledException(MetaflowException):
    """Raised when an NVCF-backed task is killed."""

    headline = "Nvidia job killed"


# Structured (mflog) logs are redirected underneath $PWD/.logs/ inside the
# remote container; these paths are interpolated into the bash entrypoint,
# hence the literal "$PWD" rather than a resolved local path.
LOGS_DIR = "$PWD/.logs"
STDOUT_FILE = "mflog_stdout"
STDERR_FILE = "mflog_stderr"
STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class Nvcf(object):
    """Client-side launcher for Metaflow tasks executed on NVCF.

    Builds a bash entrypoint that sets up mflog, bootstraps the Metaflow
    environment, runs the task command, and persists logs; the command is
    then submitted as an NVCF ``Job`` and its logs are tailed until the
    job completes.
    """

    def __init__(self, metadata, datastore, environment):
        self.metadata = metadata
        self.datastore = datastore
        self.environment = environment

    def launch_job(
        self,
        step_name,
        step_cli,
        task_spec,
        code_package_sha,
        code_package_url,
        code_package_ds,
        env=None,
    ):
        """Construct the task entrypoint and submit it as an NVCF job.

        Parameters mirror the task launch plumbing used by other Metaflow
        compute plugins; `env` is a mapping of environment variables to
        inject into the remote container.
        """
        # BUG FIX: the default for `env` used to be a mutable `{}` shared
        # across all calls; use a None sentinel and build a fresh dict.
        if env is None:
            env = {}
        mflog_expr = export_mflog_env_vars(
            datastore_type=code_package_ds,
            stdout_path=STDOUT_PATH,
            stderr_path=STDERR_PATH,
            **task_spec,
        )
        init_cmds = self.environment.get_package_commands(
            code_package_url, code_package_ds
        )
        init_expr = " && ".join(init_cmds)
        step_expr = bash_capture_logs(
            " && ".join(
                self.environment.bootstrap_commands(step_name, code_package_ds)
                + [step_cli]
            )
        )

        # construct an entry point that
        # 1) initializes the mflog environment (mflog_expr)
        # 2) bootstraps a metaflow environment (init_expr)
        # 3) executes a task (step_expr)
        cmd_str = "mkdir -p %s && %s && %s && %s; " % (
            LOGS_DIR,
            mflog_expr,
            init_expr,
            step_expr,
        )
        # after the task has finished, we save its exit code (fail/success)
        # and persist the final logs. The whole entrypoint should exit
        # with the exit code (c) of the task.
        #
        # Note that if step_expr OOMs, this tail expression is never executed.
        # We lose the last logs in this scenario.
        cmd_str += "c=$?; %s; exit $c" % BASH_SAVE_LOGS
        cmd_str = (
            '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"} && %s'
            % cmd_str
        )
        self.job = Job('bash -c "%s"' % cmd_str, env)
        self.job.submit()

    def wait(self, stdout_location, stderr_location, echo=None):
        """Block until the submitted job finishes, tailing its logs.

        Raises NvcfException when the job reports failure so that callers
        (and @retry) can react.
        """

        def wait_for_launch(job):
            status = job.status
            echo(
                "Task status: %s..." % status,
                "stderr",
                _id=job.id,
            )

        prefix = b"[%s] " % util.to_bytes(self.job.id)
        stdout_tail = get_log_tailer(stdout_location, self.datastore.TYPE)
        stderr_tail = get_log_tailer(stderr_location, self.datastore.TYPE)

        # 1) Loop until the job has started
        wait_for_launch(self.job)

        # 2) Tail logs until the job has finished
        tail_logs(
            prefix=prefix,
            stdout_tail=stdout_tail,
            stderr_tail=stderr_tail,
            echo=echo,
            has_log_updates=lambda: self.job.is_running,
        )

        echo(
            "Task finished with exit code %s." % self.job.result.get("exit_code"),
            "stderr",
            _id=self.job.id,
        )
        if self.job.has_failed:
            raise NvcfException("This could be a transient error. Use @retry to retry.")
|
|
125
|
+
|
|
126
|
+
|
|
127
|
+
class JobStatus(object):
    """String constants describing the lifecycle of an NVCF job."""

    SUBMITTED = "SUBMITTED"      # accepted by NVCF; result not yet available
    RUNNING = "RUNNING"
    SUCCESSFUL = "SUCCESSFUL"    # terminal: finished without an error status
    FAILED = "FAILED"            # terminal: reported an error or bad HTTP code
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
# Base URL for NVIDIA Cloud Functions; jobs are submitted through the
# synchronous "pexec" endpoint and polled via the matching status endpoint.
nvcf_url = "https://api.nvcf.nvidia.com"
submit_endpoint = nvcf_url + "/v2/nvcf/pexec/functions"
result_endpoint = nvcf_url + "/v2/nvcf/pexec/status"
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
class Job(object):
    """A single Metaflow task invocation executed through NVCF pexec.

    On construction the job resolves the NGC API key and the id of the
    'metaflow_task_executor' function from the Outerbounds auth service.
    `submit()` starts the task; `status`, `result`, `is_running` and
    `has_failed` expose its outcome.
    """

    def __init__(self, command, env):
        self._payload = {
            "command": command,
            # Drop unset environment variables instead of sending nulls.
            "env": {k: v for k, v in env.items() if v is not None},
        }
        self._result = {}
        # BUG FIX (robustness): populated by submit(); initialized here so
        # that the `status`/`id` properties never raise AttributeError when
        # consulted before submission.
        self._status = None
        self._invocation_id = None

        conf = init_config()
        if "OBP_AUTH_SERVER" in conf:
            auth_host = conf["OBP_AUTH_SERVER"]
        else:
            # NOTE(review): `urlparse` and `SERVICE_URL` are not imported in
            # this module, so this fallback would raise NameError — confirm
            # and add the missing imports at the top of the file.
            auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]

        # NOTE: reusing the same auth_host as the one used in NimMetadata,
        # however, user should not need to use nim container to use @nvidia.
        # May want to refactor this to a common endpoint.
        nim_info_url = "https://" + auth_host + "/generate/nim"

        if "METAFLOW_SERVICE_AUTH_KEY" in conf:
            headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
        else:
            headers = json.loads(os.environ.get("METAFLOW_SERVICE_HEADERS"))
        res = requests.get(nim_info_url, headers=headers)

        res.raise_for_status()
        self._ngc_api_key = res.json()["nvcf"]["api_key"]

        for f in res.json()["nvcf"]["functions"]:
            if f["model_key"] == "metaflow_task_executor":
                self._function_id = f["id"]

    def submit(self):
        """Submit the payload to NVCF, recording status and invocation id."""
        try:
            headers = {
                "Authorization": f"Bearer {self._ngc_api_key}",
                "Content-Type": "application/json",
            }
            request_data = json.dumps(self._payload).encode()
            request = Request(
                f"{submit_endpoint}/{self._function_id}",
                data=request_data,
                headers=headers,
            )
            response = urlopen(request)
            self._invocation_id = response.headers.get("NVCF-REQID")
            if response.getcode() == 200:
                # 200: the function finished synchronously.
                data = json.loads(response.read())
                if data["status"].startswith("Oops"):
                    self._status = JobStatus.FAILED
                else:
                    self._status = JobStatus.SUCCESSFUL
                    self._result = data
            elif response.getcode() == 202:
                # 202: queued; the result must be polled via _poll().
                self._status = JobStatus.SUBMITTED
            else:
                self._status = JobStatus.FAILED
            # TODO: Handle 404s nicely
        except (HTTPError, URLError):
            # BUG FIX: this previously assigned `self._state`, a typo that
            # left `_status` unset/stale for callers of the `status` property.
            self._status = JobStatus.FAILED
            raise

    @property
    def status(self):
        """Current job status; re-polls NVCF while not in a terminal state."""
        if self._status not in [JobStatus.SUCCESSFUL, JobStatus.FAILED]:
            self._poll()
        return self._status

    @property
    def id(self):
        """NVCF invocation id (NVCF-REQID header), or None before submit."""
        return self._invocation_id

    @property
    def is_running(self):
        return self.status == JobStatus.SUBMITTED

    @property
    def has_failed(self):
        return self.status == JobStatus.FAILED

    @property
    def result(self):
        """Decoded JSON result of a successful invocation ({} otherwise)."""
        return self._result

    def _poll(self):
        """Fetch the status of an asynchronously-submitted invocation."""
        try:
            headers = {
                "Authorization": f"Bearer {self._ngc_api_key}",
                "Content-Type": "application/json",
            }
            request = Request(
                f"{result_endpoint}/{self._invocation_id}", headers=headers
            )
            response = urlopen(request)
            if response.getcode() == 200:
                data = json.loads(response.read())
                if data["status"].startswith("Oops"):
                    # TODO: Propagate the internal error forward
                    self._status = JobStatus.FAILED
                else:
                    self._status = JobStatus.SUCCESSFUL
                    self._result = data
            elif response.getcode() in [400, 500]:
                # NOTE(review): urlopen raises HTTPError for 4xx/5xx, so this
                # branch is effectively unreachable; kept as a safety net.
                self._status = JobStatus.FAILED
        except (HTTPError, URLError) as e:
            print(f"Error occurred while polling for result: {e}")
|
|
@@ -0,0 +1,202 @@
|
|
|
1
|
+
import json
|
|
2
|
+
import os
|
|
3
|
+
import sys
|
|
4
|
+
import time
|
|
5
|
+
import traceback
|
|
6
|
+
|
|
7
|
+
from metaflow import util
|
|
8
|
+
from metaflow._vendor import click
|
|
9
|
+
from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY
|
|
10
|
+
from metaflow.metadata.util import sync_local_metadata_from_datastore
|
|
11
|
+
from metaflow.metaflow_config import (
|
|
12
|
+
CARD_S3ROOT,
|
|
13
|
+
DATASTORE_LOCAL_DIR,
|
|
14
|
+
DATASTORE_SYSROOT_S3,
|
|
15
|
+
DATATOOLS_S3ROOT,
|
|
16
|
+
DEFAULT_METADATA,
|
|
17
|
+
SERVICE_HEADERS,
|
|
18
|
+
SERVICE_URL,
|
|
19
|
+
DEFAULT_SECRETS_BACKEND_TYPE,
|
|
20
|
+
DEFAULT_AWS_CLIENT_PROVIDER,
|
|
21
|
+
AWS_SECRETS_MANAGER_DEFAULT_REGION,
|
|
22
|
+
S3_ENDPOINT_URL,
|
|
23
|
+
AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
|
|
24
|
+
DATASTORE_SYSROOT_AZURE,
|
|
25
|
+
CARD_AZUREROOT,
|
|
26
|
+
DATASTORE_SYSROOT_GS,
|
|
27
|
+
CARD_GSROOT,
|
|
28
|
+
KUBERNETES_SANDBOX_INIT_SCRIPT,
|
|
29
|
+
OTEL_ENDPOINT,
|
|
30
|
+
)
|
|
31
|
+
from metaflow.mflog import TASK_LOG_SOURCE
|
|
32
|
+
|
|
33
|
+
from .nvcf import Nvcf, NvcfKilledException
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
@click.group()
def cli():
    # Top-level group; Metaflow merges it into the main CLI via CLIS_DESC.
    # (Kept as a bare `pass` body: a docstring would become click help text.)
    pass


@cli.group(help="Commands related to NVCF.")
def nvcf():
    # Namespace for NVCF subcommands such as `nvcf step`.
    pass
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
@nvcf.command(
    help="Execute a single task using NVCF. This command calls the "
    "top-level step command inside a NVCF job with the given options. "
    "Typically you do not call this command directly; it is used internally by "
    "Metaflow."
)
@click.argument("step-name")
@click.argument("code-package-sha")
@click.argument("code-package-url")
@click.option("--run-id", help="Passed to the top-level 'step'.")
@click.option("--task-id", help="Passed to the top-level 'step'.")
@click.option("--input-paths", help="Passed to the top-level 'step'.")
@click.option("--split-index", help="Passed to the top-level 'step'.")
@click.option("--clone-path", help="Passed to the top-level 'step'.")
@click.option("--clone-run-id", help="Passed to the top-level 'step'.")
@click.option(
    "--tag", multiple=True, default=None, help="Passed to the top-level 'step'."
)
@click.option("--namespace", default=None, help="Passed to the top-level 'step'.")
@click.option("--retry-count", default=0, help="Passed to the top-level 'step'.")
@click.option(
    "--max-user-code-retries", default=0, help="Passed to the top-level 'step'."
)
@click.pass_context
def step(ctx, step_name, code_package_sha, code_package_url, **kwargs):
    """Run one Metaflow step remotely on NVCF and stream its logs."""

    def echo(msg, stream="stderr", _id=None, **kwargs):
        msg = util.to_unicode(msg)
        if _id:
            msg = "[%s] %s" % (_id, msg)
        # BUG FIX: `stream` is a string ("stderr"/"stdout"); the original
        # compared it against the `sys.stderr` object, which is always
        # False, so every message was routed to stdout.
        ctx.obj.echo_always(msg, err=(stream == "stderr"), **kwargs)

    executable = ctx.obj.environment.executable(step_name)
    entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        # Long input-path lists are smuggled through the environment in
        # 30KB chunks to stay under environment-variable size limits; the
        # CLI argument becomes "${METAFLOW_INPUT_PATHS_0}${...}" references.
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args,
    )
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 1)
        )

    task_spec = {
        "flow_name": ctx.obj.flow.name,
        "step_name": step_name,
        "run_id": kwargs["run_id"],
        "task_id": kwargs["task_id"],
        "retry_count": str(retry_count),
    }

    # Environment handed to the remote container so the task can reach the
    # same datastore/metadata service as the launching process.
    env = {
        "METAFLOW_CODE_SHA": code_package_sha,
        "METAFLOW_CODE_URL": code_package_url,
        "METAFLOW_CODE_DS": ctx.obj.flow_datastore.TYPE,
        "METAFLOW_SERVICE_URL": SERVICE_URL,
        "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
        "METAFLOW_DATASTORE_SYSROOT_S3": DATASTORE_SYSROOT_S3,
        "METAFLOW_DATATOOLS_S3ROOT": DATATOOLS_S3ROOT,
        "METAFLOW_DEFAULT_DATASTORE": ctx.obj.flow_datastore.TYPE,
        "METAFLOW_USER": util.get_username(),
        "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
        "METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
        "METAFLOW_RUNTIME_ENVIRONMENT": "nvcf",
        "METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE": DEFAULT_SECRETS_BACKEND_TYPE,
        "METAFLOW_DEFAULT_AWS_CLIENT_PROVIDER": DEFAULT_AWS_CLIENT_PROVIDER,
        "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION": AWS_SECRETS_MANAGER_DEFAULT_REGION,
        "METAFLOW_S3_ENDPOINT_URL": S3_ENDPOINT_URL,
        "METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT": AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
        "METAFLOW_DATASTORE_SYSROOT_AZURE": DATASTORE_SYSROOT_AZURE,
        "METAFLOW_CARD_AZUREROOT": CARD_AZUREROOT,
        "METAFLOW_DATASTORE_SYSROOT_GS": DATASTORE_SYSROOT_GS,
        "METAFLOW_CARD_GSROOT": CARD_GSROOT,
        "METAFLOW_INIT_SCRIPT": KUBERNETES_SANDBOX_INIT_SCRIPT,
        "METAFLOW_OTEL_ENDPOINT": OTEL_ENDPOINT,
    }

    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env.update(env_deco[0].attributes["vars"])

    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    if retry_count:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next retry" % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    # this information is needed for log tailing
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=int(retry_count),
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        # With `local` metadata, task metadata produced remotely must be
        # pulled back through the datastore.
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    # Renamed from `nvcf` to avoid shadowing the module-level click group.
    nvcf_runner = Nvcf(ctx.obj.metadata, ctx.obj.flow_datastore, ctx.obj.environment)
    try:
        with ctx.obj.monitor.measure("metaflow.nvcf.launch_job"):
            nvcf_runner.launch_job(
                step_name,
                step_cli,
                task_spec,
                code_package_sha,
                code_package_url,
                ctx.obj.flow_datastore.TYPE,
                env=env,
            )
    except Exception:
        # Launch failures are not retried: sync what we have and bail out.
        traceback.print_exc()
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        nvcf_runner.wait(stdout_location, stderr_location, echo=echo)
    except NvcfKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        _sync_metadata()
|
|
@@ -0,0 +1,156 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import sys
|
|
3
|
+
|
|
4
|
+
from metaflow.decorators import StepDecorator
|
|
5
|
+
from metaflow.metadata.util import sync_local_metadata_to_datastore
|
|
6
|
+
from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
|
|
7
|
+
from metaflow.sidecar import Sidecar
|
|
8
|
+
from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
|
|
9
|
+
from .nvcf import NvcfException
|
|
10
|
+
|
|
11
|
+
from metaflow.metadata import MetaDatum
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class NvcfDecorator(StepDecorator):
    """Step decorator (`@nvidia`) that executes a step remotely on NVCF."""

    name = "nvidia"
    defaults = {}

    # Code package location; saved once per process and shared by all steps.
    package_url = None
    package_sha = None

    # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
    # to understand where these functions are invoked in the lifecycle of a
    # Metaflow flow.
    def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
        # Executing NVCF functions requires a non-local datastore.
        if flow_datastore.TYPE not in ("s3", "azure", "gs"):
            raise NvcfException(
                "The *@nvidia* decorator requires --datastore=s3 or --datastore=azure or --datastore=gs at the moment."
            )
        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        # TODO:
        # 1. Ensure that @batch and @kubernetes decorators are not applied to this step.
        # 2. Ensure @parallel is not applied to this step.

        # Set run time limit for the NVCF function.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise NvcfException(
                "The timeout for step *{step}* should be at least 60 seconds for "
                "execution with @nvidia.".format(step=step)
            )

    def runtime_init(self, flow, graph, package, run_id):
        # Set some more internal state.
        self.flow = flow
        self.graph = graph
        self.package = package
        self.run_id = run_id

    def runtime_task_created(
        self, task_datastore, task_id, split_index, input_paths, is_cloned, ubf_context
    ):
        # Cloned tasks reuse the original run's code package.
        if not is_cloned:
            self._save_package_once(self.flow_datastore, self.package)

    def runtime_step_cli(
        self, cli_args, retry_count, max_user_code_retries, ubf_context
    ):
        if retry_count <= max_user_code_retries:
            # after all attempts to run the user code have failed, we don't need
            # to execute on NVCF anymore. We can execute possible fallback
            # code locally.
            cli_args.commands = ["nvcf", "step"]
            cli_args.command_args.append(self.package_sha)
            cli_args.command_args.append(self.package_url)
            cli_args.command_options.update(self.attributes)
            cli_args.entrypoint[0] = sys.executable

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_retries,
        ubf_context,
        inputs,
    ):
        self.metadata = metadata
        self.task_datastore = task_datastore

        # task_pre_step may run locally if fallback is activated for @catch
        # decorator.

        if "NVCF_CONTEXT" in os.environ:
            # Capture the NVCF execution context for debugging purposes.
            meta = {}

            meta["nvcf-function-id"] = os.environ.get("NVCF_FUNCTION_ID")
            meta["nvcf-function-version-id"] = os.environ.get(
                "NVCF_FUNCTION_VERSION_ID"
            )
            meta["nvcf-region"] = os.environ.get("NVCF_REGION")
            meta["nvcf-ncaid"] = os.environ.get("NVCF_NCAID")
            meta["nvcf-sub"] = os.environ.get("NVCF_SUB")
            meta["nvcf-instancetype"] = os.environ.get("NVCF_INSTANCETYPE")
            meta["nvcf-reqid"] = os.environ.get("NVCF_REQID")
            meta["nvcf-env"] = os.environ.get("NVCF_ENV")
            meta["nvcf-backend"] = os.environ.get("NVCF_BACKEND")
            meta["nvcf-function-name"] = os.environ.get("NVCF_FUNCTION_NAME")
            meta["nvcf-nspectid"] = os.environ.get("NVCF_NSPECTID")

            entries = [
                MetaDatum(field=k, value=v, type=k, tags=[])
                for k, v in meta.items()
                if v is not None
            ]
            # Register book-keeping metadata for debugging.
            metadata.register_metadata(run_id, step_name, task_id, entries)

            self._save_logs_sidecar = Sidecar("save_logs_periodically")
            self._save_logs_sidecar.start()

    def task_finished(
        self, step_name, flow, graph, is_task_ok, retry_count, max_retries
    ):
        # task_finished may run locally if fallback is activated for @catch
        # decorator.
        if "NVCF_CONTEXT" in os.environ:
            # If `local` metadata is configured, we would need to copy task
            # execution metadata from the NVCF container to user's
            # local file system after the user code has finished execution.
            # This happens via datastore as a communication bridge.
            if hasattr(self, "metadata") and self.metadata.TYPE == "local":
                sync_local_metadata_to_datastore(
                    DATASTORE_LOCAL_DIR, self.task_datastore
                )

            try:
                self._save_logs_sidecar.terminate()
            except Exception:
                # Best effort kill; never let sidecar teardown fail the task.
                # (Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
                # still propagate.)
                pass

    @classmethod
    def _save_package_once(cls, flow_datastore, package):
        # Upload the code package at most once, caching its location on the
        # class so every step of the run shares the same package.
        if cls.package_url is None:
            cls.package_url, cls.package_sha = flow_datastore.save_data(
                [package.blob], len_hint=1
            )[0]
|
|
@@ -1,13 +1,17 @@
|
|
|
1
1
|
metaflow_extensions/outerbounds/__init__.py,sha256=TRGvIUMjkfneWtYUFSWoubu_Kf2ekAL4WLbV3IxOj9k,499
|
|
2
2
|
metaflow_extensions/outerbounds/remote_config.py,sha256=HPFH4e3ZK3p-wS5HlS75fhR8_2avdD1AHQIZl2KnjeQ,4059
|
|
3
3
|
metaflow_extensions/outerbounds/config/__init__.py,sha256=mYo95obHU1IE1wbPkeVz_pfTzNqlNabp1QBEMTGllbE,112
|
|
4
|
-
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=
|
|
4
|
+
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=46NgbJBhVowDR6FyQrZPF2jHHqRTSyCBCYIQAyQ4Ryo,9516
|
|
5
5
|
metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=JhlMFcR7SPSfR1C9w6GlqJq-NYNhOfISmHl2PdkYUok,2212
|
|
6
6
|
metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=z8tSAkWtiITB-JtSQS7fkhlBwvxSxeTgEwFjahAzv-U,2238
|
|
7
7
|
metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
8
8
|
metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py,sha256=gj6Iaz26bGbZm3aQuNS18Mqh_80iJp5PgFwFSlJRcn8,1968
|
|
9
9
|
metaflow_extensions/outerbounds/plugins/nim/__init__.py,sha256=GVnvSTjqYVj5oG2yh8KJFt7iZ33cEadDD5HbdmC9hJ0,1457
|
|
10
10
|
metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=l8WDfVtsMt7aZaOaeIPT5ySidxfxXU8gmwLoKUP3f04,7044
|
|
11
|
+
metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
12
|
+
metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=ftxC5SCo64P5Ycpv5vudluTnQi3-VCZW0umdsPP326A,7926
|
|
13
|
+
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=ow3lonclEDoZEUQCDV_L8lEr6HopXqjNXzubRrfdIm4,7219
|
|
14
|
+
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=0xNA4aRTPJ4SKpRIFKZzlL9a7lf367KGTrVWVXd-uGE,6052
|
|
11
15
|
metaflow_extensions/outerbounds/profilers/__init__.py,sha256=wa_jhnCBr82TBxoS0e8b6_6sLyZX0fdHicuGJZNTqKw,29
|
|
12
16
|
metaflow_extensions/outerbounds/profilers/gpu.py,sha256=a5YZAepujuP0uDqG9UpXBlZS3wjUt4Yv8CjybXqeT2c,24342
|
|
13
17
|
metaflow_extensions/outerbounds/toplevel/__init__.py,sha256=qWUJSv_r5hXJ7jV_On4nEasKIfUCm6_UjkjXWA_A1Ts,90
|
|
@@ -15,7 +19,7 @@ metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py,
|
|
|
15
19
|
metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2YQfI4fz7nIcipwwWq781eaoHEk7n4GAn1npDg,63
|
|
16
20
|
metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
|
|
17
21
|
metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
18
|
-
ob_metaflow_extensions-1.1.
|
|
19
|
-
ob_metaflow_extensions-1.1.
|
|
20
|
-
ob_metaflow_extensions-1.1.
|
|
21
|
-
ob_metaflow_extensions-1.1.
|
|
22
|
+
ob_metaflow_extensions-1.1.70.dist-info/METADATA,sha256=oh54d2W1t23zvb-nWBXXp5CiCTn2wRpUKVrtgWERn5c,519
|
|
23
|
+
ob_metaflow_extensions-1.1.70.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
|
|
24
|
+
ob_metaflow_extensions-1.1.70.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
|
|
25
|
+
ob_metaflow_extensions-1.1.70.dist-info/RECORD,,
|
|
File without changes
|
{ob_metaflow_extensions-1.1.69.dist-info → ob_metaflow_extensions-1.1.70.dist-info}/top_level.txt
RENAMED
|
File without changes
|