ob-metaflow-extensions 1.1.149__py2.py3-none-any.whl → 1.1.151__py2.py3-none-any.whl

This diff compares two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the changes between the versions exactly as they appear in their respective public registries.
@@ -312,11 +312,13 @@ class ObpGcpAuthProvider(object):
312
312
  GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
313
313
  CLIS_DESC = [
314
314
  ("nvidia", ".nvcf.nvcf_cli.cli"),
315
+ ("nvct", ".nvct.nvct_cli.cli"),
315
316
  ("fast-bakery", ".fast_bakery.fast_bakery_cli.cli"),
316
317
  ("snowpark", ".snowpark.snowpark_cli.cli"),
317
318
  ]
318
319
  STEP_DECORATORS_DESC = [
319
320
  ("nvidia", ".nvcf.nvcf_decorator.NvcfDecorator"),
321
+ ("nvct", ".nvct.nvct_decorator.NvctDecorator"),
320
322
  (
321
323
  "fast_bakery_internal",
322
324
  ".fast_bakery.fast_bakery_decorator.InternalFastBakeryDecorator",
@@ -0,0 +1,71 @@
1
+ from metaflow.exception import MetaflowException
2
+
3
+
4
+ class NvctExecutionException(MetaflowException):
5
+ headline = "Nvct task couldn't be executed"
6
+
7
+
8
+ class NvctTaskFailedException(MetaflowException):
9
+ headline = "Nvct task failed"
10
+
11
+
12
+ class NvctKilledException(MetaflowException):
13
+ headline = "Nvct job killed"
14
+
15
+
16
+ class RequestedGPUTypeUnavailableException(MetaflowException):
17
+ headline = "[@nvct RequestedGPUTypeUnavailableException] GPU type unavailable."
18
+
19
+ def __init__(self, requested_gpu_type, available_gpus):
20
+ msg = (
21
+ f"The requested GPU type @nvct(..., gpu_type='{requested_gpu_type}') is not available. "
22
+ f"Please choose from the following supported GPU types when using @nvct: {available_gpus}"
23
+ )
24
+ super(RequestedGPUTypeUnavailableException, self).__init__(msg)
25
+
26
+
27
+ class UnsupportedNvctConfigurationException(MetaflowException):
28
+ headline = (
29
+ "[@nvct UnsupportedNvctConfigurationException] Unsupported GPU configuration"
30
+ )
31
+
32
+ def __init__(self, n_gpu, gpu_type, available_configurations, step):
33
+ msg = f"The requested configuration of @nvct(gpu={n_gpu}, gpu_type='{gpu_type}') for @step {step} is not available."
34
+ if len(available_configurations) == 0:
35
+ msg += (
36
+ "\n\nNo configurations are available in your Outerbounds deployment."
37
+ " Please contact Outerbounds support if you wish to use @nvct."
38
+ )
39
+ else:
40
+ msg += f"\n\nAvailable configurations for your deployment with {gpu_type} include: \n\t- {self._display(gpu_type, available_configurations)}"
41
+ msg += "\n\nPlease contact Outerbounds support if you wish to use a configuration not listed above."
42
+ super(UnsupportedNvctConfigurationException, self).__init__(msg)
43
+
44
+ def _display(self, gpu_type, configs):
45
+ _available_decos = []
46
+ for cfg in configs:
47
+ n_gpu = cfg["n_gpus"]
48
+ _available_decos.append(f"@nvct(gpu={n_gpu}, gpu_type='{gpu_type}')")
49
+ return "\n\t- ".join(_available_decos)
50
+
51
+
52
+ class UnsupportedNvctDatastoreException(MetaflowException):
53
+ headline = "[@nvct UnsupportedNvctDatastoreException] Unsupported datastore"
54
+
55
+ def __init__(self, ds_type):
56
+ msg = (
57
+ "The *@nvct* decorator requires --datastore=s3 or --datastore=azure or --datastore=gs at the moment."
58
+ f"Current datastore type: {ds_type}."
59
+ )
60
+ super(UnsupportedNvctDatastoreException, self).__init__(msg)
61
+
62
+
63
+ class NvctTimeoutTooShortException(MetaflowException):
64
+ headline = "[@nvct NvctTimeoutTooShortException] Timeout too short"
65
+
66
+ def __init__(self, step):
67
+ msg = (
68
+ "The timeout for step *{step}* should be at least 60 seconds for "
69
+ "execution with @nvct".format(step=step)
70
+ )
71
+ super(NvctTimeoutTooShortException, self).__init__(msg)
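To make the error surface concrete, here is a hedged sketch of how UnsupportedNvctConfigurationException renders when raised directly; the configuration dictionaries below are illustrative stand-ins for the SUPPORTABLE_GPU_TYPES entries defined in nvct_decorator.py:

    # Hypothetical configurations; real values come from SUPPORTABLE_GPU_TYPES.
    configs = [
        {"n_gpus": 1, "instance_type": "GCP.GPU.H100_1x", "backend": "gcp-asia-se-1a"},
        {"n_gpus": 8, "instance_type": "GCP.GPU.H100_8x", "backend": "gcp-asia-se-1a"},
    ]
    try:
        raise UnsupportedNvctConfigurationException(3, "H100", configs, "train")
    except UnsupportedNvctConfigurationException as exc:
        # The message lists @nvct(gpu=1, gpu_type='H100') and @nvct(gpu=8, gpu_type='H100')
        # as the valid alternatives for the 'train' step.
        print(exc)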
@@ -0,0 +1,131 @@
1
+ import requests
2
+ from requests.adapters import HTTPAdapter, Retry
3
+
4
+ BASE_URL = "https://api.ngc.nvidia.com/v2/orgs/zhxkmsaasxhw/"
5
+ POLL_SEC = 1
6
+
7
+
8
+ def _session(api_key):
9
+ s = requests.Session()
10
+ s.headers.update(
11
+ {
12
+ "Authorization": f"Bearer {api_key}",
13
+ "Content-Type": "application/json",
14
+ }
15
+ )
16
+ retry = Retry(total=5, backoff_factor=1.5, status_forcelist=[502, 503, 504])
17
+ s.mount("https://", HTTPAdapter(max_retries=retry))
18
+ return s
19
+
20
+
21
+ def _url(path):
22
+ return BASE_URL.rstrip("/") + path
23
+
24
+
25
+ class NVCTClient:
26
+ def __init__(self, api_key):
27
+ self.sess = _session(api_key)
28
+
29
+ # TODO: Handle https://outerboundsco.slack.com/archives/C05QGNR4E06/p1745970955540289
30
+ def create(self, spec):
31
+ r = self.sess.post(_url("/nvct/tasks"), json=spec, timeout=30)
32
+ r.raise_for_status()
33
+ return r.json().get("task", {}).get("id")
34
+
35
+ def get(self, task_id):
36
+ r = self.sess.get(_url(f"/nvct/tasks/{task_id}"), timeout=30)
37
+ r.raise_for_status()
38
+ return r.json().get("task", {})
39
+
40
+ def cancel(self, task_id):
41
+ r = self.sess.post(_url(f"/nvct/tasks/{task_id}/cancel"), timeout=30)
42
+ r.raise_for_status()
43
+
44
+
45
+ class NVCTRequest(object):
46
+ def __init__(self, name):
47
+ self._spec = {}
48
+ self._spec["name"] = name
49
+ self._spec["gpuSpecification"] = {}
50
+ self._spec["resultHandlingStrategy"] = "NONE"
51
+ self._spec["terminationGracePeriodDuration"] = "PT10M"
52
+
53
+ def container_image(self, image):
54
+ self._spec["containerImage"] = image
55
+ return self
56
+
57
+ def container_args(self, args):
58
+ self._spec["containerArgs"] = args
59
+ return self
60
+
61
+ def env(self, key, value):
62
+ env_list = self._spec.setdefault("containerEnvironment", [])
63
+ env_list.append({"key": key, "value": value})
64
+ return self
65
+
66
+ def gpu(self, gpu, instance_type, backend):
67
+ gpu_spec = self._spec["gpuSpecification"]
68
+ gpu_spec["gpu"] = gpu
69
+ gpu_spec["instanceType"] = instance_type
70
+ gpu_spec["backend"] = backend
71
+ return self
72
+
73
+ def max_runtime(self, iso_duration):
74
+ self._spec["maxRuntimeDuration"] = iso_duration
75
+ return self
76
+
77
+ def max_queued(self, iso_duration="PT72H"):
78
+ self._spec["maxQueuedDuration"] = iso_duration
79
+ return self
80
+
81
+ def termination_grace(self, iso_duration="PT10M"):
82
+ self._spec["terminationGracePeriodDuration"] = iso_duration
83
+ return self
84
+
85
+ def extra(self, key, value):
86
+ self._spec[key] = value
87
+ return self
88
+
89
+ def to_dict(self):
90
+ return self._spec
91
+
92
+
93
+ class NVCTTask:
94
+ def __init__(self, client: NVCTClient, spec):
95
+ self.client = client
96
+ self.spec = spec
97
+ self.id = None
98
+ self.record = None
99
+
100
+ def submit(self):
101
+ self.id = self.client.create(self.spec)
102
+ return self.id
103
+
104
+ def cancel(self):
105
+ if not self.has_finished:
106
+ self.client.cancel(self.id)
107
+
108
+ @property
109
+ def status(self):
110
+ self.record = self.client.get(self.id)
111
+ return self.record["status"]
112
+
113
+ @property
114
+ def is_waiting(self):
115
+ return self.status == "QUEUED"
116
+
117
+ @property
118
+ def is_running(self):
119
+ return self.status in {"RUNNING", "LAUNCHED"}
120
+
121
+ @property
122
+ def has_failed(self):
123
+ return self.status in {"ERRORED", "CANCELED"}
124
+
125
+ @property
126
+ def has_succeeded(self):
127
+ return self.status == "COMPLETED"
128
+
129
+ @property
130
+ def has_finished(self):
131
+ return self.has_succeeded or self.has_failed
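Putting the three classes together, a hedged sketch of how a caller might build a task spec with NVCTRequest, submit it through NVCTTask, and poll until completion; the API key, container image, and command are placeholders, and the polling cadence simply reuses POLL_SEC from this module:

    import time

    def run_and_wait(api_key):
        spec = (
            NVCTRequest("example-task")
            .container_image("nvcr.io/example-org/example-image:latest")  # placeholder
            .container_args("echo hello")
            .gpu(gpu="H100", instance_type="GCP.GPU.H100_1x", backend="gcp-asia-se-1a")
            .max_runtime("PT1H")   # ISO-8601 duration
            .max_queued("PT72H")
            .env("EXAMPLE_VAR", "1")
            .to_dict()
        )
        task = NVCTTask(NVCTClient(api_key), spec)
        task.submit()
        # Status transitions roughly: QUEUED -> LAUNCHED/RUNNING -> COMPLETED/ERRORED/CANCELED.
        while not task.has_finished:
            time.sleep(POLL_SEC)
        return task.has_succeeded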
@@ -0,0 +1,289 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import time
5
+ import traceback
6
+
7
+ from metaflow import util, Run
8
+ from metaflow._vendor import click
9
+ from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY
10
+ from metaflow.metadata_provider.util import sync_local_metadata_from_datastore
11
+ from metaflow.metaflow_config import (
12
+ CARD_S3ROOT,
13
+ DATASTORE_LOCAL_DIR,
14
+ DATASTORE_SYSROOT_S3,
15
+ DATATOOLS_S3ROOT,
16
+ DEFAULT_METADATA,
17
+ SERVICE_HEADERS,
18
+ SERVICE_URL,
19
+ DEFAULT_SECRETS_BACKEND_TYPE,
20
+ DEFAULT_AWS_CLIENT_PROVIDER,
21
+ AWS_SECRETS_MANAGER_DEFAULT_REGION,
22
+ S3_ENDPOINT_URL,
23
+ AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
24
+ DATASTORE_SYSROOT_AZURE,
25
+ CARD_AZUREROOT,
26
+ DATASTORE_SYSROOT_GS,
27
+ CARD_GSROOT,
28
+ KUBERNETES_SANDBOX_INIT_SCRIPT,
29
+ OTEL_ENDPOINT,
30
+ )
31
+ from metaflow.mflog import TASK_LOG_SOURCE
32
+ from .nvct_runner import NvctRunner
33
+ from .nvct import NVCTClient
34
+ from .utils import get_ngc_api_key
35
+ from .exceptions import NvctKilledException
36
+
37
+
38
+ @click.group()
39
+ def cli():
40
+ pass
41
+
42
+
43
+ @cli.group(help="Commands related to nvct.")
44
+ def nvct():
45
+ pass
46
+
47
+
48
+ @nvct.command(help="List steps / tasks running as an nvct job.")
49
+ @click.option(
50
+ "--run-id",
51
+ default=None,
52
+ required=True,
53
+ help="List unfinished and running tasks corresponding to the run id.",
54
+ )
55
+ @click.pass_context
56
+ def list(ctx, run_id):
57
+ flow_name = ctx.obj.flow.name
58
+ run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
59
+ running_invocations = []
60
+
61
+ for each_step in run_obj:
62
+ for each_task in each_step:
63
+ if not each_task.finished and "nvct-task-id" in each_task.metadata_dict:
64
+ task_pathspec = each_task.pathspec
65
+ attempt = each_task.metadata_dict.get("attempt")
66
+ flow_name, run_id, step_name, task_id = task_pathspec.split("/")
67
+ running_invocations.append(
68
+ f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
69
+ )
70
+
71
+ if running_invocations:
72
+ for each_invocation in running_invocations:
73
+ ctx.obj.echo(each_invocation)
74
+
75
+
76
+ @nvct.command(help="Cancel steps / tasks running as an nvct job.")
77
+ @click.option(
78
+ "--run-id",
79
+ default=None,
80
+ required=True,
81
+ help="Terminate unfinished tasks corresponding to the run id.",
82
+ )
83
+ @click.pass_context
84
+ def kill(ctx, run_id):
85
+ ngc_api_key = get_ngc_api_key()
86
+ nvct_client = NVCTClient(api_key=ngc_api_key)
87
+
88
+ flow_name = ctx.obj.flow.name
89
+ run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
90
+ tasks_cancelled = []
91
+
92
+ for each_step in run_obj:
93
+ for each_task in each_step:
94
+ if not each_task.finished and "nvct-task-id" in each_task.metadata_dict:
95
+ task_pathspec = each_task.pathspec
96
+ attempt = each_task.metadata_dict.get("attempt")
97
+ _, _, step_name, task_id = task_pathspec.split("/")
98
+
99
+ nvct_task_id = each_task.metadata_dict.get("nvct-task-id")
100
+ nvct_client.cancel(nvct_task_id)
101
+
102
+ tasks_cancelled.append(
103
+ f"[{nvct_task_id}] -- Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt} is cancelled."
104
+ )
105
+
106
+ if tasks_cancelled:
107
+ for each_cancelled_task in tasks_cancelled:
108
+ ctx.obj.echo(each_cancelled_task)
109
+
110
+
111
+ @nvct.command(
112
+ help="Execute a single task using @nvct. This command calls the "
113
+ "top-level step command inside an nvct job with the given options. "
114
+ "Typically you do not call this command directly; it is used internally by "
115
+ "Metaflow."
116
+ )
117
+ @click.argument("step-name")
118
+ @click.argument("code-package-sha")
119
+ @click.argument("code-package-url")
120
+ @click.option("--gpu-type", help="Type of Nvidia GPU to use.")
121
+ @click.option("--instance-type", help="Instance type to use.")
122
+ @click.option("--backend", help="Backend to use.")
123
+ @click.option("--ngc-api-key", help="NGC API key.")
124
+ @click.option("--run-id", help="Passed to the top-level 'step'.")
125
+ @click.option("--task-id", help="Passed to the top-level 'step'.")
126
+ @click.option("--input-paths", help="Passed to the top-level 'step'.")
127
+ @click.option("--split-index", help="Passed to the top-level 'step'.")
128
+ @click.option("--clone-path", help="Passed to the top-level 'step'.")
129
+ @click.option("--clone-run-id", help="Passed to the top-level 'step'.")
130
+ @click.option(
131
+ "--tag", multiple=True, default=None, help="Passed to the top-level 'step'."
132
+ )
133
+ @click.option("--namespace", default=None, help="Passed to the top-level 'step'.")
134
+ @click.option("--retry-count", default=0, help="Passed to the top-level 'step'.")
135
+ @click.option(
136
+ "--max-user-code-retries", default=0, help="Passed to the top-level 'step'."
137
+ )
138
+ @click.pass_context
139
+ def step(
140
+ ctx,
141
+ step_name,
142
+ code_package_sha,
143
+ code_package_url,
144
+ gpu_type,
145
+ instance_type,
146
+ backend,
147
+ ngc_api_key,
148
+ **kwargs,
149
+ ):
150
+ def echo(msg, stream="stderr", _id=None, **kwargs):
151
+ msg = util.to_unicode(msg)
152
+ if _id:
153
+ msg = "[%s] %s" % (_id, msg)
154
+ ctx.obj.echo_always(msg, err=(stream == sys.stderr), **kwargs)
155
+
156
+ executable = ctx.obj.environment.executable(step_name)
157
+ entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))
158
+
159
+ top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))
160
+
161
+ input_paths = kwargs.get("input_paths")
162
+ split_vars = None
163
+ if input_paths:
164
+ max_size = 30 * 1024
165
+ split_vars = {
166
+ "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
167
+ for i in range(0, len(input_paths), max_size)
168
+ }
169
+ kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())
170
+
171
+ step_args = " ".join(util.dict_to_cli_options(kwargs))
172
+ step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
173
+ entrypoint=entrypoint,
174
+ top_args=top_args,
175
+ step=step_name,
176
+ step_args=step_args,
177
+ )
178
+ node = ctx.obj.graph[step_name]
179
+
180
+ # Get retry information
181
+ retry_count = kwargs.get("retry_count", 0)
182
+ retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
183
+ minutes_between_retries = None
184
+ if retry_deco:
185
+ minutes_between_retries = int(
186
+ retry_deco[0].attributes.get("minutes_between_retries", 1)
187
+ )
188
+
189
+ task_spec = {
190
+ "flow_name": ctx.obj.flow.name,
191
+ "step_name": step_name,
192
+ "run_id": kwargs["run_id"],
193
+ "task_id": kwargs["task_id"],
194
+ "retry_count": str(retry_count),
195
+ }
196
+
197
+ env = {
198
+ "METAFLOW_CODE_SHA": code_package_sha,
199
+ "METAFLOW_CODE_URL": code_package_url,
200
+ "METAFLOW_CODE_DS": ctx.obj.flow_datastore.TYPE,
201
+ "METAFLOW_SERVICE_URL": SERVICE_URL,
202
+ "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
203
+ "METAFLOW_DATASTORE_SYSROOT_S3": DATASTORE_SYSROOT_S3,
204
+ "METAFLOW_DATATOOLS_S3ROOT": DATATOOLS_S3ROOT,
205
+ "METAFLOW_DEFAULT_DATASTORE": ctx.obj.flow_datastore.TYPE,
206
+ "METAFLOW_USER": util.get_username(),
207
+ "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
208
+ "METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
209
+ "METAFLOW_RUNTIME_ENVIRONMENT": "nvct",
210
+ "METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE": DEFAULT_SECRETS_BACKEND_TYPE,
211
+ "METAFLOW_DEFAULT_AWS_CLIENT_PROVIDER": DEFAULT_AWS_CLIENT_PROVIDER,
212
+ "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION": AWS_SECRETS_MANAGER_DEFAULT_REGION,
213
+ "METAFLOW_S3_ENDPOINT_URL": S3_ENDPOINT_URL,
214
+ "METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT": AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
215
+ "METAFLOW_DATASTORE_SYSROOT_AZURE": DATASTORE_SYSROOT_AZURE,
216
+ "METAFLOW_CARD_AZUREROOT": CARD_AZUREROOT,
217
+ "METAFLOW_DATASTORE_SYSROOT_GS": DATASTORE_SYSROOT_GS,
218
+ "METAFLOW_CARD_GSROOT": CARD_GSROOT,
219
+ "METAFLOW_INIT_SCRIPT": KUBERNETES_SANDBOX_INIT_SCRIPT,
220
+ "METAFLOW_OTEL_ENDPOINT": OTEL_ENDPOINT,
221
+ "NVCT_CONTEXT": "1",
222
+ }
223
+
224
+ env_deco = [deco for deco in node.decorators if deco.name == "environment"]
225
+ if env_deco:
226
+ env.update(env_deco[0].attributes["vars"])
227
+
228
+ # Add the environment variables related to the input-paths argument
229
+ if split_vars:
230
+ env.update(split_vars)
231
+
232
+ if retry_count:
233
+ ctx.obj.echo_always(
234
+ "Sleeping %d minutes before the next retry" % minutes_between_retries
235
+ )
236
+ time.sleep(minutes_between_retries * 60)
237
+
238
+ # this information is needed for log tailing
239
+ ds = ctx.obj.flow_datastore.get_task_datastore(
240
+ mode="w",
241
+ run_id=kwargs["run_id"],
242
+ step_name=step_name,
243
+ task_id=kwargs["task_id"],
244
+ attempt=int(retry_count),
245
+ )
246
+ stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
247
+ stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")
248
+
249
+ def _sync_metadata():
250
+ if ctx.obj.metadata.TYPE == "local":
251
+ sync_local_metadata_from_datastore(
252
+ DATASTORE_LOCAL_DIR,
253
+ ctx.obj.flow_datastore.get_task_datastore(
254
+ kwargs["run_id"], step_name, kwargs["task_id"]
255
+ ),
256
+ )
257
+
258
+ nvct = NvctRunner(
259
+ ctx.obj.metadata,
260
+ ctx.obj.flow_datastore,
261
+ ctx.obj.environment,
262
+ gpu_type,
263
+ instance_type,
264
+ backend,
265
+ ngc_api_key,
266
+ )
267
+ try:
268
+ with ctx.obj.monitor.measure("metaflow.nvct.launch_task"):
269
+ nvct.launch_task(
270
+ step_name,
271
+ step_cli,
272
+ task_spec,
273
+ code_package_sha,
274
+ code_package_url,
275
+ ctx.obj.flow_datastore.TYPE,
276
+ env=env,
277
+ )
278
+ except Exception:
279
+ traceback.print_exc()
280
+ _sync_metadata()
281
+ sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
282
+ try:
283
+ nvct.wait_for_completion(stdout_location, stderr_location, echo=echo)
284
+ except NvctKilledException:
285
+ # don't retry killed tasks
286
+ traceback.print_exc()
287
+ sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
288
+ finally:
289
+ _sync_metadata()
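One detail worth highlighting in the step command construction above: --input-paths can be arbitrarily long, so it is split into 30 KB chunks that travel as METAFLOW_INPUT_PATHS_<n> environment variables and are re-assembled by shell expansion inside the NVCT job. A standalone sketch of that chunking, with an illustrative sample value:

    def chunk_input_paths(input_paths, max_size=30 * 1024):
        # One environment variable per 30 KB slice of the original --input-paths value.
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        # The step command receives "${METAFLOW_INPUT_PATHS_0}${METAFLOW_INPUT_PATHS_1}...",
        # which the shell expands back into the full value inside the container.
        placeholder = "".join("${%s}" % name for name in split_vars)
        return split_vars, placeholder

    env_chunks, placeholder = chunk_input_paths("FlowName/123/start/456," * 2000)
    print(len(env_chunks), placeholder)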
@@ -0,0 +1,262 @@
1
+ import os
2
+ import sys
3
+
4
+ from metaflow.exception import MetaflowException
5
+ from metaflow.decorators import StepDecorator
6
+ from metaflow.plugins.parallel_decorator import ParallelDecorator
7
+ from metaflow.metadata_provider.util import sync_local_metadata_to_datastore
8
+ from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
9
+ from metaflow.sidecar import Sidecar
10
+ from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
11
+ from metaflow.metadata_provider import MetaDatum
12
+
13
+ from .utils import get_ngc_api_key
14
+ from .exceptions import (
15
+ UnsupportedNvctDatastoreException,
16
+ NvctTimeoutTooShortException,
17
+ RequestedGPUTypeUnavailableException,
18
+ UnsupportedNvctConfigurationException,
19
+ )
20
+
21
+
22
+ DEFAULT_GPU_TYPE = "H100"
23
+
24
+ SUPPORTABLE_GPU_TYPES = {
25
+ "L40": [
26
+ {
27
+ "n_gpus": 1,
28
+ "instance_type": "gl40_1.br20_2xlarge",
29
+ "backend": "GFN",
30
+ },
31
+ ],
32
+ "L40S": [
33
+ {
34
+ "n_gpus": 1,
35
+ "instance_type": "gl40s_1.br25_2xlarge",
36
+ "backend": "GFN",
37
+ },
38
+ ],
39
+ "L40G": [
40
+ {
41
+ "n_gpus": 1,
42
+ "instance_type": "gl40g_1.br25_2xlarge",
43
+ "backend": "GFN",
44
+ },
45
+ ],
46
+ "H100": [
47
+ {
48
+ "n_gpus": 1,
49
+ "instance_type": "GCP.GPU.H100_1x",
50
+ "backend": "gcp-asia-se-1a",
51
+ },
52
+ {
53
+ "n_gpus": 2,
54
+ "instance_type": "GCP.GPU.H100_2x",
55
+ "backend": "gcp-asia-se-1a",
56
+ },
57
+ {
58
+ "n_gpus": 4,
59
+ "instance_type": "GCP.GPU.H100_4x",
60
+ "backend": "gcp-asia-se-1a",
61
+ },
62
+ {
63
+ "n_gpus": 8,
64
+ "instance_type": "GCP.GPU.H100_8x",
65
+ "backend": "gcp-asia-se-1a",
66
+ },
67
+ ],
68
+ }
69
+
70
+
71
+ class NvctDecorator(StepDecorator):
72
+
73
+ """
74
+ Specifies that this step should execute on DGX cloud.
75
+
76
+ Parameters
77
+ ----------
78
+ gpu : int
79
+ Number of GPUs to use.
80
+ gpu_type : str
81
+ Type of Nvidia GPU to use.
82
+ """
83
+
84
+ name = "nvct"
85
+ defaults = {
86
+ "gpu": 1,
87
+ "gpu_type": None,
88
+ "ngc_api_key": None,
89
+ "instance_type": None,
90
+ "backend": None,
91
+ }
92
+
93
+ package_url = None
94
+ package_sha = None
95
+
96
+ # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
97
+ # to understand where these functions are invoked in the lifecycle of a
98
+ # Metaflow flow.
99
+ def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
100
+ # Executing NVCT functions requires a non-local datastore.
101
+ if flow_datastore.TYPE not in ("s3", "azure", "gs"):
102
+ raise UnsupportedNvctDatastoreException(flow_datastore.TYPE)
103
+
104
+ # Set internal state.
105
+ self.logger = logger
106
+ self.environment = environment
107
+ self.step = step
108
+ self.flow_datastore = flow_datastore
109
+
110
+ if any([deco.name == "kubernetes" for deco in decos]):
111
+ raise MetaflowException(
112
+ "Step *{step}* is marked for execution both on Kubernetes and "
113
+ "Nvct. Please use one or the other.".format(step=step)
114
+ )
115
+ if any([isinstance(deco, ParallelDecorator) for deco in decos]):
116
+ raise MetaflowException(
117
+ "Step *{step}* contains a @parallel decorator "
118
+ "with the @nvct decorator. @parallel decorators are not currently supported with @nvct.".format(
119
+ step=step
120
+ )
121
+ )
122
+
123
+ # Set run time limit for NVCT.
124
+ self.run_time_limit = get_run_time_limit_for_task(decos)
125
+ if self.run_time_limit < 60:
126
+ raise NvctTimeoutTooShortException(step)
127
+
128
+ self.attributes["ngc_api_key"] = get_ngc_api_key()
129
+
130
+ requested_gpu_type = self.attributes["gpu_type"]
131
+ requested_n_gpus = self.attributes["gpu"]
132
+
133
+ if requested_gpu_type is None:
134
+ requested_gpu_type = DEFAULT_GPU_TYPE
135
+ if requested_gpu_type not in SUPPORTABLE_GPU_TYPES:
136
+ raise RequestedGPUTypeUnavailableException(
137
+ requested_gpu_type, list(SUPPORTABLE_GPU_TYPES.keys())
138
+ )
139
+
140
+ valid_config = None
141
+ available_configurations = SUPPORTABLE_GPU_TYPES[requested_gpu_type]
142
+ for each_config in available_configurations:
143
+ if each_config["n_gpus"] == requested_n_gpus:
144
+ valid_config = each_config
145
+ break
146
+
147
+ if valid_config is None:
148
+ raise UnsupportedNvctConfigurationException(
149
+ requested_n_gpus,
150
+ requested_gpu_type,
151
+ available_configurations,
152
+ step,
153
+ )
154
+
155
+ self.attributes["instance_type"] = valid_config["instance_type"]
156
+ self.attributes["gpu_type"] = requested_gpu_type
157
+ self.attributes["backend"] = valid_config["backend"]
158
+
159
+ def runtime_init(self, flow, graph, package, run_id):
160
+ # Set some more internal state.
161
+ self.flow = flow
162
+ self.graph = graph
163
+ self.package = package
164
+ self.run_id = run_id
165
+
166
+ def runtime_task_created(
167
+ self, task_datastore, task_id, split_index, input_paths, is_cloned, ubf_context
168
+ ):
169
+ if not is_cloned:
170
+ self._save_package_once(self.flow_datastore, self.package)
171
+
172
+ def runtime_step_cli(
173
+ self, cli_args, retry_count, max_user_code_retries, ubf_context
174
+ ):
175
+ if retry_count <= max_user_code_retries:
176
+ # after all attempts to run the user code have failed, we don't need
177
+ # to execute on NVCF anymore. We can execute possible fallback
178
+ # code locally.
179
+ cli_args.commands = ["nvct", "step"]
180
+ cli_args.command_args.append(self.package_sha)
181
+ cli_args.command_args.append(self.package_url)
182
+ cli_options = {
183
+ "gpu_type": self.attributes["gpu_type"],
184
+ "instance_type": self.attributes["instance_type"],
185
+ "backend": self.attributes["backend"],
186
+ "ngc_api_key": self.attributes["ngc_api_key"],
187
+ }
188
+ cli_args.command_options.update(cli_options)
189
+ cli_args.entrypoint[0] = sys.executable
190
+
191
+ def task_pre_step(
192
+ self,
193
+ step_name,
194
+ task_datastore,
195
+ metadata,
196
+ run_id,
197
+ task_id,
198
+ flow,
199
+ graph,
200
+ retry_count,
201
+ max_retries,
202
+ ubf_context,
203
+ inputs,
204
+ ):
205
+ self.metadata = metadata
206
+ self.task_datastore = task_datastore
207
+
208
+ # task_pre_step may run locally if fallback is activated for @catch
209
+ # decorator.
210
+
211
+ if "NVCT_CONTEXT" in os.environ:
212
+ meta = {}
213
+
214
+ meta["nvct-task-id"] = os.environ.get("NVCT_TASK_ID")
215
+ meta["nvct-task-name"] = os.environ.get("NVCT_TASK_NAME")
216
+ meta["nvct-ncaid"] = os.environ.get("NVCT_NCA_ID")
217
+ meta["nvct-progress-file-path"] = os.environ.get("NVCT_PROGRESS_FILE_PATH")
218
+ meta["nvct-results-dir"] = os.environ.get("NVCT_RESULTS_DIR")
219
+
220
+ entries = [
221
+ MetaDatum(
222
+ field=k,
223
+ value=v,
224
+ type=k,
225
+ tags=["attempt_id:{0}".format(retry_count)],
226
+ )
227
+ for k, v in meta.items()
228
+ if v is not None
229
+ ]
230
+ # Register book-keeping metadata for debugging.
231
+ metadata.register_metadata(run_id, step_name, task_id, entries)
232
+
233
+ self._save_logs_sidecar = Sidecar("save_logs_periodically")
234
+ self._save_logs_sidecar.start()
235
+
236
+ def task_finished(
237
+ self, step_name, flow, graph, is_task_ok, retry_count, max_retries
238
+ ):
239
+ # task_finished may run locally if fallback is activated for @catch
240
+ # decorator.
241
+ if "NVCT_CONTEXT" in os.environ:
242
+ # If `local` metadata is configured, we would need to copy task
243
+ # execution metadata from the NVCT container to user's
244
+ # local file system after the user code has finished execution.
245
+ # This happens via datastore as a communication bridge.
246
+ if hasattr(self, "metadata") and self.metadata.TYPE == "local":
247
+ sync_local_metadata_to_datastore(
248
+ DATASTORE_LOCAL_DIR, self.task_datastore
249
+ )
250
+
251
+ try:
252
+ self._save_logs_sidecar.terminate()
253
+ except:
254
+ # Best effort kill
255
+ pass
256
+
257
+ @classmethod
258
+ def _save_package_once(cls, flow_datastore, package):
259
+ if cls.package_url is None:
260
+ cls.package_url, cls.package_sha = flow_datastore.save_data(
261
+ [package.blob], len_hint=1
262
+ )[0]
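The net effect of the validation in step_init is a lookup from (gpu, gpu_type) to a concrete instance_type and backend. A hedged sketch of that resolution written as a standalone helper over the SUPPORTABLE_GPU_TYPES table defined above (the step name passed to the exception is a placeholder):

    def resolve_nvct_config(n_gpus, gpu_type=None):
        gpu_type = gpu_type or DEFAULT_GPU_TYPE
        if gpu_type not in SUPPORTABLE_GPU_TYPES:
            raise RequestedGPUTypeUnavailableException(
                gpu_type, list(SUPPORTABLE_GPU_TYPES.keys())
            )
        for cfg in SUPPORTABLE_GPU_TYPES[gpu_type]:
            if cfg["n_gpus"] == n_gpus:
                return cfg["instance_type"], cfg["backend"]
        raise UnsupportedNvctConfigurationException(
            n_gpus, gpu_type, SUPPORTABLE_GPU_TYPES[gpu_type], "<step>"
        )

    # @nvct(gpu=4, gpu_type="H100") resolves to ("GCP.GPU.H100_4x", "gcp-asia-se-1a").
    print(resolve_nvct_config(4, "H100"))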
@@ -0,0 +1,218 @@
1
+ import os
2
+ import re
3
+ import time
4
+ import math
5
+ import shlex
6
+ import atexit
7
+
8
+ from metaflow import util
9
+ from metaflow.mflog import (
10
+ BASH_SAVE_LOGS,
11
+ bash_capture_logs,
12
+ export_mflog_env_vars,
13
+ tail_logs,
14
+ get_log_tailer,
15
+ )
16
+
17
+ from .nvct import NVCTClient, NVCTTask, NVCTRequest
18
+ from .exceptions import (
19
+ NvctKilledException,
20
+ NvctExecutionException,
21
+ NvctTaskFailedException,
22
+ )
23
+
24
+ # Constants for Metaflow logs
25
+ LOGS_DIR = "$PWD/.logs"
26
+ STDOUT_FILE = "mflog_stdout"
27
+ STDERR_FILE = "mflog_stderr"
28
+ STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
29
+ STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
30
+ NVCT_WRAPPER = "/usr/local/bin/nvct-wrapper.sh"
31
+
32
+
33
+ class NvctRunner:
34
+ def __init__(
35
+ self,
36
+ metadata,
37
+ datastore,
38
+ environment,
39
+ gpu_type,
40
+ instance_type,
41
+ backend,
42
+ ngc_api_key,
43
+ ):
44
+ self.metadata = metadata
45
+ self.datastore = datastore
46
+ self.environment = environment
47
+ self.gpu_type = gpu_type
48
+ self.instance_type = instance_type
49
+ self.backend = backend
50
+ self._ngc_api_key = ngc_api_key
51
+ self.client = None
52
+ self.task = None
53
+ atexit.register(lambda: self.task.cancel() if hasattr(self, "task") else None)
54
+
55
+ def launch_task(
56
+ self,
57
+ step_name,
58
+ step_cli,
59
+ task_spec,
60
+ code_package_sha,
61
+ code_package_url,
62
+ code_package_ds,
63
+ env={},
64
+ max_runtime="PT7H", # <8H allowed for GFN backend
65
+ max_queued="PT120H", # 5 days
66
+ ):
67
+ mflog_expr = export_mflog_env_vars(
68
+ datastore_type=code_package_ds,
69
+ stdout_path=STDOUT_PATH,
70
+ stderr_path=STDERR_PATH,
71
+ **task_spec,
72
+ )
73
+ init_cmds = self.environment.get_package_commands(
74
+ code_package_url, code_package_ds
75
+ )
76
+ init_expr = " && ".join(init_cmds)
77
+ step_expr = bash_capture_logs(
78
+ " && ".join(
79
+ self.environment.bootstrap_commands(step_name, code_package_ds)
80
+ + [step_cli]
81
+ )
82
+ )
83
+ cmd_str = "mkdir -p %s && %s && %s && %s; c=$?; %s; exit $c" % (
84
+ LOGS_DIR,
85
+ mflog_expr,
86
+ init_expr,
87
+ step_expr,
88
+ BASH_SAVE_LOGS,
89
+ )
90
+
91
+ # Add optional initialization script execution
92
+ cmd_str = (
93
+ '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"} && %s'
94
+ % cmd_str
95
+ )
96
+
97
+ cmd_str = shlex.split('bash -c "%s"' % cmd_str)[-1]
98
+
99
+ def modify_python_c(match):
100
+ content = match.group(1)
101
+ # Escape double quotes within the python -c command
102
+ content = content.replace('"', r"\"")
103
+ # Replace outermost double quotes with single quotes
104
+ return 'python -c "%s"' % content
105
+
106
+ # Convert python -c single quotes to double quotes
107
+ cmd_str = re.sub(r"python -c '(.*?)'", modify_python_c, cmd_str)
108
+ cmd_str = cmd_str.replace("'", '"')
109
+ # Create the final command with outer single quotes to pass to NVCT wrapper
110
+ nvct_cmd = f"{NVCT_WRAPPER} bash -c '{cmd_str}'"
111
+
112
+ flow_name = task_spec.get("flow_name")
113
+ run_id = task_spec.get("run_id")
114
+ task_id = task_spec.get("task_id")
115
+ retry_count = task_spec.get("retry_count")
116
+ task_name = f"{flow_name}-{run_id}-{step_name}-{task_id}-{retry_count}"
117
+
118
+ if self.backend != "GFN":
119
+ # if maxRuntimeDuration exceeds 8 hours for a Task on the GFN backend,
120
+ # the request will be rejected.
121
+ # (https://docs.nvidia.com/cloud-functions/user-guide/latest/cloud-function/tasks.html#create-task)
122
+ ## thus, if it is non GFN backend, we increase it to 3 days
123
+ max_runtime = "PT72H"
124
+
125
+ request = (
126
+ NVCTRequest(task_name)
127
+ .container_image("nvcr.io/zhxkmsaasxhw/nvct-base:1.0-jovyan")
128
+ .container_args(nvct_cmd)
129
+ .gpu(
130
+ gpu=self.gpu_type,
131
+ instance_type=self.instance_type,
132
+ backend=self.backend,
133
+ )
134
+ .max_runtime(max_runtime)
135
+ .max_queued(max_queued)
136
+ )
137
+
138
+ for k, v in env.items():
139
+ if v is not None:
140
+ request.env(k, str(v))
141
+
142
+ self.client = NVCTClient(self._ngc_api_key)
143
+ self.task = NVCTTask(self.client, request.to_dict())
144
+
145
+ self.task.submit()
146
+ return self.task.id
147
+
148
+ def wait_for_completion(self, stdout_location, stderr_location, echo=None):
149
+ if not self.task:
150
+ raise NvctExecutionException("No task has been launched")
151
+
152
+ def update_delay(secs_since_start):
153
+ # this sigmoid function reaches
154
+ # - 0.1 after 11 minutes
155
+ # - 0.5 after 15 minutes
156
+ # - 1.0 after 23 minutes
157
+ # in other words, the user will see very frequent updates
158
+ # during the first 10 minutes
159
+ sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
160
+ return 0.5 + sigmoid * 30.0
161
+
162
+ def wait_for_launch(task):
163
+ status = task.status
164
+ echo(
165
+ "Task is starting (%s)..." % status,
166
+ "stderr",
167
+ _id=task.id,
168
+ )
169
+
170
+ t = time.time()
171
+ start_time = time.time()
172
+ while task.is_waiting:
173
+ new_status = task.status
174
+ if status != new_status or (time.time() - t) > 30:
175
+ status = new_status
176
+ echo(
177
+ "Task is starting (%s)..." % status,
178
+ "stderr",
179
+ _id=task.id,
180
+ )
181
+ t = time.time()
182
+ time.sleep(update_delay(time.time() - start_time))
183
+
184
+ _make_prefix = lambda: b"[%s] " % util.to_bytes(self.task.id)
185
+ stdout_tail = get_log_tailer(stdout_location, self.datastore.TYPE)
186
+ stderr_tail = get_log_tailer(stderr_location, self.datastore.TYPE)
187
+
188
+ # 1) Loop until the job has started
189
+ wait_for_launch(self.task)
190
+
191
+ echo(
192
+ "Task is starting (%s)..." % self.task.status,
193
+ "stderr",
194
+ _id=self.task.id,
195
+ )
196
+
197
+ # 2) Tail logs until the job has finished
198
+ tail_logs(
199
+ prefix=_make_prefix(),
200
+ stdout_tail=stdout_tail,
201
+ stderr_tail=stderr_tail,
202
+ echo=echo,
203
+ has_log_updates=lambda: self.task.is_running,
204
+ )
205
+
206
+ if self.task.has_failed:
207
+ raise NvctTaskFailedException(
208
+ f"Task failed with status: {self.task.status}. This could be a transient error. Use @retry to retry."
209
+ )
210
+ else:
211
+ if self.task.is_running:
212
+ # Kill the job if it is still running by throwing an exception.
213
+ raise NvctKilledException("Task failed!")
214
+ echo(
215
+ f"Task finished with status: {self.task.status}",
216
+ "stderr",
217
+ _id=self.task.id,
218
+ )
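The update_delay helper above sets the polling cadence in wait_for_launch: status checks happen roughly every half second at first and back off smoothly toward about 30 seconds once the task has been queued for 20-plus minutes. A small standalone sketch to inspect the schedule (pure arithmetic, mirroring the code above):

    import math

    def update_delay(secs_since_start):
        # Sigmoid centered near 15 minutes (900 s); delay grows from ~0.5 s to ~30.5 s.
        sigmoid = 1.0 / (1.0 + math.exp(-0.01 * secs_since_start + 9.0))
        return 0.5 + sigmoid * 30.0

    for minutes in (1, 5, 11, 15, 23, 60):
        print(minutes, round(update_delay(minutes * 60), 2))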
@@ -0,0 +1,29 @@
1
+ import os
2
+ import json
3
+ import requests
4
+ from urllib.parse import urlparse
5
+ from metaflow.metaflow_config import SERVICE_URL
6
+ from metaflow.metaflow_config_funcs import init_config
7
+
8
+
9
+ def get_ngc_api_key():
10
+ conf = init_config()
11
+ if "OBP_AUTH_SERVER" in conf:
12
+ auth_host = conf["OBP_AUTH_SERVER"]
13
+ else:
14
+ auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]
15
+
16
+ # NOTE: reusing the same auth_host as the one used in NimMetadata,
17
+ # however, user should not need to use nim container to use @nvct.
18
+ # May want to refactor this to a common endpoint.
19
+ nim_info_url = "https://" + auth_host + "/generate/nim"
20
+
21
+ if "METAFLOW_SERVICE_AUTH_KEY" in conf:
22
+ headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
23
+ res = requests.get(nim_info_url, headers=headers)
24
+ else:
25
+ headers = json.loads(os.environ.get("METAFLOW_SERVICE_HEADERS"))
26
+ res = requests.get(nim_info_url, headers=headers)
27
+
28
+ res.raise_for_status()
29
+ return res.json()["nvcf"]["api_key"]
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.149
3
+ Version: 1.1.151
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
7
7
  Description-Content-Type: text/markdown
8
8
  Requires-Dist: boto3
9
9
  Requires-Dist: kubernetes
10
- Requires-Dist: ob-metaflow (==2.15.10.1)
10
+ Requires-Dist: ob-metaflow (==2.15.11.1)
11
11
 
12
12
  # Outerbounds platform package
13
13
 
@@ -1,7 +1,7 @@
1
1
  metaflow_extensions/outerbounds/__init__.py,sha256=TRGvIUMjkfneWtYUFSWoubu_Kf2ekAL4WLbV3IxOj9k,499
2
2
  metaflow_extensions/outerbounds/remote_config.py,sha256=Zpfpjgz68_ZgxlXezjzlsDLo4840rkWuZgwDB_5H57U,4059
3
3
  metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
4
- metaflow_extensions/outerbounds/plugins/__init__.py,sha256=LOQLHsbvHWhhjZUQ99kMVl57wWCZFOxFQGblQ5kYarQ,13140
4
+ metaflow_extensions/outerbounds/plugins/__init__.py,sha256=eHcM_t2Mzlge7B9Dv3VGVM5x8qNZYdLyqBOAC6uRxec,13228
5
5
  metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=_Q9_2EL0Xy77bCRphkwT1aSu8gQXRDOH-Z-RxTUO8N4,2202
6
6
  metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
7
7
  metaflow_extensions/outerbounds/plugins/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -35,6 +35,13 @@ metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=3ZFdYItVpFWnHMOeyV1n
35
35
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=3D-r5XO88Yh2k1EAZFJTe_PwdbhWp5qXflG8AgE4ZIU,9500
36
36
  metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=pSWKaPyGXBEfxG35QA1FQljio8ADjwf-DnPgEsqXoUM,9251
37
37
  metaflow_extensions/outerbounds/plugins/nvcf/utils.py,sha256=DxWSCayfa95e0HJkWacey1s1nxoTpaunGhrb_0Ayv28,133
38
+ metaflow_extensions/outerbounds/plugins/nvct/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
39
+ metaflow_extensions/outerbounds/plugins/nvct/exceptions.py,sha256=1PiV6FdH36CvkmHh5jtsfrsoe3Q_Fo1NomHw5wvgoDM,2886
40
+ metaflow_extensions/outerbounds/plugins/nvct/nvct.py,sha256=Z2ZPWGuHe58au_d6GfHiw6Nl5d8INdLDI5exlsPEOSA,3564
41
+ metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py,sha256=bB9AURhRep9PV_-b-qLHpgw_GPG_xFoq1PeHEgFP1mQ,10104
42
+ metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py,sha256=LaJ_Tk-vNjvrglzSTR-U6pk8f9MtQRKObU9m7vBYtkI,8695
43
+ metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py,sha256=D2sEtVFVWXBqaQEsminZYI_WesR2kADLmwv4lsxVBhk,7091
44
+ metaflow_extensions/outerbounds/plugins/nvct/utils.py,sha256=U4_Fu8H94j_Bbox7mmMhNnlRhlYHqnK28R5w_TMWEFM,1029
38
45
  metaflow_extensions/outerbounds/plugins/ollama/__init__.py,sha256=HEsI5U4ckQby7K2NsGBOdizhPY3WWqXSnXx_IHL7_No,2307
39
46
  metaflow_extensions/outerbounds/plugins/ollama/ollama.py,sha256=KlP8_EmnUoi8-PidyU0IDuENYxKjQaHFC33yGsvaeic,13320
40
47
  metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py,sha256=oI_C3c64XBm7n88FILqHwn-Nnc5DeT_68I67lM9rXaI,2434
@@ -61,7 +68,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3u
61
68
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
62
69
  metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2zwqkvlmFS6bcfYD_CX6CMko9DHQokMaH1iBshA,47
63
70
  metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
64
- ob_metaflow_extensions-1.1.149.dist-info/METADATA,sha256=e-FFRtty5ShVlDzjNq_jpVLZbMA7Ul5LtSDa9_oGcaQ,521
65
- ob_metaflow_extensions-1.1.149.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
66
- ob_metaflow_extensions-1.1.149.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
67
- ob_metaflow_extensions-1.1.149.dist-info/RECORD,,
71
+ ob_metaflow_extensions-1.1.151.dist-info/METADATA,sha256=x5PLR9aAaWhuhLDERP5mSl44te5I0ZNmNxmjBsRgrzg,521
72
+ ob_metaflow_extensions-1.1.151.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
73
+ ob_metaflow_extensions-1.1.151.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
74
+ ob_metaflow_extensions-1.1.151.dist-info/RECORD,,