ob-metaflow-extensions 1.1.45rc3__py2.py3-none-any.whl → 1.5.1__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

Files changed (128) hide show
  1. metaflow_extensions/outerbounds/__init__.py +1 -7
  2. metaflow_extensions/outerbounds/config/__init__.py +35 -0
  3. metaflow_extensions/outerbounds/plugins/__init__.py +186 -57
  4. metaflow_extensions/outerbounds/plugins/apps/__init__.py +0 -0
  5. metaflow_extensions/outerbounds/plugins/apps/app_cli.py +0 -0
  6. metaflow_extensions/outerbounds/plugins/apps/app_utils.py +187 -0
  7. metaflow_extensions/outerbounds/plugins/apps/consts.py +3 -0
  8. metaflow_extensions/outerbounds/plugins/apps/core/__init__.py +15 -0
  9. metaflow_extensions/outerbounds/plugins/apps/core/_state_machine.py +506 -0
  10. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/__init__.py +0 -0
  11. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/__init__.py +4 -0
  12. metaflow_extensions/outerbounds/plugins/apps/core/_vendor/spinner/spinners.py +478 -0
  13. metaflow_extensions/outerbounds/plugins/apps/core/app_config.py +128 -0
  14. metaflow_extensions/outerbounds/plugins/apps/core/app_deploy_decorator.py +330 -0
  15. metaflow_extensions/outerbounds/plugins/apps/core/artifacts.py +0 -0
  16. metaflow_extensions/outerbounds/plugins/apps/core/capsule.py +958 -0
  17. metaflow_extensions/outerbounds/plugins/apps/core/click_importer.py +24 -0
  18. metaflow_extensions/outerbounds/plugins/apps/core/code_package/__init__.py +3 -0
  19. metaflow_extensions/outerbounds/plugins/apps/core/code_package/code_packager.py +618 -0
  20. metaflow_extensions/outerbounds/plugins/apps/core/code_package/examples.py +125 -0
  21. metaflow_extensions/outerbounds/plugins/apps/core/config/__init__.py +15 -0
  22. metaflow_extensions/outerbounds/plugins/apps/core/config/cli_generator.py +165 -0
  23. metaflow_extensions/outerbounds/plugins/apps/core/config/config_utils.py +966 -0
  24. metaflow_extensions/outerbounds/plugins/apps/core/config/schema_export.py +299 -0
  25. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_configs.py +233 -0
  26. metaflow_extensions/outerbounds/plugins/apps/core/config/typed_init_generator.py +537 -0
  27. metaflow_extensions/outerbounds/plugins/apps/core/config/unified_config.py +1125 -0
  28. metaflow_extensions/outerbounds/plugins/apps/core/config_schema.yaml +337 -0
  29. metaflow_extensions/outerbounds/plugins/apps/core/dependencies.py +115 -0
  30. metaflow_extensions/outerbounds/plugins/apps/core/deployer.py +959 -0
  31. metaflow_extensions/outerbounds/plugins/apps/core/experimental/__init__.py +89 -0
  32. metaflow_extensions/outerbounds/plugins/apps/core/perimeters.py +87 -0
  33. metaflow_extensions/outerbounds/plugins/apps/core/secrets.py +164 -0
  34. metaflow_extensions/outerbounds/plugins/apps/core/utils.py +233 -0
  35. metaflow_extensions/outerbounds/plugins/apps/core/validations.py +17 -0
  36. metaflow_extensions/outerbounds/plugins/apps/deploy_decorator.py +201 -0
  37. metaflow_extensions/outerbounds/plugins/apps/supervisord_utils.py +243 -0
  38. metaflow_extensions/outerbounds/plugins/auth_server.py +28 -8
  39. metaflow_extensions/outerbounds/plugins/aws/__init__.py +4 -0
  40. metaflow_extensions/outerbounds/plugins/aws/assume_role.py +3 -0
  41. metaflow_extensions/outerbounds/plugins/aws/assume_role_decorator.py +118 -0
  42. metaflow_extensions/outerbounds/plugins/card_utilities/__init__.py +0 -0
  43. metaflow_extensions/outerbounds/plugins/card_utilities/async_cards.py +142 -0
  44. metaflow_extensions/outerbounds/plugins/card_utilities/extra_components.py +545 -0
  45. metaflow_extensions/outerbounds/plugins/card_utilities/injector.py +70 -0
  46. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/__init__.py +2 -0
  47. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/coreweave.py +71 -0
  48. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/external_chckpt.py +85 -0
  49. metaflow_extensions/outerbounds/plugins/checkpoint_datastores/nebius.py +73 -0
  50. metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py +0 -0
  51. metaflow_extensions/outerbounds/plugins/fast_bakery/baker.py +110 -0
  52. metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +391 -0
  53. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +188 -0
  54. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py +54 -0
  55. metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py +50 -0
  56. metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +79 -0
  57. metaflow_extensions/outerbounds/plugins/kubernetes/pod_killer.py +374 -0
  58. metaflow_extensions/outerbounds/plugins/nim/card.py +140 -0
  59. metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py +101 -0
  60. metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +379 -0
  61. metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
  62. metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
  63. metaflow_extensions/outerbounds/plugins/nvcf/constants.py +3 -0
  64. metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py +94 -0
  65. metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +178 -0
  66. metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +417 -0
  67. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +280 -0
  68. metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +242 -0
  69. metaflow_extensions/outerbounds/plugins/nvcf/utils.py +6 -0
  70. metaflow_extensions/outerbounds/plugins/nvct/__init__.py +0 -0
  71. metaflow_extensions/outerbounds/plugins/nvct/exceptions.py +71 -0
  72. metaflow_extensions/outerbounds/plugins/nvct/nvct.py +131 -0
  73. metaflow_extensions/outerbounds/plugins/nvct/nvct_cli.py +289 -0
  74. metaflow_extensions/outerbounds/plugins/nvct/nvct_decorator.py +286 -0
  75. metaflow_extensions/outerbounds/plugins/nvct/nvct_runner.py +218 -0
  76. metaflow_extensions/outerbounds/plugins/nvct/utils.py +29 -0
  77. metaflow_extensions/outerbounds/plugins/ollama/__init__.py +225 -0
  78. metaflow_extensions/outerbounds/plugins/ollama/constants.py +1 -0
  79. metaflow_extensions/outerbounds/plugins/ollama/exceptions.py +22 -0
  80. metaflow_extensions/outerbounds/plugins/ollama/ollama.py +1924 -0
  81. metaflow_extensions/outerbounds/plugins/ollama/status_card.py +292 -0
  82. metaflow_extensions/outerbounds/plugins/optuna/__init__.py +48 -0
  83. metaflow_extensions/outerbounds/plugins/perimeters.py +19 -5
  84. metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py +70 -0
  85. metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py +88 -0
  86. metaflow_extensions/outerbounds/plugins/profilers/simple_card_decorator.py +96 -0
  87. metaflow_extensions/outerbounds/plugins/s3_proxy/__init__.py +7 -0
  88. metaflow_extensions/outerbounds/plugins/s3_proxy/binary_caller.py +132 -0
  89. metaflow_extensions/outerbounds/plugins/s3_proxy/constants.py +11 -0
  90. metaflow_extensions/outerbounds/plugins/s3_proxy/exceptions.py +13 -0
  91. metaflow_extensions/outerbounds/plugins/s3_proxy/proxy_bootstrap.py +59 -0
  92. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_api.py +93 -0
  93. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_decorator.py +250 -0
  94. metaflow_extensions/outerbounds/plugins/s3_proxy/s3_proxy_manager.py +225 -0
  95. metaflow_extensions/outerbounds/plugins/secrets/__init__.py +0 -0
  96. metaflow_extensions/outerbounds/plugins/secrets/secrets.py +204 -0
  97. metaflow_extensions/outerbounds/plugins/snowflake/__init__.py +3 -0
  98. metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py +378 -0
  99. metaflow_extensions/outerbounds/plugins/snowpark/__init__.py +0 -0
  100. metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py +309 -0
  101. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py +277 -0
  102. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_client.py +150 -0
  103. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_decorator.py +273 -0
  104. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_exceptions.py +13 -0
  105. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_job.py +241 -0
  106. metaflow_extensions/outerbounds/plugins/snowpark/snowpark_service_spec.py +259 -0
  107. metaflow_extensions/outerbounds/plugins/tensorboard/__init__.py +50 -0
  108. metaflow_extensions/outerbounds/plugins/torchtune/__init__.py +163 -0
  109. metaflow_extensions/outerbounds/plugins/vllm/__init__.py +255 -0
  110. metaflow_extensions/outerbounds/plugins/vllm/constants.py +1 -0
  111. metaflow_extensions/outerbounds/plugins/vllm/exceptions.py +1 -0
  112. metaflow_extensions/outerbounds/plugins/vllm/status_card.py +352 -0
  113. metaflow_extensions/outerbounds/plugins/vllm/vllm_manager.py +621 -0
  114. metaflow_extensions/outerbounds/profilers/gpu.py +131 -47
  115. metaflow_extensions/outerbounds/remote_config.py +53 -16
  116. metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +138 -2
  117. metaflow_extensions/outerbounds/toplevel/ob_internal.py +4 -0
  118. metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py +1 -0
  119. metaflow_extensions/outerbounds/toplevel/plugins/optuna/__init__.py +1 -0
  120. metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py +1 -0
  121. metaflow_extensions/outerbounds/toplevel/plugins/torchtune/__init__.py +1 -0
  122. metaflow_extensions/outerbounds/toplevel/plugins/vllm/__init__.py +1 -0
  123. metaflow_extensions/outerbounds/toplevel/s3_proxy.py +88 -0
  124. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/METADATA +2 -2
  125. ob_metaflow_extensions-1.5.1.dist-info/RECORD +133 -0
  126. ob_metaflow_extensions-1.1.45rc3.dist-info/RECORD +0 -19
  127. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/WHEEL +0 -0
  128. {ob_metaflow_extensions-1.1.45rc3.dist-info → ob_metaflow_extensions-1.5.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,280 @@
1
+ import json
2
+ import os
3
+ import sys
4
+ import time
5
+ import traceback
6
+
7
+ from metaflow import util, Run
8
+ from metaflow._vendor import click
9
+ from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY
10
+ from metaflow.metadata_provider.util import sync_local_metadata_from_datastore
11
+ from metaflow.metaflow_config import (
12
+ CARD_S3ROOT,
13
+ DATASTORE_LOCAL_DIR,
14
+ DATASTORE_SYSROOT_S3,
15
+ DATATOOLS_S3ROOT,
16
+ DEFAULT_METADATA,
17
+ SERVICE_HEADERS,
18
+ SERVICE_URL,
19
+ DEFAULT_SECRETS_BACKEND_TYPE,
20
+ DEFAULT_AWS_CLIENT_PROVIDER,
21
+ AWS_SECRETS_MANAGER_DEFAULT_REGION,
22
+ S3_ENDPOINT_URL,
23
+ AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
24
+ DATASTORE_SYSROOT_AZURE,
25
+ CARD_AZUREROOT,
26
+ DATASTORE_SYSROOT_GS,
27
+ CARD_GSROOT,
28
+ KUBERNETES_SANDBOX_INIT_SCRIPT,
29
+ OTEL_ENDPOINT,
30
+ NVIDIA_HEARTBEAT_THRESHOLD,
31
+ )
32
+ from metaflow.mflog import TASK_LOG_SOURCE
33
+
34
+ from .nvcf import Nvcf
35
+ from .exceptions import NvcfKilledException
36
+
37
+
38
@click.group()
def cli():
    # Root click group of this module; the `nvidia` group is registered on it.
    # A comment is used instead of a docstring because click would surface a
    # docstring as this group's help text (no explicit `help=` is given).
    pass
41
+
42
+
43
@cli.group(help="Commands related to nvidia.")
def nvidia():
    # Container group only: the `list`, `kill` and `step` commands attach to
    # it through `@nvidia.command(...)`.
    pass
46
+
47
+
48
@nvidia.command(help="List steps / tasks running as an nvidia job.")
@click.option(
    "--run-id",
    default=None,
    required=True,
    help="List unfinished tasks corresponding to the run id.",
)
@click.pass_context
def list(ctx, run_id):
    """Echo one line per unfinished task of a run that carries NVCF metadata.

    A task is reported when it is not finished and its metadata contains the
    ``nvcf-function-id`` key. Lines are collected first and echoed only after
    the whole run has been walked.

    Parameters
    ----------
    ctx : click.Context
        Click context; ``ctx.obj.flow.name`` and ``ctx.obj.echo`` are used.
    run_id : str
        Run id, combined with the flow name into the run pathspec.
    """
    flow_name = ctx.obj.flow.name
    run_obj = Run(pathspec=f"{flow_name}/{run_id}", _namespace_check=False)
    running_invocations = []

    for each_step in run_obj:
        for each_task in each_step:
            if each_task.finished:
                continue

            # Read the metadata once per task and reuse it for both the
            # membership test and the attempt lookup.
            task_metadata = each_task.metadata_dict
            if "nvcf-function-id" not in task_metadata:
                continue

            attempt = task_metadata.get("attempt")
            flow_name, run_id, step_name, task_id = each_task.pathspec.split("/")
            running_invocations.append(
                f"Flow Name: {flow_name}, Run ID: {run_id}, Step Name: {step_name}, Task ID: {task_id}, Retry Count: {attempt}"
            )

    for each_invocation in running_invocations:
        ctx.obj.echo(each_invocation)
75
+
76
+
77
@nvidia.command(help="Kill steps / tasks running as an nvidia job.")
@click.option(
    "--run-id",
    default=None,
    required=True,
    help="Terminate unfinished tasks corresponding to the run id.",
)
@click.pass_context
def kill(ctx, run_id):
    """Emit a tombstone for ``<flow name>/<run_id>`` via the heartbeat store.

    The tombstone is written under the ``nvcf_heartbeats`` folder using a
    storage backend built from ``ctx.obj.datastore_impl``. How running tasks
    react to it is decided by ``HeartbeatStore`` (not visible here);
    presumably they poll for the tombstone and stop.
    """
    # Imported lazily, as in the original, so the module is only loaded when
    # the command actually runs.
    from metaflow_extensions.outerbounds.plugins.nvcf.heartbeat_store import (
        HeartbeatStore,
    )

    datastore_cls = ctx.obj.datastore_impl
    backend = datastore_cls(datastore_cls.datastore_root)
    heartbeat_store = HeartbeatStore(main_pid=None, storage_backend=backend)

    heartbeat_store.emit_tombstone(
        tombstone_prefix=f"{ctx.obj.flow.name}/{run_id}",
        folder_name="nvcf_heartbeats",
    )
101
+
102
+
103
@nvidia.command(
    help="Execute a single task using @nvidia. This command calls the "
    "top-level step command inside an nvidia job with the given options. "
    "Typically you do not call this command directly; it is used internally by "
    "Metaflow."
)
@click.argument("step-name")
@click.argument("code-package-sha")
@click.argument("code-package-url")
@click.option("--function-id", help="NVCF function id.")
@click.option("--ngc-api-key", help="NGC API key.")
@click.option(
    "--queue-timeout", default=5 * 24 * 3600, help="Queue timeout in seconds."
)
@click.option("--run-id", help="Passed to the top-level 'step'.")
@click.option("--task-id", help="Passed to the top-level 'step'.")
@click.option("--input-paths", help="Passed to the top-level 'step'.")
@click.option("--split-index", help="Passed to the top-level 'step'.")
@click.option("--clone-path", help="Passed to the top-level 'step'.")
@click.option("--clone-run-id", help="Passed to the top-level 'step'.")
@click.option(
    "--tag", multiple=True, default=None, help="Passed to the top-level 'step'."
)
@click.option("--namespace", default=None, help="Passed to the top-level 'step'.")
@click.option("--retry-count", default=0, help="Passed to the top-level 'step'.")
@click.option(
    "--max-user-code-retries", default=0, help="Passed to the top-level 'step'."
)
@click.pass_context
def step(
    ctx,
    step_name,
    code_package_sha,
    code_package_url,
    function_id,
    ngc_api_key,
    queue_timeout,
    **kwargs,
):
    """Launch one task through ``Nvcf`` and wait for it to complete.

    Builds the top-level ``step`` command line, assembles the environment for
    the remote task, launches the job with ``Nvcf.launch_job`` and then blocks
    in ``Nvcf.wait``.

    Parameters
    ----------
    ctx : click.Context
        Click context carrying the flow, graph, datastore, metadata, monitor
        and environment objects on ``ctx.obj``.
    step_name : str
        Name of the step to execute.
    code_package_sha, code_package_url : str
        Identify the code package; forwarded to the job and exported as
        ``METAFLOW_CODE_SHA`` / ``METAFLOW_CODE_URL``.
    function_id, ngc_api_key, queue_timeout
        Passed unchanged to the ``Nvcf`` constructor.
    **kwargs
        Remaining options, forwarded as CLI options to the top-level ``step``.

    Notes
    -----
    Exits the process with ``METAFLOW_EXIT_DISALLOW_RETRY`` when launching the
    job raises, or when waiting raises ``NvcfKilledException``. Any other
    exception raised while waiting propagates after metadata has been synced.
    """

    def echo(msg, stream="stderr", _id=None, **kwargs):
        msg = util.to_unicode(msg)
        if _id:
            msg = "[%s] %s" % (_id, msg)
        # NOTE(review): `stream` defaults to the string "stderr" but is
        # compared with the `sys.stderr` object, so `err` is False unless a
        # caller passes the stream object itself. Left as is because changing
        # it would re-route log output; confirm the intent.
        ctx.obj.echo_always(msg, err=(stream == sys.stderr), **kwargs)

    executable = ctx.obj.environment.executable(step_name)
    entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))

    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))

    # Long input paths are cut into chunks of `max_size` characters, exported
    # as METAFLOW_INPUT_PATHS_<n> variables, and the CLI option is replaced by
    # `${...}` references to those variables.
    input_paths = kwargs.get("input_paths")
    split_vars = None
    if input_paths:
        max_size = 30 * 1024
        split_vars = {
            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
            for i in range(0, len(input_paths), max_size)
        }
        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())

    step_args = " ".join(util.dict_to_cli_options(kwargs))
    step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
        entrypoint=entrypoint,
        top_args=top_args,
        step=step_name,
        step_args=step_args,
    )
    node = ctx.obj.graph[step_name]

    # Get retry information
    retry_count = kwargs.get("retry_count", 0)
    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
    minutes_between_retries = None
    if retry_deco:
        minutes_between_retries = int(
            retry_deco[0].attributes.get("minutes_between_retries", 1)
        )

    task_spec = {
        "flow_name": ctx.obj.flow.name,
        "step_name": step_name,
        "run_id": kwargs["run_id"],
        "task_id": kwargs["task_id"],
        "retry_count": str(retry_count),
    }

    env = {
        "METAFLOW_CODE_SHA": code_package_sha,
        "METAFLOW_CODE_URL": code_package_url,
        "METAFLOW_CODE_DS": ctx.obj.flow_datastore.TYPE,
        "METAFLOW_SERVICE_URL": SERVICE_URL,
        "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
        "METAFLOW_DATASTORE_SYSROOT_S3": DATASTORE_SYSROOT_S3,
        "METAFLOW_DATATOOLS_S3ROOT": DATATOOLS_S3ROOT,
        "METAFLOW_DEFAULT_DATASTORE": ctx.obj.flow_datastore.TYPE,
        "METAFLOW_USER": util.get_username(),
        "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
        "METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
        "METAFLOW_RUNTIME_ENVIRONMENT": "nvcf",
        "METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE": DEFAULT_SECRETS_BACKEND_TYPE,
        "METAFLOW_DEFAULT_AWS_CLIENT_PROVIDER": DEFAULT_AWS_CLIENT_PROVIDER,
        "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION": AWS_SECRETS_MANAGER_DEFAULT_REGION,
        "METAFLOW_S3_ENDPOINT_URL": S3_ENDPOINT_URL,
        "METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT": AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
        "METAFLOW_DATASTORE_SYSROOT_AZURE": DATASTORE_SYSROOT_AZURE,
        "METAFLOW_CARD_AZUREROOT": CARD_AZUREROOT,
        "METAFLOW_DATASTORE_SYSROOT_GS": DATASTORE_SYSROOT_GS,
        "METAFLOW_CARD_GSROOT": CARD_GSROOT,
        "METAFLOW_INIT_SCRIPT": KUBERNETES_SANDBOX_INIT_SCRIPT,
        "METAFLOW_OTEL_ENDPOINT": OTEL_ENDPOINT,
        "METAFLOW_NVIDIA_HEARTBEAT_THRESHOLD": str(NVIDIA_HEARTBEAT_THRESHOLD),
    }

    # Variables from an @environment decorator override the defaults above.
    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
    if env_deco:
        env.update(env_deco[0].attributes["vars"])

    # Add the environment variables related to the input-paths argument
    if split_vars:
        env.update(split_vars)

    # `minutes_between_retries` stays None when the step has no @retry
    # decorator; formatting it with %d would raise TypeError, so the delay is
    # applied only when one is actually configured.
    if retry_count and minutes_between_retries is not None:
        ctx.obj.echo_always(
            "Sleeping %d minutes before the next retry" % minutes_between_retries
        )
        time.sleep(minutes_between_retries * 60)

    # this information is needed for log tailing
    ds = ctx.obj.flow_datastore.get_task_datastore(
        mode="w",
        run_id=kwargs["run_id"],
        step_name=step_name,
        task_id=kwargs["task_id"],
        attempt=int(retry_count),
    )
    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")

    def _sync_metadata():
        # Only the `local` metadata provider needs its records pulled back
        # from the datastore.
        if ctx.obj.metadata.TYPE == "local":
            sync_local_metadata_from_datastore(
                DATASTORE_LOCAL_DIR,
                ctx.obj.flow_datastore.get_task_datastore(
                    kwargs["run_id"], step_name, kwargs["task_id"]
                ),
            )

    nvcf = Nvcf(
        ctx.obj.metadata,
        ctx.obj.flow_datastore,
        ctx.obj.environment,
        function_id,
        ngc_api_key,
        queue_timeout,
    )
    try:
        with ctx.obj.monitor.measure("metaflow.nvcf.launch_job"):
            nvcf.launch_job(
                step_name,
                step_cli,
                task_spec,
                code_package_sha,
                code_package_url,
                ctx.obj.flow_datastore.TYPE,
                env=env,
            )
    except Exception:
        # A failed launch is reported and never retried.
        traceback.print_exc()
        _sync_metadata()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    try:
        nvcf.wait(stdout_location, stderr_location, echo=echo)
    except NvcfKilledException:
        # don't retry killed tasks
        traceback.print_exc()
        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
    finally:
        _sync_metadata()
@@ -0,0 +1,242 @@
1
+ import os
2
+ import sys
3
+ import json
4
+ import requests
5
+ from urllib.parse import urlparse
6
+
7
+ from metaflow import current
8
+ from metaflow.exception import MetaflowException
9
+ from metaflow.decorators import StepDecorator
10
+ from metaflow.plugins.parallel_decorator import ParallelDecorator
11
+ from metaflow.metadata_provider.util import sync_local_metadata_to_datastore
12
+ from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
13
+ from metaflow.sidecar import Sidecar
14
+ from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
15
+ from metaflow.metadata_provider import MetaDatum
16
+ from metaflow.metaflow_config_funcs import init_config
17
+ from .constants import SUPPORTABLE_GPU_TYPES, DEFAULT_GPU_TYPE
18
+ from .exceptions import (
19
+ RequestedGPUTypeUnavailableException,
20
+ UnsupportedNvcfConfigurationException,
21
+ UnsupportedNvcfDatastoreException,
22
+ NvcfTimeoutTooShortException,
23
+ NvcfQueueTimeoutTooShortException,
24
+ )
25
+
26
+ from metaflow.metaflow_config import SERVICE_URL
27
+
28
+
29
class NvcfDecorator(StepDecorator):
    """
    Specifies that this step should execute on DGX cloud.

    Parameters
    ----------
    gpu : int
        Number of GPUs to use.
    gpu_type : str
        Type of Nvidia GPU to use.
    queue_timeout : int
        Time to keep the job in NVCF's queue.
    """

    name = "nvidia"
    defaults = {
        "gpu": 1,
        "gpu_type": None,
        "queue_timeout": 5 * 24 * 3600,  # Default 5 days in seconds
    }

    # Class-level cache filled by `_save_package_once`, so the code package is
    # saved at most once per process.
    package_url = None
    package_sha = None

    # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
    # to understand where these functions are invoked in the lifecycle of a
    # Metaflow flow.
    def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
        """Validate the step configuration and resolve the NVCF function to use.

        Stores ``ngc_api_key`` and ``function_id`` in ``self.attributes``.

        Raises
        ------
        UnsupportedNvcfDatastoreException
            If the datastore type is not one of s3, azure or gs.
        MetaflowException
            If the step also carries @kubernetes or @parallel, or if no
            credentials are available to query the auth server.
        NvcfTimeoutTooShortException
            If the run time limit of the step is below 60 seconds.
        RequestedGPUTypeUnavailableException
            If the GPU type is not in ``SUPPORTABLE_GPU_TYPES``.
        UnsupportedNvcfConfigurationException
            If no available function matches (gpu, gpu_type).
        NvcfQueueTimeoutTooShortException
            If ``queue_timeout`` is not an int or is below 60.
        """
        # Executing NVCF functions requires a non-local datastore.
        if flow_datastore.TYPE not in ("s3", "azure", "gs"):
            raise UnsupportedNvcfDatastoreException(flow_datastore.TYPE)

        # Set internal state.
        self.logger = logger
        self.environment = environment
        self.step = step
        self.flow_datastore = flow_datastore

        if any(deco.name == "kubernetes" for deco in decos):
            raise MetaflowException(
                "Step *{step}* is marked for execution both on Kubernetes and "
                "Nvidia. Please use one or the other.".format(step=step)
            )
        if any(isinstance(deco, ParallelDecorator) for deco in decos):
            raise MetaflowException(
                "Step *{step}* contains a @parallel decorator "
                "with the @nvidia decorator. @parallel decorators are not currently supported with @nvidia.".format(
                    step=step
                )
            )

        # Set run time limit for the NVCF function.
        self.run_time_limit = get_run_time_limit_for_task(decos)
        if self.run_time_limit < 60:
            raise NvcfTimeoutTooShortException(step)

        conf = init_config()
        if "OBP_AUTH_SERVER" in conf:
            auth_host = conf["OBP_AUTH_SERVER"]
        else:
            # Derive the auth host by replacing the first label of the
            # service URL's hostname with "auth".
            auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]

        # NOTE: reusing the same auth_host as the one used in NimMetadata,
        # however, user should not need to use nim container to use @nvidia.
        # May want to refactor this to a common endpoint.
        nim_info_url = "https://" + auth_host + "/generate/nim"

        if "METAFLOW_SERVICE_AUTH_KEY" in conf:
            headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
        else:
            raw_headers = os.environ.get("METAFLOW_SERVICE_HEADERS")
            if raw_headers is None:
                # json.loads(None) would fail with an opaque TypeError.
                raise MetaflowException(
                    "Step *{step}* uses @nvidia but neither "
                    "METAFLOW_SERVICE_AUTH_KEY is configured nor "
                    "METAFLOW_SERVICE_HEADERS is set in the environment.".format(
                        step=step
                    )
                )
            headers = json.loads(raw_headers)

        # NOTE(review): no timeout is passed, so this request can block
        # indefinitely if the auth server does not answer.
        res = requests.get(nim_info_url, headers=headers)
        res.raise_for_status()

        # Parse the response body once and reuse it.
        nvcf_info = res.json()["nvcf"]
        self.attributes["ngc_api_key"] = nvcf_info["api_key"]

        available_functions_info = nvcf_info["functions"]
        requested_gpu_type = self.attributes["gpu_type"]
        n_gpu = self.attributes["gpu"]

        if requested_gpu_type is None:
            requested_gpu_type = DEFAULT_GPU_TYPE
        if requested_gpu_type not in SUPPORTABLE_GPU_TYPES:
            raise RequestedGPUTypeUnavailableException(requested_gpu_type)

        # Map (gpu count, gpu type) -> function id for the task-executor
        # functions only.
        desired_configuration = (n_gpu, requested_gpu_type)
        available_configurations = {}
        for f in available_functions_info:
            if f["model_key"] == "metaflow_task_executor":
                available_configurations[(f["gpu"], f["gpu_type"])] = f["id"]

        if desired_configuration not in available_configurations:
            raise UnsupportedNvcfConfigurationException(
                n_gpu, requested_gpu_type, available_configurations, step
            )
        self.attributes["function_id"] = available_configurations[desired_configuration]

        queue_timeout = self.attributes["queue_timeout"]
        if not isinstance(queue_timeout, int) or queue_timeout < 60:
            raise NvcfQueueTimeoutTooShortException(step)

    def runtime_init(self, flow, graph, package, run_id):
        """Remember the flow, graph, code package and run id for later hooks."""
        # Set some more internal state.
        self.flow = flow
        self.graph = graph
        self.package = package
        self.run_id = run_id

    def runtime_task_created(
        self, task_datastore, task_id, split_index, input_paths, is_cloned, ubf_context
    ):
        """Save the code package for tasks that are not clones."""
        if not is_cloned:
            self._save_package_once(self.flow_datastore, self.package)

    def runtime_step_cli(
        self, cli_args, retry_count, max_user_code_retries, ubf_context
    ):
        """Redirect the task command to ``nvidia step`` while retries remain."""
        # Once all attempts to run the user code have failed
        # (retry_count > max_user_code_retries) `cli_args` is left untouched,
        # so possible fallback code can execute locally instead of on NVCF.
        if retry_count <= max_user_code_retries:
            cli_args.commands = ["nvidia", "step"]
            cli_args.command_args.append(self.package_sha)
            cli_args.command_args.append(self.package_url)
            cli_options = {
                "function_id": self.attributes["function_id"],
                "ngc_api_key": self.attributes["ngc_api_key"],
                "queue_timeout": self.attributes["queue_timeout"],
            }
            cli_args.command_options.update(cli_options)
            cli_args.entrypoint[0] = sys.executable

    def task_pre_step(
        self,
        step_name,
        task_datastore,
        metadata,
        run_id,
        task_id,
        flow,
        graph,
        retry_count,
        max_retries,
        ubf_context,
        inputs,
    ):
        """Register NVCF metadata and start the log sidecar inside an NVCF job."""
        self.metadata = metadata
        self.task_datastore = task_datastore

        # task_pre_step may run locally if fallback is activated for @catch
        # decorator.

        if "NVCF_CONTEXT" in os.environ:
            # Metadata field name -> value of the matching NVCF_* variable.
            meta = {
                "nvcf-function-id": os.environ.get("NVCF_FUNCTION_ID"),
                "nvcf-function-version-id": os.environ.get(
                    "NVCF_FUNCTION_VERSION_ID"
                ),
                "nvcf-region": os.environ.get("NVCF_REGION"),
                "nvcf-ncaid": os.environ.get("NVCF_NCAID"),
                "nvcf-sub": os.environ.get("NVCF_SUB"),
                "nvcf-instancetype": os.environ.get("NVCF_INSTANCETYPE"),
                "nvcf-reqid": os.environ.get("NVCF_REQID"),
                "nvcf-env": os.environ.get("NVCF_ENV"),
                "nvcf-backend": os.environ.get("NVCF_BACKEND"),
                "nvcf-function-name": os.environ.get("NVCF_FUNCTION_NAME"),
                "nvcf-nspectid": os.environ.get("NVCF_NSPECTID"),
            }

            # Variables that are not set are skipped.
            entries = [
                MetaDatum(
                    field=k,
                    value=v,
                    type=k,
                    tags=["attempt_id:{0}".format(retry_count)],
                )
                for k, v in meta.items()
                if v is not None
            ]
            # Register book-keeping metadata for debugging.
            metadata.register_metadata(run_id, step_name, task_id, entries)

            self._save_logs_sidecar = Sidecar("save_logs_periodically")
            self._save_logs_sidecar.start()

    def task_finished(
        self, step_name, flow, graph, is_task_ok, retry_count, max_retries
    ):
        """Sync local metadata to the datastore and stop the log sidecar."""
        # task_finished may run locally if fallback is activated for @catch
        # decorator.
        if "NVCF_CONTEXT" in os.environ:
            # If `local` metadata is configured, we would need to copy task
            # execution metadata from the NVCF container to user's
            # local file system after the user code has finished execution.
            # This happens via datastore as a communication bridge.
            if hasattr(self, "metadata") and self.metadata.TYPE == "local":
                sync_local_metadata_to_datastore(
                    DATASTORE_LOCAL_DIR, self.task_datastore
                )

        try:
            self._save_logs_sidecar.terminate()
        except Exception:
            # Best effort kill: the sidecar may never have been started.
            # `Exception` instead of a bare `except` so KeyboardInterrupt and
            # SystemExit are not swallowed.
            pass

    @classmethod
    def _save_package_once(cls, flow_datastore, package):
        """Save the code package blob unless a package URL is already cached."""
        if cls.package_url is None:
            cls.package_url, cls.package_sha = flow_datastore.save_data(
                [package.blob], len_hint=1
            )[0]
@@ -0,0 +1,6 @@
1
+ import sys
2
+
3
+
4
def warning_message(message, prefix="[@nvidia]"):
    """Print ``message`` to standard error, preceded by ``prefix`` and a space."""
    print(f"{prefix} {message}", file=sys.stderr)
@@ -0,0 +1,71 @@
1
+ from metaflow.exception import MetaflowException
2
+
3
+
4
class NvctExecutionException(MetaflowException):
    # Signals that an Nvct task could not be executed.
    headline = "Nvct task couldn't be executed"
6
+
7
+
8
class NvctTaskFailedException(MetaflowException):
    # Signals that an Nvct task ran but failed.
    headline = "Nvct task failed"
10
+
11
+
12
class NvctKilledException(MetaflowException):
    # Signals that an Nvct job was killed.
    headline = "Nvct job killed"
14
+
15
+
16
class RequestedGPUTypeUnavailableException(MetaflowException):
    # Raised for a GPU type that @nvct does not offer; the message names the
    # requested type and the supported alternatives.
    headline = "[@nvct RequestedGPUTypeUnavailableException] GPU type unavailable."

    def __init__(self, requested_gpu_type, available_gpus):
        problem = f"The requested GPU type @nvct(..., gpu_type='{requested_gpu_type}') is not available. "
        remedy = f"Please choose from the following supported GPU types when using @nvct: {available_gpus}"
        super().__init__(problem + remedy)
25
+
26
+
27
class UnsupportedNvctConfigurationException(MetaflowException):
    # Raised when the (gpu, gpu_type) pair requested for a step is not
    # offered; the message lists the configurations that are available.
    headline = (
        "[@nvct UnsupportedNvctConfigurationException] Unsupported GPU configuration"
    )

    def __init__(self, n_gpu, gpu_type, available_configurations, step):
        pieces = [
            f"The requested configuration of @nvct(gpu={n_gpu}, gpu_type='{gpu_type}') for @step {step} is not available."
        ]
        if len(available_configurations) != 0:
            pieces.append(
                f"\n\nAvailable configurations for your deployment with {gpu_type} include: \n\t- {self._display(gpu_type, available_configurations)}"
            )
            pieces.append(
                "\n\nPlease contact Outerbounds support if you wish to use a configuration not listed above."
            )
        else:
            pieces.append(
                "\n\nNo configurations are available in your Outerbounds deployment."
                " Please contact Outerbounds support if you wish to use @nvct."
            )
        super().__init__("".join(pieces))

    def _display(self, gpu_type, configs):
        # Render every configuration as the decorator call a user would
        # write, one bullet per entry. Each entry is indexed by "n_gpus".
        return "\n\t- ".join(
            f"@nvct(gpu={entry['n_gpus']}, gpu_type='{gpu_type}')" for entry in configs
        )
50
+
51
+
52
class UnsupportedNvctDatastoreException(MetaflowException):
    # Raised when @nvct is used with a datastore other than s3, azure or gs.
    headline = "[@nvct UnsupportedNvctDatastoreException] Unsupported datastore"

    def __init__(self, ds_type):
        # The two sentences are adjacent literals; the separating space was
        # missing, so the message read "...at the moment.Current datastore".
        msg = (
            "The *@nvct* decorator requires --datastore=s3 or --datastore=azure or --datastore=gs at the moment. "
            f"Current datastore type: {ds_type}."
        )
        super(UnsupportedNvctDatastoreException, self).__init__(msg)
61
+
62
+
63
class NvctTimeoutTooShortException(MetaflowException):
    # Raised when a step's timeout is below the 60 seconds that @nvct needs.
    headline = "[@nvct NvctTimeoutTooShortException] Timeout too short"

    def __init__(self, step):
        text = (
            f"The timeout for step *{step}* should be at least 60 seconds for "
            "execution with @nvct"
        )
        super().__init__(text)