ob-metaflow-extensions 1.1.68__py2.py3-none-any.whl → 1.1.70__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this version of ob-metaflow-extensions has been flagged as a potentially problematic release.

--- a/metaflow_extensions/outerbounds/plugins/__init__.py
+++ b/metaflow_extensions/outerbounds/plugins/__init__.py
@@ -240,5 +240,6 @@ class ObpGcpAuthProvider(object):
 
 
 GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
-
+CLIS_DESC = [("nvcf", ".nvcf.nvcf_cli.cli")]
+STEP_DECORATORS_DESC = [("nvidia", ".nvcf.nvcf_decorator.NvcfDecorator")]
 FLOW_DECORATORS_DESC = [("nim", ".nim.NimDecorator")]
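
The two new *_DESC entries register an `nvcf` command group and an `@nvidia` step decorator with Metaflow's extension mechanism. A rough usage sketch, assuming the decorator is re-exported at the `metaflow` top level as registered step decorators normally are:

from metaflow import FlowSpec, step, nvidia  # assumes the extension exposes `nvidia`


class HelloNvcfFlow(FlowSpec):
    # hypothetical flow: `start` runs remotely on NVIDIA Cloud Functions
    @nvidia
    @step
    def start(self):
        print("running inside an NVCF function")
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    HelloNvcfFlow()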
--- a/metaflow_extensions/outerbounds/plugins/nim/__init__.py
+++ b/metaflow_extensions/outerbounds/plugins/nim/__init__.py
@@ -20,23 +20,18 @@ class NimDecorator(FlowDecorator):
     - 'managed': Outerbounds selects a compute provider based on the model.
     - 🚧 'dataplane': Run in your account.
 
-
     Valid model options
     ----------------
     - 'meta/llama3-8b-instruct': 8B parameter model
     - 'meta/llama3-70b-instruct': 70B parameter model
     - Upon request, any model here: https://nvcf.ngc.nvidia.com/functions?filter=nvidia-functions
 
-    MF Add To Current
-    -----------------
-    current.nim
-
-    Parameters
-    ----------
-    models: list[NIM]
-        List of NIM containers running models in sidecars.
-    backend: str
-        Compute provider to run the NIM container.
+    Parameters
+    ----------
+    models: list[NIM]
+        List of NIM containers running models in sidecars.
+    backend: str
+        Compute provider to run the NIM container.
     """
 
     name = "nim"
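
The docstring change drops the `MF Add To Current` section and keeps only the parameter documentation. For reference, a hypothetical invocation consistent with the documented signature, passing models as the documented model-key strings:

from metaflow import FlowSpec, nim, step


# @nim is a flow decorator; models/backend values are taken from the docstring above
@nim(models=["meta/llama3-8b-instruct"], backend="managed")
class NimExampleFlow(FlowSpec):
    @step
    def start(self):
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    NimExampleFlow()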
--- /dev/null
+++ b/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py
@@ -0,0 +1,249 @@
+import json
+import os
+import time
+from urllib.parse import urlparse
+from urllib.request import HTTPError, Request, URLError, urlopen
+
+from metaflow import util
+from metaflow.exception import MetaflowException
+from metaflow.metaflow_config import SERVICE_URL
+from metaflow.mflog import (
+    BASH_SAVE_LOGS,
+    bash_capture_logs,
+    export_mflog_env_vars,
+    tail_logs,
+    get_log_tailer,
+)
+import requests
+from metaflow.metaflow_config_funcs import init_config
+
+
+class NvcfException(MetaflowException):
+    headline = "Nvidia error"
+
+
+class NvcfKilledException(MetaflowException):
+    headline = "Nvidia job killed"
+
+
+# Redirect structured logs to $PWD/.logs/
+LOGS_DIR = "$PWD/.logs"
+STDOUT_FILE = "mflog_stdout"
+STDERR_FILE = "mflog_stderr"
+STDOUT_PATH = os.path.join(LOGS_DIR, STDOUT_FILE)
+STDERR_PATH = os.path.join(LOGS_DIR, STDERR_FILE)
+
+
+class Nvcf(object):
+    def __init__(self, metadata, datastore, environment):
+        self.metadata = metadata
+        self.datastore = datastore
+        self.environment = environment
+
+    def launch_job(
+        self,
+        step_name,
+        step_cli,
+        task_spec,
+        code_package_sha,
+        code_package_url,
+        code_package_ds,
+        env={},
+    ):
+        mflog_expr = export_mflog_env_vars(
+            datastore_type=code_package_ds,
+            stdout_path=STDOUT_PATH,
+            stderr_path=STDERR_PATH,
+            **task_spec,
+        )
+        init_cmds = self.environment.get_package_commands(
+            code_package_url, code_package_ds
+        )
+        init_expr = " && ".join(init_cmds)
+        step_expr = bash_capture_logs(
+            " && ".join(
+                self.environment.bootstrap_commands(step_name, code_package_ds)
+                + [step_cli]
+            )
+        )
+
+        # construct an entry point that
+        # 1) initializes the mflog environment (mflog_expr)
+        # 2) bootstraps a metaflow environment (init_expr)
+        # 3) executes a task (step_expr)
+
+        cmd_str = "mkdir -p %s && %s && %s && %s; " % (
+            LOGS_DIR,
+            mflog_expr,
+            init_expr,
+            step_expr,
+        )
+        # after the task has finished, we save its exit code (fail/success)
+        # and persist the final logs. The whole entrypoint should exit
+        # with the exit code (c) of the task.
+        #
+        # Note that if step_expr OOMs, this tail expression is never executed.
+        # We lose the last logs in this scenario.
+        cmd_str += "c=$?; %s; exit $c" % BASH_SAVE_LOGS
+        cmd_str = (
+            '${METAFLOW_INIT_SCRIPT:+eval \\"${METAFLOW_INIT_SCRIPT}\\"} && %s'
+            % cmd_str
+        )
+        self.job = Job('bash -c "%s"' % cmd_str, env)
+        self.job.submit()
+
+    def wait(self, stdout_location, stderr_location, echo=None):
+        def wait_for_launch(job):
+            status = job.status
+            echo(
+                "Task status: %s..." % status,
+                "stderr",
+                _id=job.id,
+            )
+
+        prefix = b"[%s] " % util.to_bytes(self.job.id)
+        stdout_tail = get_log_tailer(stdout_location, self.datastore.TYPE)
+        stderr_tail = get_log_tailer(stderr_location, self.datastore.TYPE)
+
+        # 1) Loop until the job has started
+        wait_for_launch(self.job)
+
+        # 2) Tail logs until the job has finished
+        tail_logs(
+            prefix=prefix,
+            stdout_tail=stdout_tail,
+            stderr_tail=stderr_tail,
+            echo=echo,
+            has_log_updates=lambda: self.job.is_running,
+        )
+
+        echo(
+            "Task finished with exit code %s." % self.job.result.get("exit_code"),
+            "stderr",
+            _id=self.job.id,
+        )
+        if self.job.has_failed:
+            raise NvcfException("This could be a transient error. Use @retry to retry.")
+
+
+class JobStatus(object):
+    SUBMITTED = "SUBMITTED"
+    RUNNING = "RUNNING"
+    SUCCESSFUL = "SUCCESSFUL"
+    FAILED = "FAILED"
+
+
+nvcf_url = "https://api.nvcf.nvidia.com"
+submit_endpoint = f"{nvcf_url}/v2/nvcf/pexec/functions"
+result_endpoint = f"{nvcf_url}/v2/nvcf/pexec/status"
+
+
+class Job(object):
+    def __init__(self, command, env):
+
+        self._payload = {
+            "command": command,
+            "env": {k: v for k, v in env.items() if v is not None},
+        }
+        self._result = {}
+
+        conf = init_config()
+        if "OBP_AUTH_SERVER" in conf:
+            auth_host = conf["OBP_AUTH_SERVER"]
+        else:
+            auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]
+
+        # NOTE: reusing the same auth_host as the one used in NimMetadata,
+        # however, user should not need to use nim container to use @nvidia.
+        # May want to refactor this to a common endpoint.
+        nim_info_url = "https://" + auth_host + "/generate/nim"
+
+        if "METAFLOW_SERVICE_AUTH_KEY" in conf:
+            headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
+            res = requests.get(nim_info_url, headers=headers)
+        else:
+            headers = json.loads(os.environ.get("METAFLOW_SERVICE_HEADERS"))
+            res = requests.get(nim_info_url, headers=headers)
+
+        res.raise_for_status()
+        self._ngc_api_key = res.json()["nvcf"]["api_key"]
+
+        for f in res.json()["nvcf"]["functions"]:
+            if f["model_key"] == "metaflow_task_executor":
+                self._function_id = f["id"]
+
+    def submit(self):
+        try:
+            headers = {
+                "Authorization": f"Bearer {self._ngc_api_key}",
+                "Content-Type": "application/json",
+            }
+            request_data = json.dumps(self._payload).encode()
+            request = Request(
+                f"{submit_endpoint}/{self._function_id}",
+                data=request_data,
+                headers=headers,
+            )
+            response = urlopen(request)
+            self._invocation_id = response.headers.get("NVCF-REQID")
+            if response.getcode() == 200:
+                data = json.loads(response.read())
+                if data["status"].startswith("Oops"):
+                    self._status = JobStatus.FAILED
+                else:
+                    self._status = JobStatus.SUCCESSFUL
+                self._result = data
+            elif response.getcode() == 202:
+                self._status = JobStatus.SUBMITTED
+            else:
+                self._status = JobStatus.FAILED
+            # TODO: Handle 404s nicely
+        except (HTTPError, URLError) as e:
+            self._status = JobStatus.FAILED
+            raise e
+
+    @property
+    def status(self):
+        if self._status not in [JobStatus.SUCCESSFUL, JobStatus.FAILED]:
+            self._poll()
+        return self._status
+
+    @property
+    def id(self):
+        return self._invocation_id
+
+    @property
+    def is_running(self):
+        return self.status == JobStatus.SUBMITTED
+
+    @property
+    def has_failed(self):
+        return self.status == JobStatus.FAILED
+
+    @property
+    def result(self):
+        return self._result
+
+    def _poll(self):
+        try:
+            invocation_id = self._invocation_id
+            headers = {
+                "Authorization": f"Bearer {self._ngc_api_key}",
+                "Content-Type": "application/json",
+            }
+            request = Request(
+                f"{result_endpoint}/{self._invocation_id}", headers=headers
+            )
+            response = urlopen(request)
+            if response.getcode() == 200:
+                data = json.loads(response.read())
+                if data["status"].startswith("Oops"):
+                    # TODO: Propagate the internal error forward
+                    self._status = JobStatus.FAILED
+                else:
+                    self._status = JobStatus.SUCCESSFUL
+                self._result = data
+            elif response.getcode() in [400, 500]:
+                self._status = JobStatus.FAILED
+        except (HTTPError, URLError) as e:
+            print(f"Error occurred while polling for result: {e}")
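
The `Job` class drives NVCF's polling-exec ("pexec") API: `submit()` POSTs the command payload to the function endpoint with a Bearer token; a 200 response carries the result inline, while a 202 response means the invocation is still running and must be polled at the status endpoint using the `NVCF-REQID` response header. A minimal standalone sketch of that protocol, with a placeholder function id and API key and error handling elided:

import json
import time
from urllib.request import Request, urlopen


def pexec(function_id, api_key, payload):
    # Submit the payload to the pexec endpoint for the given function.
    headers = {
        "Authorization": "Bearer %s" % api_key,
        "Content-Type": "application/json",
    }
    response = urlopen(
        Request(
            "https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/%s" % function_id,
            data=json.dumps(payload).encode(),
            headers=headers,
        )
    )
    request_id = response.headers.get("NVCF-REQID")
    # 202: accepted but still running; poll the status endpoint for this request id.
    while response.getcode() == 202:
        time.sleep(5)
        response = urlopen(
            Request(
                "https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/%s" % request_id,
                headers=headers,
            )
        )
    # 200: the invocation finished; the body holds the result.
    return json.loads(response.read())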
--- /dev/null
+++ b/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py
@@ -0,0 +1,202 @@
+import json
+import os
+import sys
+import time
+import traceback
+
+from metaflow import util
+from metaflow._vendor import click
+from metaflow.exception import METAFLOW_EXIT_DISALLOW_RETRY
+from metaflow.metadata.util import sync_local_metadata_from_datastore
+from metaflow.metaflow_config import (
+    CARD_S3ROOT,
+    DATASTORE_LOCAL_DIR,
+    DATASTORE_SYSROOT_S3,
+    DATATOOLS_S3ROOT,
+    DEFAULT_METADATA,
+    SERVICE_HEADERS,
+    SERVICE_URL,
+    DEFAULT_SECRETS_BACKEND_TYPE,
+    DEFAULT_AWS_CLIENT_PROVIDER,
+    AWS_SECRETS_MANAGER_DEFAULT_REGION,
+    S3_ENDPOINT_URL,
+    AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
+    DATASTORE_SYSROOT_AZURE,
+    CARD_AZUREROOT,
+    DATASTORE_SYSROOT_GS,
+    CARD_GSROOT,
+    KUBERNETES_SANDBOX_INIT_SCRIPT,
+    OTEL_ENDPOINT,
+)
+from metaflow.mflog import TASK_LOG_SOURCE
+
+from .nvcf import Nvcf, NvcfKilledException
+
+
+@click.group()
+def cli():
+    pass
+
+
+@cli.group(help="Commands related to NVCF.")
+def nvcf():
+    pass
+
+
+@nvcf.command(
+    help="Execute a single task using NVCF. This command calls the "
+    "top-level step command inside a NVCF job with the given options. "
+    "Typically you do not call this command directly; it is used internally by "
+    "Metaflow."
+)
+@click.argument("step-name")
+@click.argument("code-package-sha")
+@click.argument("code-package-url")
+@click.option("--run-id", help="Passed to the top-level 'step'.")
+@click.option("--task-id", help="Passed to the top-level 'step'.")
+@click.option("--input-paths", help="Passed to the top-level 'step'.")
+@click.option("--split-index", help="Passed to the top-level 'step'.")
+@click.option("--clone-path", help="Passed to the top-level 'step'.")
+@click.option("--clone-run-id", help="Passed to the top-level 'step'.")
+@click.option(
+    "--tag", multiple=True, default=None, help="Passed to the top-level 'step'."
+)
+@click.option("--namespace", default=None, help="Passed to the top-level 'step'.")
+@click.option("--retry-count", default=0, help="Passed to the top-level 'step'.")
+@click.option(
+    "--max-user-code-retries", default=0, help="Passed to the top-level 'step'."
+)
+@click.pass_context
+def step(ctx, step_name, code_package_sha, code_package_url, **kwargs):
+    def echo(msg, stream="stderr", _id=None, **kwargs):
+        msg = util.to_unicode(msg)
+        if _id:
+            msg = "[%s] %s" % (_id, msg)
+        ctx.obj.echo_always(msg, err=(stream == "stderr"), **kwargs)
+
+    executable = ctx.obj.environment.executable(step_name)
+    entrypoint = "%s -u %s" % (executable, os.path.basename(sys.argv[0]))
+
+    top_args = " ".join(util.dict_to_cli_options(ctx.parent.parent.params))
+
+    input_paths = kwargs.get("input_paths")
+    split_vars = None
+    if input_paths:
+        max_size = 30 * 1024
+        split_vars = {
+            "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
+            for i in range(0, len(input_paths), max_size)
+        }
+        kwargs["input_paths"] = "".join("${%s}" % s for s in split_vars.keys())
+
+    step_args = " ".join(util.dict_to_cli_options(kwargs))
+    step_cli = "{entrypoint} {top_args} step {step} {step_args}".format(
+        entrypoint=entrypoint,
+        top_args=top_args,
+        step=step_name,
+        step_args=step_args,
+    )
+    node = ctx.obj.graph[step_name]
+
+    # Get retry information
+    retry_count = kwargs.get("retry_count", 0)
+    retry_deco = [deco for deco in node.decorators if deco.name == "retry"]
+    minutes_between_retries = None
+    if retry_deco:
+        minutes_between_retries = int(
+            retry_deco[0].attributes.get("minutes_between_retries", 1)
+        )
+
+    task_spec = {
+        "flow_name": ctx.obj.flow.name,
+        "step_name": step_name,
+        "run_id": kwargs["run_id"],
+        "task_id": kwargs["task_id"],
+        "retry_count": str(retry_count),
+    }
+
+    env = {
+        "METAFLOW_CODE_SHA": code_package_sha,
+        "METAFLOW_CODE_URL": code_package_url,
+        "METAFLOW_CODE_DS": ctx.obj.flow_datastore.TYPE,
+        "METAFLOW_SERVICE_URL": SERVICE_URL,
+        "METAFLOW_SERVICE_HEADERS": json.dumps(SERVICE_HEADERS),
+        "METAFLOW_DATASTORE_SYSROOT_S3": DATASTORE_SYSROOT_S3,
+        "METAFLOW_DATATOOLS_S3ROOT": DATATOOLS_S3ROOT,
+        "METAFLOW_DEFAULT_DATASTORE": ctx.obj.flow_datastore.TYPE,
+        "METAFLOW_USER": util.get_username(),
+        "METAFLOW_DEFAULT_METADATA": DEFAULT_METADATA,
+        "METAFLOW_CARD_S3ROOT": CARD_S3ROOT,
+        "METAFLOW_RUNTIME_ENVIRONMENT": "nvcf",
+        "METAFLOW_DEFAULT_SECRETS_BACKEND_TYPE": DEFAULT_SECRETS_BACKEND_TYPE,
+        "METAFLOW_DEFAULT_AWS_CLIENT_PROVIDER": DEFAULT_AWS_CLIENT_PROVIDER,
+        "METAFLOW_AWS_SECRETS_MANAGER_DEFAULT_REGION": AWS_SECRETS_MANAGER_DEFAULT_REGION,
+        "METAFLOW_S3_ENDPOINT_URL": S3_ENDPOINT_URL,
+        "METAFLOW_AZURE_STORAGE_BLOB_SERVICE_ENDPOINT": AZURE_STORAGE_BLOB_SERVICE_ENDPOINT,
+        "METAFLOW_DATASTORE_SYSROOT_AZURE": DATASTORE_SYSROOT_AZURE,
+        "METAFLOW_CARD_AZUREROOT": CARD_AZUREROOT,
+        "METAFLOW_DATASTORE_SYSROOT_GS": DATASTORE_SYSROOT_GS,
+        "METAFLOW_CARD_GSROOT": CARD_GSROOT,
+        "METAFLOW_INIT_SCRIPT": KUBERNETES_SANDBOX_INIT_SCRIPT,
+        "METAFLOW_OTEL_ENDPOINT": OTEL_ENDPOINT,
+    }
+
+    env_deco = [deco for deco in node.decorators if deco.name == "environment"]
+    if env_deco:
+        env.update(env_deco[0].attributes["vars"])
+
+    # Add the environment variables related to the input-paths argument
+    if split_vars:
+        env.update(split_vars)
+
+    if retry_count:
+        ctx.obj.echo_always(
+            "Sleeping %d minutes before the next retry" % minutes_between_retries
+        )
+        time.sleep(minutes_between_retries * 60)
+
+    # this information is needed for log tailing
+    ds = ctx.obj.flow_datastore.get_task_datastore(
+        mode="w",
+        run_id=kwargs["run_id"],
+        step_name=step_name,
+        task_id=kwargs["task_id"],
+        attempt=int(retry_count),
+    )
+    stdout_location = ds.get_log_location(TASK_LOG_SOURCE, "stdout")
+    stderr_location = ds.get_log_location(TASK_LOG_SOURCE, "stderr")
+
+    def _sync_metadata():
+        if ctx.obj.metadata.TYPE == "local":
+            sync_local_metadata_from_datastore(
+                DATASTORE_LOCAL_DIR,
+                ctx.obj.flow_datastore.get_task_datastore(
+                    kwargs["run_id"], step_name, kwargs["task_id"]
+                ),
+            )
+
+    nvcf = Nvcf(ctx.obj.metadata, ctx.obj.flow_datastore, ctx.obj.environment)
+    try:
+        with ctx.obj.monitor.measure("metaflow.nvcf.launch_job"):
+            nvcf.launch_job(
+                step_name,
+                step_cli,
+                task_spec,
+                code_package_sha,
+                code_package_url,
+                ctx.obj.flow_datastore.TYPE,
+                # function_id=function_id,
+                env=env,
+            )
+    except Exception as e:
+        traceback.print_exc()
+        _sync_metadata()
+        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
+    try:
+        nvcf.wait(stdout_location, stderr_location, echo=echo)
+    except NvcfKilledException:
+        # don't retry killed tasks
+        traceback.print_exc()
+        sys.exit(METAFLOW_EXIT_DISALLOW_RETRY)
+    finally:
+        _sync_metadata()
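
One detail worth noting above: because the whole command line is shipped to a remote function, a long --input-paths value is chunked into 30 KiB environment variables and the CLI argument is replaced with shell expansions that reassemble the original string on the remote side. A small worked sketch of that chunking:

# Hypothetical 70 KiB input-paths string, chunked exactly as in the `step` command above.
input_paths = "x" * (70 * 1024)
max_size = 30 * 1024
split_vars = {
    "METAFLOW_INPUT_PATHS_%d" % (i // max_size): input_paths[i : i + max_size]
    for i in range(0, len(input_paths), max_size)
}
# Three chunks: METAFLOW_INPUT_PATHS_0, METAFLOW_INPUT_PATHS_1, METAFLOW_INPUT_PATHS_2
cli_value = "".join("${%s}" % s for s in split_vars)
print(cli_value)  # ${METAFLOW_INPUT_PATHS_0}${METAFLOW_INPUT_PATHS_1}${METAFLOW_INPUT_PATHS_2}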
--- /dev/null
+++ b/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py
@@ -0,0 +1,156 @@
+import os
+import sys
+
+from metaflow.decorators import StepDecorator
+from metaflow.metadata.util import sync_local_metadata_to_datastore
+from metaflow.metaflow_config import DATASTORE_LOCAL_DIR
+from metaflow.sidecar import Sidecar
+from metaflow.plugins.timeout_decorator import get_run_time_limit_for_task
+from .nvcf import NvcfException
+
+from metaflow.metadata import MetaDatum
+
+
+class NvcfDecorator(StepDecorator):
+    name = "nvidia"
+    # defaults = {"function_id": "9e5647f2-740f-4101-a129-1c961a075575"}
+    defaults = {}
+    # "0817006f-018b-4590-b2a5-6cf9d64d9d9a"}
+    #
+
+    package_url = None
+    package_sha = None
+
+    # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
+    # to understand where these functions are invoked in the lifecycle of a
+    # Metaflow flow.
+    def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
+        # Executing NVCF functions requires a non-local datastore.
+        if flow_datastore.TYPE not in ("s3", "azure", "gs"):
+            raise NvcfException(
+                "The *@nvidia* decorator requires --datastore=s3 or --datastore=azure or --datastore=gs at the moment."
+            )
+        # if self.attributes["function_id"] is None:
+        #     raise NvcfException(
+        #         "The *@nvidia* decorator requires a function_id. Please reach out to Outerbounds if you are unsure how to get access to one."
+        #     )
+        # Set internal state.
+        self.logger = logger
+        self.environment = environment
+        self.step = step
+        self.flow_datastore = flow_datastore
+
+        # TODO:
+        # 1. Ensure that @batch and @kubernetes decorators are not applied to this step.
+        # 2. Ensure @parallel is not applied to this step.
+
+        # Set run time limit for the NVCF function.
+        self.run_time_limit = get_run_time_limit_for_task(decos)
+        if self.run_time_limit < 60:
+            raise NvcfException(
+                "The timeout for step *{step}* should be at least 60 seconds for "
+                "execution with @nvidia.".format(step=step)
+            )
+
+    def runtime_init(self, flow, graph, package, run_id):
+        # Set some more internal state.
+        self.flow = flow
+        self.graph = graph
+        self.package = package
+        self.run_id = run_id
+
+    def runtime_task_created(
+        self, task_datastore, task_id, split_index, input_paths, is_cloned, ubf_context
+    ):
+        if not is_cloned:
+            self._save_package_once(self.flow_datastore, self.package)
+
+    def runtime_step_cli(
+        self, cli_args, retry_count, max_user_code_retries, ubf_context
+    ):
+        if retry_count <= max_user_code_retries:
+            # after all attempts to run the user code have failed, we don't need
+            # to execute on NVCF anymore. We can execute possible fallback
+            # code locally.
+            cli_args.commands = ["nvcf", "step"]
+            cli_args.command_args.append(self.package_sha)
+            cli_args.command_args.append(self.package_url)
+            cli_args.command_options.update(self.attributes)
+            # cli_args.command_options["run-time-limit"] = self.run_time_limit
+            cli_args.entrypoint[0] = sys.executable
+
+    def task_pre_step(
+        self,
+        step_name,
+        task_datastore,
+        metadata,
+        run_id,
+        task_id,
+        flow,
+        graph,
+        retry_count,
+        max_retries,
+        ubf_context,
+        inputs,
+    ):
+        self.metadata = metadata
+        self.task_datastore = task_datastore
+
+        # task_pre_step may run locally if fallback is activated for @catch
+        # decorator.
+
+        if "NVCF_CONTEXT" in os.environ:
+            meta = {}
+
+            meta["nvcf-function-id"] = os.environ.get("NVCF_FUNCTION_ID")
+            meta["nvcf-function-version-id"] = os.environ.get(
+                "NVCF_FUNCTION_VERSION_ID"
+            )
+            meta["nvcf-region"] = os.environ.get("NVCF_REGION")
+            meta["nvcf-ncaid"] = os.environ.get("NVCF_NCAID")
+            meta["nvcf-sub"] = os.environ.get("NVCF_SUB")
+            meta["nvcf-instancetype"] = os.environ.get("NVCF_INSTANCETYPE")
+            meta["nvcf-reqid"] = os.environ.get("NVCF_REQID")
+            meta["nvcf-env"] = os.environ.get("NVCF_ENV")
+            meta["nvcf-backend"] = os.environ.get("NVCF_BACKEND")
+            meta["nvcf-function-name"] = os.environ.get("NVCF_FUNCTION_NAME")
+            meta["nvcf-nspectid"] = os.environ.get("NVCF_NSPECTID")
+
+            entries = [
+                MetaDatum(field=k, value=v, type=k, tags=[])
+                for k, v in meta.items()
+                if v is not None
+            ]
+            # Register book-keeping metadata for debugging.
+            metadata.register_metadata(run_id, step_name, task_id, entries)
+
+        self._save_logs_sidecar = Sidecar("save_logs_periodically")
+        self._save_logs_sidecar.start()
+
+    def task_finished(
+        self, step_name, flow, graph, is_task_ok, retry_count, max_retries
+    ):
+        # task_finished may run locally if fallback is activated for @catch
+        # decorator.
+        if "NVCF_CONTEXT" in os.environ:
+            # If `local` metadata is configured, we would need to copy task
+            # execution metadata from the NVCF container to user's
+            # local file system after the user code has finished execution.
+            # This happens via datastore as a communication bridge.
+            if hasattr(self, "metadata") and self.metadata.TYPE == "local":
+                sync_local_metadata_to_datastore(
+                    DATASTORE_LOCAL_DIR, self.task_datastore
+                )
+
+        try:
+            self._save_logs_sidecar.terminate()
+        except:
+            # Best effort kill
+            pass
+
+    @classmethod
+    def _save_package_once(cls, flow_datastore, package):
+        if cls.package_url is None:
+            cls.package_url, cls.package_sha = flow_datastore.save_data(
+                [package.blob], len_hint=1
+            )[0]
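
`step_init` rejects local datastores and any effective timeout under 60 seconds, and `runtime_step_cli` rewrites the task's entrypoint to the `nvcf step` command added above. A hypothetical flow satisfying those constraints (again assuming `nvidia` is importable from `metaflow` once the extension is installed):

from metaflow import FlowSpec, nvidia, step, timeout


class ConstrainedFlow(FlowSpec):
    @timeout(minutes=10)  # any timeout under 60 seconds raises NvcfException at startup
    @nvidia
    @step
    def start(self):
        self.next(self.end)

    @step
    def end(self):
        pass


if __name__ == "__main__":
    ConstrainedFlow()

Run it with a non-local datastore, e.g. `python constrained_flow.py --datastore=s3 run`.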
--- a/ob_metaflow_extensions-1.1.68.dist-info/METADATA
+++ b/ob_metaflow_extensions-1.1.70.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: ob-metaflow-extensions
-Version: 1.1.68
+Version: 1.1.70
 Summary: Outerbounds Platform Extensions for Metaflow
 Author: Outerbounds, Inc.
 License: Commercial
--- a/ob_metaflow_extensions-1.1.68.dist-info/RECORD
+++ b/ob_metaflow_extensions-1.1.70.dist-info/RECORD
@@ -1,13 +1,17 @@
 metaflow_extensions/outerbounds/__init__.py,sha256=TRGvIUMjkfneWtYUFSWoubu_Kf2ekAL4WLbV3IxOj9k,499
 metaflow_extensions/outerbounds/remote_config.py,sha256=HPFH4e3ZK3p-wS5HlS75fhR8_2avdD1AHQIZl2KnjeQ,4059
 metaflow_extensions/outerbounds/config/__init__.py,sha256=mYo95obHU1IE1wbPkeVz_pfTzNqlNabp1QBEMTGllbE,112
-metaflow_extensions/outerbounds/plugins/__init__.py,sha256=oR3krG5x3-W4g1sm5ygNPe9KVLBmxg7KEtzJsonQo_4,9398
+metaflow_extensions/outerbounds/plugins/__init__.py,sha256=46NgbJBhVowDR6FyQrZPF2jHHqRTSyCBCYIQAyQ4Ryo,9516
 metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=JhlMFcR7SPSfR1C9w6GlqJq-NYNhOfISmHl2PdkYUok,2212
 metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=z8tSAkWtiITB-JtSQS7fkhlBwvxSxeTgEwFjahAzv-U,2238
 metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
 metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py,sha256=gj6Iaz26bGbZm3aQuNS18Mqh_80iJp5PgFwFSlJRcn8,1968
-metaflow_extensions/outerbounds/plugins/nim/__init__.py,sha256=wzeYo7TX6ueIifGMt4lmJ6_LEsrUYNubWqACgHLwTis,1543
+metaflow_extensions/outerbounds/plugins/nim/__init__.py,sha256=GVnvSTjqYVj5oG2yh8KJFt7iZ33cEadDD5HbdmC9hJ0,1457
 metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=l8WDfVtsMt7aZaOaeIPT5ySidxfxXU8gmwLoKUP3f04,7044
+metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=ftxC5SCo64P5Ycpv5vudluTnQi3-VCZW0umdsPP326A,7926
+metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=ow3lonclEDoZEUQCDV_L8lEr6HopXqjNXzubRrfdIm4,7219
+metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=0xNA4aRTPJ4SKpRIFKZzlL9a7lf367KGTrVWVXd-uGE,6052
 metaflow_extensions/outerbounds/profilers/__init__.py,sha256=wa_jhnCBr82TBxoS0e8b6_6sLyZX0fdHicuGJZNTqKw,29
 metaflow_extensions/outerbounds/profilers/gpu.py,sha256=a5YZAepujuP0uDqG9UpXBlZS3wjUt4Yv8CjybXqeT2c,24342
 metaflow_extensions/outerbounds/toplevel/__init__.py,sha256=qWUJSv_r5hXJ7jV_On4nEasKIfUCm6_UjkjXWA_A1Ts,90
@@ -15,7 +19,7 @@ metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py,
 metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2YQfI4fz7nIcipwwWq781eaoHEk7n4GAn1npDg,63
 metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
 metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
-ob_metaflow_extensions-1.1.68.dist-info/METADATA,sha256=SJgAqyqfY268HCnd9Nn2XnGkRi918pmao-fFdQnp7tc,519
-ob_metaflow_extensions-1.1.68.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
-ob_metaflow_extensions-1.1.68.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
-ob_metaflow_extensions-1.1.68.dist-info/RECORD,,
+ob_metaflow_extensions-1.1.70.dist-info/METADATA,sha256=oh54d2W1t23zvb-nWBXXp5CiCTn2wRpUKVrtgWERn5c,519
+ob_metaflow_extensions-1.1.70.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
+ob_metaflow_extensions-1.1.70.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
+ob_metaflow_extensions-1.1.70.dist-info/RECORD,,