ob-metaflow-extensions 1.1.86__py2.py3-none-any.whl → 1.1.89__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/plugins/__init__.py +3 -3
- metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py +135 -51
- metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py +16 -0
- metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py +19 -3
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +22 -12
- metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +1 -1
- {ob_metaflow_extensions-1.1.86.dist-info → ob_metaflow_extensions-1.1.89.dist-info}/METADATA +2 -2
- {ob_metaflow_extensions-1.1.86.dist-info → ob_metaflow_extensions-1.1.89.dist-info}/RECORD +10 -10
- {ob_metaflow_extensions-1.1.86.dist-info → ob_metaflow_extensions-1.1.89.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.86.dist-info → ob_metaflow_extensions-1.1.89.dist-info}/top_level.txt +0 -0
|
@@ -104,7 +104,7 @@ def get_boto3_session(role_arn=None, session_vars=None):
|
|
|
104
104
|
tmp_aws_config_file = f.name
|
|
105
105
|
os.rename(tmp_aws_config_file, aws_config_file)
|
|
106
106
|
os.environ["AWS_CONFIG_FILE"] = aws_config_file
|
|
107
|
-
os.environ["
|
|
107
|
+
os.environ["AWS_PROFILE"] = "cspr"
|
|
108
108
|
else:
|
|
109
109
|
os.environ["AWS_WEB_IDENTITY_TOKEN_FILE"] = token_file
|
|
110
110
|
os.environ["AWS_ROLE_ARN"] = token_info["role_arn"]
|
|
@@ -122,7 +122,7 @@ def get_boto3_session(role_arn=None, session_vars=None):
|
|
|
122
122
|
# AWS_CONFIG_FILE environment variable above.
|
|
123
123
|
if role_arn == USE_CSPR_ROLE_ARN_IF_SET:
|
|
124
124
|
# Otherwise start from the default profile, assuming CSPR role
|
|
125
|
-
session = boto3.session.Session(profile_name="
|
|
125
|
+
session = boto3.session.Session(profile_name="cspr")
|
|
126
126
|
else:
|
|
127
127
|
session = boto3.session.Session(profile_name="task")
|
|
128
128
|
else:
|
|
@@ -306,7 +306,7 @@ class ObpGcpAuthProvider(object):
|
|
|
306
306
|
|
|
307
307
|
GCP_CLIENT_PROVIDERS_DESC = [("obp", ".ObpGcpAuthProvider")]
|
|
308
308
|
CLIS_DESC = [
|
|
309
|
-
("
|
|
309
|
+
("nvidia", ".nvcf.nvcf_cli.cli"),
|
|
310
310
|
("fast-bakery", ".fast_bakery.fast_bakery_cli.cli"),
|
|
311
311
|
("snowpark", ".snowpark.snowpark_cli.cli"),
|
|
312
312
|
]
|
|
@@ -1,32 +1,31 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
import json
|
|
3
3
|
import os
|
|
4
|
-
|
|
4
|
+
import threading
|
|
5
|
+
import time
|
|
6
|
+
import uuid
|
|
5
7
|
from concurrent.futures import ThreadPoolExecutor
|
|
6
8
|
from typing import Dict
|
|
9
|
+
|
|
7
10
|
from metaflow.exception import MetaflowException
|
|
8
|
-
from metaflow.metaflow_config import
|
|
9
|
-
FAST_BAKERY_URL,
|
|
10
|
-
get_pinned_conda_libs,
|
|
11
|
-
)
|
|
11
|
+
from metaflow.metaflow_config import FAST_BAKERY_URL, get_pinned_conda_libs
|
|
12
12
|
from metaflow.metaflow_environment import MetaflowEnvironment
|
|
13
|
-
from metaflow.plugins.pypi.conda_environment import CondaEnvironment
|
|
14
|
-
from .fast_bakery import FastBakery, FastBakeryApiResponse, FastBakeryException
|
|
15
13
|
from metaflow.plugins.aws.batch.batch_decorator import BatchDecorator
|
|
16
14
|
from metaflow.plugins.kubernetes.kubernetes_decorator import KubernetesDecorator
|
|
17
15
|
from metaflow.plugins.pypi.conda_decorator import CondaStepDecorator
|
|
16
|
+
from metaflow.plugins.pypi.conda_environment import CondaEnvironment
|
|
18
17
|
from metaflow.plugins.pypi.pypi_decorator import PyPIStepDecorator
|
|
18
|
+
from metaflow import decorators
|
|
19
|
+
|
|
20
|
+
from .fast_bakery import FastBakery, FastBakeryApiResponse, FastBakeryException
|
|
19
21
|
|
|
20
22
|
BAKERY_METAFILE = ".imagebakery-cache"
|
|
21
23
|
|
|
24
|
+
import fcntl
|
|
22
25
|
import json
|
|
23
26
|
import os
|
|
24
|
-
import fcntl
|
|
25
|
-
from functools import wraps
|
|
26
27
|
from concurrent.futures import ThreadPoolExecutor
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
# TODO - ensure that both @conda/@pypi are not assigned to the same step
|
|
28
|
+
from functools import wraps
|
|
30
29
|
|
|
31
30
|
|
|
32
31
|
def cache_request(cache_file):
|
|
@@ -36,6 +35,9 @@ def cache_request(cache_file):
|
|
|
36
35
|
call_args = kwargs.copy()
|
|
37
36
|
call_args.update(zip(func.__code__.co_varnames, args))
|
|
38
37
|
call_args.pop("self", None)
|
|
38
|
+
call_args.pop("ref", None)
|
|
39
|
+
# invalidate cache when moving from one deployment to another
|
|
40
|
+
call_args.update({"fast_bakery_url": FAST_BAKERY_URL})
|
|
39
41
|
cache_key = hashlib.md5(
|
|
40
42
|
json.dumps(call_args, sort_keys=True).encode("utf-8")
|
|
41
43
|
).hexdigest()
|
|
@@ -79,7 +81,7 @@ def cache_request(cache_file):
|
|
|
79
81
|
|
|
80
82
|
|
|
81
83
|
class DockerEnvironmentException(MetaflowException):
|
|
82
|
-
headline = "Ran into an error while
|
|
84
|
+
headline = "Ran into an error while baking image"
|
|
83
85
|
|
|
84
86
|
def __init__(self, msg):
|
|
85
87
|
super(DockerEnvironmentException, self).__init__(msg)
|
|
@@ -93,46 +95,83 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
93
95
|
self.skipped_steps = set()
|
|
94
96
|
self.flow = flow
|
|
95
97
|
|
|
96
|
-
self.bakery = FastBakery(url=FAST_BAKERY_URL)
|
|
97
98
|
self.results = {}
|
|
99
|
+
self.images_baked = 0
|
|
98
100
|
|
|
99
101
|
def set_local_root(self, local_root):
|
|
100
102
|
self.local_root = local_root
|
|
101
103
|
|
|
102
104
|
def decospecs(self):
|
|
103
|
-
|
|
105
|
+
# Due to conflicts with the CondaEnvironment fallback and bakery,
|
|
106
|
+
# we can not simply attach 'conda' or 'pypi' to all steps here.
|
|
107
|
+
# Instead we do this on a per-step basis in init_environment
|
|
108
|
+
return ("fast_bakery_internal",) + super().decospecs()
|
|
104
109
|
|
|
105
|
-
def validate_environment(self,
|
|
110
|
+
def validate_environment(self, logger, datastore_type):
|
|
106
111
|
self.datastore_type = datastore_type
|
|
107
|
-
self.
|
|
112
|
+
self.logger = logger
|
|
108
113
|
|
|
109
114
|
# Avoiding circular imports.
|
|
110
115
|
from metaflow.plugins import DATASTORES
|
|
111
116
|
|
|
112
117
|
self.datastore = [d for d in DATASTORES if d.TYPE == self.datastore_type][0]
|
|
113
118
|
|
|
119
|
+
# Mixing @pypi/@conda in a single step is not supported yet
|
|
120
|
+
for step in self.flow:
|
|
121
|
+
if sum(1 for deco in step.decorators if _is_env_deco(deco)) > 1:
|
|
122
|
+
raise MetaflowException(
|
|
123
|
+
"Mixing and matching PyPI packages and Conda packages within a\n"
|
|
124
|
+
"step is not yet supported. Use one of @pypi or @conda only for the *%s* step."
|
|
125
|
+
% step.name
|
|
126
|
+
)
|
|
127
|
+
|
|
114
128
|
def init_environment(self, echo):
|
|
115
129
|
self.skipped_steps = {
|
|
116
|
-
step.name
|
|
117
|
-
for step in self.flow
|
|
118
|
-
if not any(
|
|
119
|
-
isinstance(deco, (BatchDecorator, KubernetesDecorator))
|
|
120
|
-
for deco in step.decorators
|
|
121
|
-
)
|
|
130
|
+
step.name for step in self.flow if not _step_executes_remotely(step)
|
|
122
131
|
}
|
|
132
|
+
# Attach environment decorator as needed. This is done on a step-by-step basis
|
|
133
|
+
# as we require a conda decorator for fallback steps, but prefer pypi for the baked ones.
|
|
134
|
+
for step in self.flow:
|
|
135
|
+
if not _step_has_environment_deco(step):
|
|
136
|
+
if step.name in self.skipped_steps:
|
|
137
|
+
# Conda fallback requires a conda decorator as the default for a step
|
|
138
|
+
decorators._attach_decorators_to_step(step, ["conda"])
|
|
139
|
+
else:
|
|
140
|
+
# We default to PyPI for steps that are going to be baked.
|
|
141
|
+
decorators._attach_decorators_to_step(step, ["pypi"])
|
|
142
|
+
# Initialize the decorator we attached.
|
|
143
|
+
# This is crucial for the conda decorator to work properly in the fallback environment
|
|
144
|
+
for deco in step.decorators:
|
|
145
|
+
if _is_env_deco(deco):
|
|
146
|
+
deco.step_init(
|
|
147
|
+
self.flow,
|
|
148
|
+
None, # not passing graph as it is not available, and not required by conda/pypi decorators
|
|
149
|
+
step.name,
|
|
150
|
+
step.decorators,
|
|
151
|
+
self,
|
|
152
|
+
self.datastore,
|
|
153
|
+
echo,
|
|
154
|
+
)
|
|
123
155
|
|
|
124
156
|
steps_to_bake = [
|
|
125
157
|
step for step in self.flow if step.name not in self.skipped_steps
|
|
126
158
|
]
|
|
127
159
|
if steps_to_bake:
|
|
128
|
-
|
|
129
|
-
|
|
160
|
+
self.logger("🚀 Baking container image(s) ...")
|
|
161
|
+
start_time = time.time()
|
|
162
|
+
self.results = self._bake(steps_to_bake)
|
|
130
163
|
for step in self.flow:
|
|
131
164
|
for d in step.decorators:
|
|
132
|
-
if
|
|
165
|
+
if _is_remote_deco(d):
|
|
133
166
|
d.attributes["image"] = self.results[step.name].container_image
|
|
134
167
|
d.attributes["executable"] = self.results[step.name].python_path
|
|
135
|
-
|
|
168
|
+
if self.images_baked > 0:
|
|
169
|
+
bake_time = time.time() - start_time
|
|
170
|
+
self.logger(
|
|
171
|
+
f"🎉 All container image(s) baked in {bake_time:.2f} seconds!"
|
|
172
|
+
)
|
|
173
|
+
else:
|
|
174
|
+
self.logger("🎉 All container image(s) baked!")
|
|
136
175
|
|
|
137
176
|
if self.skipped_steps:
|
|
138
177
|
self.delegate = CondaEnvironment(self.flow)
|
|
@@ -140,29 +179,54 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
140
179
|
self.delegate.validate_environment(echo, self.datastore_type)
|
|
141
180
|
self.delegate.init_environment(echo, self.skipped_steps)
|
|
142
181
|
|
|
143
|
-
def _bake(self, steps
|
|
182
|
+
def _bake(self, steps) -> Dict[str, FastBakeryApiResponse]:
|
|
144
183
|
metafile_path = get_fastbakery_metafile_path(self.local_root, self.flow.name)
|
|
184
|
+
logger_lock = threading.Lock()
|
|
145
185
|
|
|
146
186
|
@cache_request(metafile_path)
|
|
147
187
|
def _cached_bake(
|
|
148
|
-
|
|
188
|
+
ref=None,
|
|
189
|
+
python=None,
|
|
190
|
+
pypi_packages=None,
|
|
191
|
+
conda_packages=None,
|
|
192
|
+
base_image=None,
|
|
149
193
|
):
|
|
150
|
-
self.bakery._reset_payload()
|
|
151
|
-
self.bakery.python_version(python)
|
|
152
|
-
self.bakery.pypi_packages(pypi_packages)
|
|
153
|
-
self.bakery.conda_packages(conda_packages)
|
|
154
|
-
self.bakery.base_image(base_image)
|
|
155
|
-
# self.bakery.ignore_cache()
|
|
156
194
|
try:
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
195
|
+
bakery = FastBakery(url=FAST_BAKERY_URL)
|
|
196
|
+
bakery._reset_payload()
|
|
197
|
+
bakery.python_version(python)
|
|
198
|
+
bakery.pypi_packages(pypi_packages)
|
|
199
|
+
bakery.conda_packages(conda_packages)
|
|
200
|
+
bakery.base_image(base_image)
|
|
201
|
+
# bakery.ignore_cache()
|
|
202
|
+
|
|
203
|
+
with logger_lock:
|
|
204
|
+
self.logger(f"🍳 Baking [{ref}] ...")
|
|
205
|
+
self.logger(f" 🐍 Python: {python}")
|
|
206
|
+
|
|
207
|
+
if pypi_packages:
|
|
208
|
+
self.logger(f" 📦 PyPI packages:")
|
|
209
|
+
for package, version in pypi_packages.items():
|
|
210
|
+
self.logger(f" 🔧 {package}: {version}")
|
|
211
|
+
|
|
212
|
+
if conda_packages:
|
|
213
|
+
self.logger(f" 📦 Conda packages:")
|
|
214
|
+
for package, version in conda_packages.items():
|
|
215
|
+
self.logger(f" 🔧 {package}: {version}")
|
|
216
|
+
|
|
217
|
+
self.logger(f" 🏗️ Base image: {base_image}")
|
|
218
|
+
|
|
219
|
+
start_time = time.time()
|
|
220
|
+
res = bakery.bake()
|
|
221
|
+
# TODO: Get actual bake time from bakery
|
|
222
|
+
bake_time = time.time() - start_time
|
|
223
|
+
|
|
224
|
+
with logger_lock:
|
|
225
|
+
self.logger(f"🏁 Baked [{ref}] in {bake_time:.2f} seconds!")
|
|
226
|
+
self.images_baked += 1
|
|
163
227
|
return res
|
|
164
228
|
except FastBakeryException as ex:
|
|
165
|
-
raise DockerEnvironmentException(str(ex))
|
|
229
|
+
raise DockerEnvironmentException(f"Bake [{ref}] failed: {str(ex)}")
|
|
166
230
|
|
|
167
231
|
def prepare_step(step):
|
|
168
232
|
base_image = next(
|
|
@@ -174,11 +238,7 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
174
238
|
None,
|
|
175
239
|
)
|
|
176
240
|
dependencies = next(
|
|
177
|
-
(
|
|
178
|
-
d
|
|
179
|
-
for d in step.decorators
|
|
180
|
-
if isinstance(d, (CondaStepDecorator, PyPIStepDecorator))
|
|
181
|
-
),
|
|
241
|
+
(d for d in step.decorators if _is_env_deco(d)),
|
|
182
242
|
None,
|
|
183
243
|
)
|
|
184
244
|
python = next(
|
|
@@ -216,10 +276,15 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
216
276
|
}
|
|
217
277
|
|
|
218
278
|
with ThreadPoolExecutor() as executor:
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
279
|
+
prepared_args = list(executor.map(prepare_step, steps))
|
|
280
|
+
for i, args in enumerate(prepared_args, 1):
|
|
281
|
+
args["ref"] = f"#{i:02d}"
|
|
282
|
+
futures = [executor.submit(_cached_bake, **args) for args in prepared_args]
|
|
283
|
+
results = {}
|
|
284
|
+
for step, future in zip(steps, futures):
|
|
285
|
+
results[step.name] = future.result()
|
|
286
|
+
|
|
287
|
+
return results
|
|
223
288
|
|
|
224
289
|
def executable(self, step_name, default=None):
|
|
225
290
|
if step_name in self.skipped_steps:
|
|
@@ -266,3 +331,22 @@ class DockerEnvironment(MetaflowEnvironment):
|
|
|
266
331
|
|
|
267
332
|
def get_fastbakery_metafile_path(local_root, flow_name):
|
|
268
333
|
return os.path.join(local_root, flow_name, BAKERY_METAFILE)
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def _is_remote_deco(deco):
|
|
337
|
+
return isinstance(deco, (BatchDecorator, KubernetesDecorator))
|
|
338
|
+
|
|
339
|
+
|
|
340
|
+
def _step_executes_remotely(step):
|
|
341
|
+
"Check if a step is going to execute remotely or locally"
|
|
342
|
+
return any(_is_remote_deco(deco) for deco in step.decorators)
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def _is_env_deco(deco):
|
|
346
|
+
"Check if a decorator is a known environment decorator (conda/pypi)"
|
|
347
|
+
return isinstance(deco, (PyPIStepDecorator, CondaStepDecorator))
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
def _step_has_environment_deco(step):
|
|
351
|
+
"Check if a step has a virtual environment decorator"
|
|
352
|
+
return any(_is_env_deco(deco) for deco in step.decorators)
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from typing import Dict, Optional
|
|
2
2
|
import requests
|
|
3
|
+
import time
|
|
3
4
|
|
|
4
5
|
|
|
5
6
|
class FastBakeryException(Exception):
|
|
@@ -72,6 +73,8 @@ class FastBakeryApiResponse:
|
|
|
72
73
|
|
|
73
74
|
class FastBakery:
|
|
74
75
|
def __init__(self, url: str):
|
|
76
|
+
if not url:
|
|
77
|
+
raise FastBakeryException("Specifying a url is required.")
|
|
75
78
|
self.url = url
|
|
76
79
|
self.headers = {"Content-Type": "application/json", "Connection": "keep-alive"}
|
|
77
80
|
self._reset_payload()
|
|
@@ -140,6 +143,19 @@ class FastBakery:
|
|
|
140
143
|
headers = {**self.headers, **(SERVICE_HEADERS or {})}
|
|
141
144
|
except ImportError:
|
|
142
145
|
headers = self.headers
|
|
146
|
+
|
|
147
|
+
retryable_status_codes = [409]
|
|
148
|
+
|
|
149
|
+
for attempt in range(2): # 0 = initial attempt, 1-2 = retries
|
|
150
|
+
response = requests.post(self.url, json=payload, headers=headers)
|
|
151
|
+
|
|
152
|
+
if response.status_code not in retryable_status_codes:
|
|
153
|
+
break
|
|
154
|
+
|
|
155
|
+
if attempt < 2: # Don't sleep after the last attempt
|
|
156
|
+
sleep_time = 0.5 * (attempt + 1)
|
|
157
|
+
time.sleep(sleep_time)
|
|
158
|
+
|
|
143
159
|
response = requests.post(self.url, json=payload, headers=headers)
|
|
144
160
|
self._handle_error_response(response)
|
|
145
161
|
return FastBakeryApiResponse(response.json())
|
|
@@ -1,13 +1,29 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import sys
|
|
3
3
|
import time
|
|
4
|
-
import
|
|
4
|
+
import subprocess
|
|
5
5
|
from io import BytesIO
|
|
6
6
|
from datetime import datetime, timezone
|
|
7
7
|
|
|
8
8
|
from metaflow.exception import MetaflowException
|
|
9
9
|
|
|
10
10
|
|
|
11
|
+
def kill_process_and_descendants(pid, termination_timeout=5):
|
|
12
|
+
try:
|
|
13
|
+
subprocess.check_call(["pkill", "-TERM", "-P", str(pid)])
|
|
14
|
+
subprocess.check_call(["kill", "-TERM", str(pid)])
|
|
15
|
+
except subprocess.CalledProcessError:
|
|
16
|
+
pass
|
|
17
|
+
|
|
18
|
+
time.sleep(termination_timeout)
|
|
19
|
+
|
|
20
|
+
try:
|
|
21
|
+
subprocess.check_call(["pkill", "-KILL", "-P", str(pid)])
|
|
22
|
+
subprocess.check_call(["kill", "-KILL", str(pid)])
|
|
23
|
+
except subprocess.CalledProcessError:
|
|
24
|
+
pass
|
|
25
|
+
|
|
26
|
+
|
|
11
27
|
class HeartbeatStore(object):
|
|
12
28
|
def __init__(
|
|
13
29
|
self,
|
|
@@ -67,7 +83,7 @@ class HeartbeatStore(object):
|
|
|
67
83
|
contents = f.read()
|
|
68
84
|
if "tombstone" in contents:
|
|
69
85
|
print("[Outerbounds] Tombstone detected. Terminating the task..")
|
|
70
|
-
|
|
86
|
+
kill_process_and_descendants(self.main_pid)
|
|
71
87
|
sys.exit(1)
|
|
72
88
|
|
|
73
89
|
def __handle_heartbeat(self, path):
|
|
@@ -86,7 +102,7 @@ class HeartbeatStore(object):
|
|
|
86
102
|
print(
|
|
87
103
|
f"[Outerbounds] Missed {self.max_missed_heartbeats} consecutive heartbeats. Terminating the task.."
|
|
88
104
|
)
|
|
89
|
-
|
|
105
|
+
kill_process_and_descendants(self.main_pid)
|
|
90
106
|
sys.exit(1)
|
|
91
107
|
|
|
92
108
|
def is_main_process_running(self):
|
|
@@ -39,13 +39,18 @@ def cli():
|
|
|
39
39
|
pass
|
|
40
40
|
|
|
41
41
|
|
|
42
|
-
@cli.group(help="Commands related to
|
|
43
|
-
def
|
|
42
|
+
@cli.group(help="Commands related to nvidia.")
|
|
43
|
+
def nvidia():
|
|
44
44
|
pass
|
|
45
45
|
|
|
46
46
|
|
|
47
|
-
@
|
|
48
|
-
@click.
|
|
47
|
+
@nvidia.command(help="List steps / tasks running as an nvidia job.")
|
|
48
|
+
@click.option(
|
|
49
|
+
"--run-id",
|
|
50
|
+
default=None,
|
|
51
|
+
required=True,
|
|
52
|
+
help="List unfinished tasks corresponding to the run id.",
|
|
53
|
+
)
|
|
49
54
|
@click.pass_context
|
|
50
55
|
def list(ctx, run_id):
|
|
51
56
|
flow_name = ctx.obj.flow.name
|
|
@@ -65,13 +70,18 @@ def list(ctx, run_id):
|
|
|
65
70
|
|
|
66
71
|
if running_invocations:
|
|
67
72
|
for each_invocation in running_invocations:
|
|
68
|
-
|
|
73
|
+
ctx.obj.echo(each_invocation)
|
|
69
74
|
else:
|
|
70
|
-
|
|
75
|
+
ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
|
|
71
76
|
|
|
72
77
|
|
|
73
|
-
@
|
|
74
|
-
@click.
|
|
78
|
+
@nvidia.command(help="Kill steps / tasks running as an nvidia job.")
|
|
79
|
+
@click.option(
|
|
80
|
+
"--run-id",
|
|
81
|
+
default=None,
|
|
82
|
+
required=True,
|
|
83
|
+
help="Terminate unfinished tasks corresponding to the run id.",
|
|
84
|
+
)
|
|
75
85
|
@click.pass_context
|
|
76
86
|
def kill(ctx, run_id):
|
|
77
87
|
from metaflow_extensions.outerbounds.plugins.nvcf.heartbeat_store import (
|
|
@@ -100,12 +110,12 @@ def kill(ctx, run_id):
|
|
|
100
110
|
)
|
|
101
111
|
store.emit_tombstone(folder_name="nvcf_heartbeats")
|
|
102
112
|
else:
|
|
103
|
-
|
|
113
|
+
ctx.obj.echo("No running @nvidia invocations for Run ID: %s" % run_id)
|
|
104
114
|
|
|
105
115
|
|
|
106
|
-
@
|
|
107
|
-
help="Execute a single task using
|
|
108
|
-
"top-level step command inside
|
|
116
|
+
@nvidia.command(
|
|
117
|
+
help="Execute a single task using @nvidia. This command calls the "
|
|
118
|
+
"top-level step command inside an nvidia job with the given options. "
|
|
109
119
|
"Typically you do not call this command directly; it is used internally by "
|
|
110
120
|
"Metaflow."
|
|
111
121
|
)
|
|
@@ -72,7 +72,7 @@ class NvcfDecorator(StepDecorator):
|
|
|
72
72
|
# after all attempts to run the user code have failed, we don't need
|
|
73
73
|
# to execute on NVCF anymore. We can execute possible fallback
|
|
74
74
|
# code locally.
|
|
75
|
-
cli_args.commands = ["
|
|
75
|
+
cli_args.commands = ["nvidia", "step"]
|
|
76
76
|
cli_args.command_args.append(self.package_sha)
|
|
77
77
|
cli_args.command_args.append(self.package_url)
|
|
78
78
|
cli_args.command_options.update(self.attributes)
|
{ob_metaflow_extensions-1.1.86.dist-info → ob_metaflow_extensions-1.1.89.dist-info}/METADATA
RENAMED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ob-metaflow-extensions
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.89
|
|
4
4
|
Summary: Outerbounds Platform Extensions for Metaflow
|
|
5
5
|
Author: Outerbounds, Inc.
|
|
6
6
|
License: Commercial
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
Requires-Dist: boto3
|
|
9
9
|
Requires-Dist: kubernetes
|
|
10
|
-
Requires-Dist: ob-metaflow (==2.12.
|
|
10
|
+
Requires-Dist: ob-metaflow (==2.12.19.1)
|
|
11
11
|
|
|
12
12
|
# Outerbounds platform package
|
|
13
13
|
|
|
@@ -1,12 +1,12 @@
|
|
|
1
1
|
metaflow_extensions/outerbounds/__init__.py,sha256=TRGvIUMjkfneWtYUFSWoubu_Kf2ekAL4WLbV3IxOj9k,499
|
|
2
2
|
metaflow_extensions/outerbounds/remote_config.py,sha256=Zpfpjgz68_ZgxlXezjzlsDLo4840rkWuZgwDB_5H57U,4059
|
|
3
3
|
metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
|
|
4
|
-
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=
|
|
4
|
+
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=9DCqKsb2bPfw1f7x-3EuB2Mqc9uKlLtGEG6yLJI8Xx0,12510
|
|
5
5
|
metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=1v2GBqoMBxp5E7Lejz139w-jxJtPnLDvvHXP0HhEIHI,2361
|
|
6
6
|
metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
|
|
7
7
|
metaflow_extensions/outerbounds/plugins/fast_bakery/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
8
|
-
metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py,sha256=
|
|
9
|
-
metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py,sha256=
|
|
8
|
+
metaflow_extensions/outerbounds/plugins/fast_bakery/docker_environment.py,sha256=gFeXt8nLv7SzZ2zEoAKMpdxAePN4jZclSsmQAm6hB4w,13846
|
|
9
|
+
metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery.py,sha256=MAPRQsfqeEkL1LXqgwPrUJOzZ3kY3C00QjdDgQ7wdIg,5160
|
|
10
10
|
metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py,sha256=kqFyu2bJSnc9_9aYfBpz5xK6L6luWFZK_NMuh8f1eVk,1494
|
|
11
11
|
metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py,sha256=EZDbyrfZ7fgcU-P9dMS_hpCxsdDeUE0K5VU3uNM4aW4,1506
|
|
12
12
|
metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
@@ -14,10 +14,10 @@ metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py,sha256=g
|
|
|
14
14
|
metaflow_extensions/outerbounds/plugins/nim/__init__.py,sha256=GVnvSTjqYVj5oG2yh8KJFt7iZ33cEadDD5HbdmC9hJ0,1457
|
|
15
15
|
metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=SWieODDxtIaeZwdMYtObDi57Kjyfw2DUuE6pJtU750w,9206
|
|
16
16
|
metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
|
-
metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=
|
|
17
|
+
metaflow_extensions/outerbounds/plugins/nvcf/heartbeat_store.py,sha256=4aQZ0kpW2LlJbHx6AG4A9eaFLH9rWC_ENWnnfYNq1qk,5910
|
|
18
18
|
metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py,sha256=NIt1kJHuYpnCF7n73A90ZITWsk5QWtsbiHfzvdVjgqk,8997
|
|
19
|
-
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=
|
|
20
|
-
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=
|
|
19
|
+
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py,sha256=9nQBwm6AYtaKIAxdb937MOnsut3INEXN3v5eSnXy4cg,9811
|
|
20
|
+
metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py,sha256=E7h94ni8yW9BQkKSBUptPdGAaVmXpR9FlXkPWpLyPd0,6054
|
|
21
21
|
metaflow_extensions/outerbounds/plugins/snowpark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
22
22
|
metaflow_extensions/outerbounds/plugins/snowpark/snowpark.py,sha256=vzgpVLCKvHjzHNfJvmH0jcxefYNsVggw_vof_y_U_a8,10643
|
|
23
23
|
metaflow_extensions/outerbounds/plugins/snowpark/snowpark_cli.py,sha256=ErsVoCQLa33byiykOQzDEeEkRKk0mgffZme43f3jxn4,8747
|
|
@@ -33,7 +33,7 @@ metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py,
|
|
|
33
33
|
metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py,sha256=WUuhz2YQfI4fz7nIcipwwWq781eaoHEk7n4GAn1npDg,63
|
|
34
34
|
metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3uILlEZ6ntBLKeNyqn3If8nIXZFq_Apd7Dhco,70
|
|
35
35
|
metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
36
|
-
ob_metaflow_extensions-1.1.
|
|
37
|
-
ob_metaflow_extensions-1.1.
|
|
38
|
-
ob_metaflow_extensions-1.1.
|
|
39
|
-
ob_metaflow_extensions-1.1.
|
|
36
|
+
ob_metaflow_extensions-1.1.89.dist-info/METADATA,sha256=48w-ipRZMJOxQ7yZx8uqyDmiBQIirZ3MWYtygEXscNI,520
|
|
37
|
+
ob_metaflow_extensions-1.1.89.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
|
|
38
|
+
ob_metaflow_extensions-1.1.89.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
|
|
39
|
+
ob_metaflow_extensions-1.1.89.dist-info/RECORD,,
|
|
File without changes
|
{ob_metaflow_extensions-1.1.86.dist-info → ob_metaflow_extensions-1.1.89.dist-info}/top_level.txt
RENAMED
|
File without changes
|