ob-metaflow-extensions 1.1.71.tar.gz → 1.1.73.tar.gz
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/auth_server.py +9 -2
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +68 -24
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/ob_metaflow_extensions.egg-info/PKG-INFO +1 -1
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/setup.py +1 -1
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/README.md +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/config/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/nim/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/nvcf/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_cli.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/nvcf/nvcf_decorator.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/plugins/perimeters.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/profilers/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/profilers/gpu.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/remote_config.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/toplevel/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/toplevel/global_aliases_for_metaflow_package.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/toplevel/plugins/azure/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/ob_metaflow_extensions.egg-info/SOURCES.txt +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/ob_metaflow_extensions.egg-info/dependency_links.txt +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/ob_metaflow_extensions.egg-info/requires.txt +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/ob_metaflow_extensions.egg-info/top_level.txt +0 -0
- {ob-metaflow-extensions-1.1.71 → ob-metaflow-extensions-1.1.73}/setup.cfg +0 -0
|
@@ -52,10 +52,17 @@ def get_token(url_path):
|
|
|
52
52
|
return token_info
|
|
53
53
|
except requests.exceptions.HTTPError as e:
|
|
54
54
|
raise MetaflowException(repr(e))
|
|
55
|
-
except
|
|
55
|
+
except (
|
|
56
|
+
requests.exceptions.ConnectionError,
|
|
57
|
+
requests.exceptions.ReadTimeout,
|
|
58
|
+
) as e:
|
|
56
59
|
# ConnectionErrors are generally temporary errors like DNS resolution failures,
|
|
57
60
|
# timeouts etc.
|
|
58
|
-
print(
|
|
61
|
+
print(
|
|
62
|
+
"received error of type {}. Retrying...".format(type(e)),
|
|
63
|
+
e,
|
|
64
|
+
file=sys.stderr,
|
|
65
|
+
)
|
|
59
66
|
time.sleep(retry_delay)
|
|
60
67
|
retry_delay *= 2 # Double the delay for the next attempt
|
|
61
68
|
retry_delay += random.uniform(0, 1) # Add jitter
|
|
@@ -5,6 +5,8 @@ import requests
|
|
|
5
5
|
from urllib.parse import urlparse
|
|
6
6
|
from metaflow.metaflow_config import SERVICE_URL
|
|
7
7
|
from metaflow.metaflow_config_funcs import init_config
|
|
8
|
+
import sys
|
|
9
|
+
import random
|
|
8
10
|
|
|
9
11
|
NVCF_URL = "https://api.nvcf.nvidia.com"
|
|
10
12
|
NVCF_SUBMIT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/functions"
|
|
@@ -113,6 +115,9 @@ class NimChatCompletion(object):
|
|
|
113
115
|
self._nim_metadata = nim_metadata
|
|
114
116
|
self.compute_provider = provider
|
|
115
117
|
self.invocations = []
|
|
118
|
+
self.max_request_retries = int(
|
|
119
|
+
os.environ.get("METAFLOW_EXT_HTTP_MAX_RETRIES", "10")
|
|
120
|
+
)
|
|
116
121
|
|
|
117
122
|
if self.compute_provider == "CoreWeave":
|
|
118
123
|
cw_model_names = [
|
|
@@ -154,33 +159,72 @@ class NimChatCompletion(object):
|
|
|
154
159
|
request_data = {"model": self.model, **kwargs}
|
|
155
160
|
request_url = f"{NVCF_SUBMIT_ENDPOINT}/{self.function_id}"
|
|
156
161
|
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
162
|
+
attempts = 0
|
|
163
|
+
while attempts < self.max_request_retries:
|
|
164
|
+
try:
|
|
165
|
+
attempts += 1
|
|
166
|
+
response = requests.post(
|
|
167
|
+
request_url,
|
|
168
|
+
headers=self._nim_metadata.get_headers_for_nvcf_request(),
|
|
169
|
+
json=request_data,
|
|
170
|
+
)
|
|
171
|
+
response.raise_for_status()
|
|
172
|
+
if response.status_code == 202:
|
|
173
|
+
invocation_id = response.headers.get("NVCF-REQID")
|
|
174
|
+
self.invocations.append(invocation_id)
|
|
175
|
+
elif response.status_code == 200:
|
|
176
|
+
return response.json()
|
|
177
|
+
except (
|
|
178
|
+
requests.exceptions.ConnectionError,
|
|
179
|
+
requests.exceptions.ReadTimeout,
|
|
180
|
+
) as e:
|
|
181
|
+
# ConnectionErrors are generally temporary errors like DNS resolution failures,
|
|
182
|
+
# timeouts etc.
|
|
183
|
+
print(
|
|
184
|
+
"received error of type {}. Retrying...".format(type(e)),
|
|
185
|
+
e,
|
|
186
|
+
file=sys.stderr,
|
|
187
|
+
)
|
|
188
|
+
time.sleep(retry_delay)
|
|
189
|
+
retry_delay *= 2 # Double the delay for the next attempt
|
|
190
|
+
retry_delay += random.uniform(0, 1) # Add jitter
|
|
191
|
+
retry_delay = min(retry_delay, 10)
|
|
168
192
|
|
|
169
193
|
def _poll():
|
|
170
194
|
poll_request_url = f"{NVCF_RESULT_ENDPOINT}/{invocation_id}"
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
|
|
195
|
+
attempts = 0
|
|
196
|
+
|
|
197
|
+
while attempts < self.max_request_retries:
|
|
198
|
+
try:
|
|
199
|
+
attempts += 1
|
|
200
|
+
poll_response = requests.get(
|
|
201
|
+
poll_request_url,
|
|
202
|
+
headers=self._nim_metadata.get_headers_for_nvcf_request(),
|
|
203
|
+
)
|
|
204
|
+
poll_response.raise_for_status()
|
|
205
|
+
if poll_response.status_code == 200:
|
|
206
|
+
return poll_response.json()
|
|
207
|
+
elif poll_response.status_code == 202:
|
|
208
|
+
return 202
|
|
209
|
+
else:
|
|
210
|
+
raise Exception(
|
|
211
|
+
f"NVCF returned {poll_response.status_code} status code. Please contact Outerbounds."
|
|
212
|
+
)
|
|
213
|
+
except (
|
|
214
|
+
requests.exceptions.ConnectionError,
|
|
215
|
+
requests.exceptions.ReadTimeout,
|
|
216
|
+
) as e:
|
|
217
|
+
# ConnectionErrors are generally temporary errors like DNS resolution failures,
|
|
218
|
+
# timeouts etc.
|
|
219
|
+
print(
|
|
220
|
+
"received error of type {}. Retrying...".format(type(e)),
|
|
221
|
+
e,
|
|
222
|
+
file=sys.stderr,
|
|
223
|
+
)
|
|
224
|
+
time.sleep(retry_delay)
|
|
225
|
+
retry_delay *= 2 # Double the delay for the next attempt
|
|
226
|
+
retry_delay += random.uniform(0, 1) # Add jitter
|
|
227
|
+
retry_delay = min(retry_delay, 10)
|
|
184
228
|
|
|
185
229
|
while True:
|
|
186
230
|
data = _poll()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|