ob-metaflow-extensions 1.1.155rc0__py2.py3-none-any.whl → 1.1.156__py2.py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of ob-metaflow-extensions might be problematic. Click here for more details.
- metaflow_extensions/outerbounds/plugins/__init__.py +1 -1
- metaflow_extensions/outerbounds/plugins/nim/card.py +1 -6
- metaflow_extensions/outerbounds/plugins/nim/{__init__.py → nim_decorator.py} +13 -49
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py +242 -242
- metaflow_extensions/outerbounds/plugins/nim/utils.py +36 -0
- metaflow_extensions/outerbounds/plugins/secrets/secrets.py +12 -5
- {ob_metaflow_extensions-1.1.155rc0.dist-info → ob_metaflow_extensions-1.1.156.dist-info}/METADATA +2 -2
- {ob_metaflow_extensions-1.1.155rc0.dist-info → ob_metaflow_extensions-1.1.156.dist-info}/RECORD +10 -10
- metaflow_extensions/outerbounds/plugins/nim/utilities.py +0 -5
- {ob_metaflow_extensions-1.1.155rc0.dist-info → ob_metaflow_extensions-1.1.156.dist-info}/WHEEL +0 -0
- {ob_metaflow_extensions-1.1.155rc0.dist-info → ob_metaflow_extensions-1.1.156.dist-info}/top_level.txt +0 -0
|
@@ -326,7 +326,7 @@ STEP_DECORATORS_DESC = [
|
|
|
326
326
|
("snowpark", ".snowpark.snowpark_decorator.SnowparkDecorator"),
|
|
327
327
|
("tensorboard", ".tensorboard.TensorboardDecorator"),
|
|
328
328
|
("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
|
|
329
|
-
("nim", ".nim.NimDecorator"),
|
|
329
|
+
("nim", ".nim.nim_decorator.NimDecorator"),
|
|
330
330
|
("ollama", ".ollama.OllamaDecorator"),
|
|
331
331
|
("app_deploy", ".apps.deploy_decorator.WorkstationAppDeployDecorator"),
|
|
332
332
|
]
|
|
@@ -1,8 +1,7 @@
|
|
|
1
|
-
import sqlite3
|
|
2
1
|
from metaflow.cards import Markdown, Table
|
|
3
2
|
from metaflow.metaflow_current import current
|
|
4
3
|
|
|
5
|
-
from .
|
|
4
|
+
from .utils import get_storage_path
|
|
6
5
|
from ..card_utilities.async_cards import CardRefresher
|
|
7
6
|
from ..card_utilities.extra_components import BarPlot, ViolinPlot
|
|
8
7
|
|
|
@@ -17,9 +16,7 @@ class NimMetricsRefresher(CardRefresher):
|
|
|
17
16
|
self._file_name = get_storage_path(current.task_id)
|
|
18
17
|
|
|
19
18
|
def sqlite_fetch_func(self, conn):
|
|
20
|
-
cursor = conn.cursor()
|
|
21
19
|
try:
|
|
22
|
-
conn = sqlite3.connect(self._file_name)
|
|
23
20
|
cursor = conn.cursor()
|
|
24
21
|
cursor.execute(
|
|
25
22
|
"SELECT error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model FROM metrics"
|
|
@@ -85,7 +82,6 @@ class NimMetricsRefresher(CardRefresher):
|
|
|
85
82
|
current_card.refresh()
|
|
86
83
|
|
|
87
84
|
def on_error(self, current_card, error_message):
|
|
88
|
-
|
|
89
85
|
if isinstance(error_message, FileNotFoundError):
|
|
90
86
|
return
|
|
91
87
|
|
|
@@ -99,7 +95,6 @@ class NimMetricsRefresher(CardRefresher):
|
|
|
99
95
|
current_card.refresh()
|
|
100
96
|
|
|
101
97
|
def update_only_components(self, current_card, data_object):
|
|
102
|
-
|
|
103
98
|
# update request success data
|
|
104
99
|
self._metrics_charts["request_success"].spec["data"][0]["values"] = [
|
|
105
100
|
{
|
|
@@ -1,64 +1,31 @@
|
|
|
1
|
-
|
|
2
|
-
|
|
3
|
-
import os, time
|
|
4
|
-
from metaflow.decorators import StepDecorator
|
|
1
|
+
import os
|
|
2
|
+
import time
|
|
5
3
|
from metaflow import current
|
|
6
|
-
|
|
4
|
+
from .utils import get_storage_path, NIM_MONITOR_LOCAL_STORAGE_ROOT
|
|
7
5
|
from .nim_manager import NimManager
|
|
6
|
+
from metaflow.decorators import StepDecorator
|
|
8
7
|
from .card import NimMetricsRefresher
|
|
9
|
-
from .utilities import get_storage_path, NIM_MONITOR_LOCAL_STORAGE_ROOT
|
|
10
|
-
from ..card_utilities.async_cards import AsyncPeriodicRefresher
|
|
11
8
|
from ..card_utilities.injector import CardDecoratorInjector
|
|
9
|
+
from ..card_utilities.async_cards import AsyncPeriodicRefresher
|
|
12
10
|
|
|
13
11
|
|
|
14
12
|
class NimDecorator(StepDecorator, CardDecoratorInjector):
|
|
15
|
-
"""
|
|
16
|
-
This decorator is used to run NIM containers in Metaflow tasks as sidecars.
|
|
17
|
-
|
|
18
|
-
User code call
|
|
19
|
-
-----------
|
|
20
|
-
@nim(
|
|
21
|
-
models=['meta/llama3-8b-instruct', 'meta/llama3-70b-instruct'],
|
|
22
|
-
backend='managed'
|
|
23
|
-
)
|
|
24
|
-
|
|
25
|
-
Valid backend options
|
|
26
|
-
---------------------
|
|
27
|
-
- 'managed': Outerbounds selects a compute provider based on the model.
|
|
28
|
-
|
|
29
|
-
Valid model options
|
|
30
|
-
----------------
|
|
31
|
-
- 'meta/llama3-8b-instruct': 8B parameter model
|
|
32
|
-
- 'meta/llama3-70b-instruct': 70B parameter model
|
|
33
|
-
- any model here: https://nvcf.ngc.nvidia.com/functions?filter=nvidia-functions
|
|
34
|
-
|
|
35
|
-
Parameters
|
|
36
|
-
----------
|
|
37
|
-
models: list[NIM]
|
|
38
|
-
List of NIM containers running models in sidecars.
|
|
39
|
-
backend: str
|
|
40
|
-
Compute provider to run the NIM container.
|
|
41
|
-
queue_timeout : int
|
|
42
|
-
Time to keep the job in NVCF's queue.
|
|
43
|
-
"""
|
|
44
|
-
|
|
45
13
|
name = "nim"
|
|
14
|
+
|
|
46
15
|
defaults = {
|
|
47
16
|
"models": [],
|
|
48
|
-
"backend": "managed",
|
|
49
17
|
"monitor": True,
|
|
50
18
|
"persist_db": False,
|
|
51
|
-
"queue_timeout": 5 * 24 * 3600, # Default 5 days in seconds
|
|
52
19
|
}
|
|
53
20
|
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
21
|
+
# Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
|
|
22
|
+
# to understand where these functions are invoked in the lifecycle of a
|
|
23
|
+
# Metaflow flow.
|
|
24
|
+
def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
|
|
58
25
|
if self.attributes["monitor"]:
|
|
59
26
|
self.attach_card_decorator(
|
|
60
27
|
flow,
|
|
61
|
-
|
|
28
|
+
step,
|
|
62
29
|
NimMetricsRefresher.CARD_ID,
|
|
63
30
|
"blank",
|
|
64
31
|
refresh_interval=4.0,
|
|
@@ -68,11 +35,9 @@ class NimDecorator(StepDecorator, CardDecoratorInjector):
|
|
|
68
35
|
{
|
|
69
36
|
"nim": NimManager(
|
|
70
37
|
models=self.attributes["models"],
|
|
71
|
-
backend=self.attributes["backend"],
|
|
72
38
|
flow=flow,
|
|
73
|
-
step_name=
|
|
39
|
+
step_name=step,
|
|
74
40
|
monitor=self.attributes["monitor"],
|
|
75
|
-
queue_timeout=self.attributes["queue_timeout"],
|
|
76
41
|
)
|
|
77
42
|
}
|
|
78
43
|
)
|
|
@@ -81,15 +46,14 @@ class NimDecorator(StepDecorator, CardDecoratorInjector):
|
|
|
81
46
|
self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
|
|
82
47
|
):
|
|
83
48
|
if self.attributes["monitor"]:
|
|
84
|
-
|
|
85
49
|
import sqlite3
|
|
86
|
-
from metaflow import current
|
|
87
50
|
|
|
88
51
|
file_path = get_storage_path(current.task_id)
|
|
89
52
|
if os.path.exists(file_path):
|
|
90
53
|
os.remove(file_path)
|
|
91
54
|
os.makedirs(NIM_MONITOR_LOCAL_STORAGE_ROOT, exist_ok=True)
|
|
92
55
|
conn = sqlite3.connect(file_path)
|
|
56
|
+
|
|
93
57
|
cursor = conn.cursor()
|
|
94
58
|
cursor.execute(
|
|
95
59
|
"""
|
|
@@ -1,47 +1,163 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
from .
|
|
6
|
-
from
|
|
1
|
+
import sys
|
|
2
|
+
import time
|
|
3
|
+
import requests
|
|
4
|
+
import sqlite3
|
|
5
|
+
from urllib3.util.retry import Retry
|
|
6
|
+
from requests.adapters import HTTPAdapter
|
|
7
|
+
from typing import Dict, Optional, Any
|
|
8
|
+
from .utils import get_ngc_response, get_storage_path
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def nvcf_submit_helper(
|
|
12
|
+
url: str,
|
|
13
|
+
payload: Dict[str, Any],
|
|
14
|
+
headers: Optional[Dict[str, str]] = None,
|
|
15
|
+
timeout: int = 30,
|
|
16
|
+
max_retries: int = 300,
|
|
17
|
+
backoff_factor: float = 0.3,
|
|
18
|
+
request_delay: float = 1.1,
|
|
19
|
+
log_callback: Optional[callable] = None,
|
|
20
|
+
) -> Dict[str, Any]:
|
|
21
|
+
def _log_error(start_time: float, status_code: int, poll_count: int):
|
|
22
|
+
if log_callback:
|
|
23
|
+
end_time = time.time()
|
|
24
|
+
try:
|
|
25
|
+
log_callback({}, end_time - start_time, status_code, poll_count)
|
|
26
|
+
except Exception as log_error:
|
|
27
|
+
print(f"Warning: Logging callback failed: {log_error}")
|
|
28
|
+
|
|
29
|
+
# use default headers
|
|
30
|
+
if not headers:
|
|
31
|
+
headers = {"accept": "application/json", "content-type": "application/json"}
|
|
32
|
+
print(f"Using Default Headers: {headers}")
|
|
33
|
+
|
|
34
|
+
# Configure session with retry strategy
|
|
35
|
+
session = requests.Session()
|
|
36
|
+
status_forcelist = [429, 500, 502, 503, 504, 404]
|
|
37
|
+
retry_strategy = Retry(
|
|
38
|
+
total=max_retries,
|
|
39
|
+
backoff_factor=backoff_factor,
|
|
40
|
+
status_forcelist=status_forcelist,
|
|
41
|
+
allowed_methods=["GET", "POST"],
|
|
42
|
+
)
|
|
43
|
+
adapter = HTTPAdapter(max_retries=retry_strategy)
|
|
44
|
+
session.mount("http://", adapter)
|
|
45
|
+
session.mount("https://", adapter)
|
|
46
|
+
|
|
47
|
+
# Add artificial delay if specified
|
|
48
|
+
time.sleep(request_delay)
|
|
49
|
+
|
|
50
|
+
start_time = time.time()
|
|
51
|
+
poll_count = 0
|
|
52
|
+
status_code = 0
|
|
53
|
+
response_data = {}
|
|
54
|
+
|
|
55
|
+
try:
|
|
56
|
+
# Make initial request
|
|
57
|
+
response = session.post(url, json=payload, headers=headers, timeout=timeout)
|
|
58
|
+
time.sleep(request_delay)
|
|
59
|
+
|
|
60
|
+
# Handle initial response
|
|
61
|
+
response.raise_for_status()
|
|
62
|
+
request_id = response.headers.get("NVCF-REQID")
|
|
63
|
+
polling_url = f"https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/{request_id}"
|
|
64
|
+
|
|
65
|
+
print(f"Polling NVCF Request ID: {request_id}")
|
|
66
|
+
|
|
67
|
+
# Initial response status
|
|
68
|
+
status_code = response.status_code
|
|
69
|
+
print(f"Initial response status: {status_code}")
|
|
70
|
+
|
|
71
|
+
# Create a variable to store the final response
|
|
72
|
+
final_response = response
|
|
73
|
+
|
|
74
|
+
# Continue polling while we get 202 (Accepted/Processing)
|
|
75
|
+
while status_code == 202:
|
|
76
|
+
poll_count += 1
|
|
77
|
+
print(f"Polling attempt #{poll_count} to {polling_url}")
|
|
78
|
+
|
|
79
|
+
# Wait before next poll
|
|
80
|
+
time.sleep(request_delay)
|
|
81
|
+
|
|
82
|
+
# Make a new poll request
|
|
83
|
+
poll_response = session.get(polling_url, headers=headers, timeout=timeout)
|
|
84
|
+
status_code = poll_response.status_code
|
|
85
|
+
print(f"Poll #{poll_count} status: {status_code}")
|
|
86
|
+
|
|
87
|
+
# Check for errors
|
|
88
|
+
try:
|
|
89
|
+
poll_response.raise_for_status()
|
|
90
|
+
except requests.exceptions.HTTPError as e:
|
|
91
|
+
print(f"Poll request failed: {str(e)}")
|
|
92
|
+
poll_response.close()
|
|
93
|
+
# Log the error before re-raising
|
|
94
|
+
_log_error(start_time, poll_response.status_code, poll_count)
|
|
95
|
+
raise
|
|
96
|
+
|
|
97
|
+
# If status is 200, the job is complete
|
|
98
|
+
if status_code == 200:
|
|
99
|
+
print("Polling complete - job finished successfully")
|
|
100
|
+
# Update our final response to be this poll response
|
|
101
|
+
final_response = poll_response
|
|
102
|
+
break
|
|
103
|
+
|
|
104
|
+
# Close this poll response if we're going to loop again
|
|
105
|
+
if status_code == 202:
|
|
106
|
+
poll_response.close()
|
|
107
|
+
|
|
108
|
+
# If we exited the loop without a 200 status, something went wrong
|
|
109
|
+
if status_code != 200:
|
|
110
|
+
print(f"Polling ended with unexpected status: {status_code}")
|
|
111
|
+
# Log the error before raising
|
|
112
|
+
_log_error(start_time, status_code, poll_count)
|
|
113
|
+
raise Exception(f"Unexpected status code after polling: {status_code}")
|
|
114
|
+
|
|
115
|
+
# Get the response data for logging
|
|
116
|
+
response_data = final_response.json()
|
|
117
|
+
|
|
118
|
+
except requests.exceptions.HTTPError as e:
|
|
119
|
+
# Handle HTTP errors (4xx, 5xx status codes)
|
|
120
|
+
status_code = e.response.status_code if e.response else 0
|
|
121
|
+
print(f"HTTP Error: {str(e)}", file=sys.stderr)
|
|
122
|
+
# Log the error
|
|
123
|
+
_log_error(start_time, status_code, poll_count)
|
|
124
|
+
raise
|
|
125
|
+
|
|
126
|
+
except Exception as e:
|
|
127
|
+
# Handle other errors (connection errors, timeouts, etc.)
|
|
128
|
+
print(f"Request Error: {str(e)}", file=sys.stderr)
|
|
129
|
+
# Log the error with status_code 0 to indicate non-HTTP error
|
|
130
|
+
_log_error(start_time, 0, poll_count)
|
|
131
|
+
raise
|
|
132
|
+
|
|
133
|
+
# Calculate final duration and log successful requests
|
|
134
|
+
end_time = time.time()
|
|
135
|
+
duration = end_time - start_time
|
|
136
|
+
|
|
137
|
+
# Call the logging callback if provided
|
|
138
|
+
if log_callback:
|
|
139
|
+
try:
|
|
140
|
+
log_callback(response_data, duration, status_code, poll_count)
|
|
141
|
+
except Exception as e:
|
|
142
|
+
print(f"Warning: Logging callback failed: {e}")
|
|
7
143
|
|
|
144
|
+
# Log metrics
|
|
145
|
+
print(
|
|
146
|
+
f"Request completed: duration={duration:.2f}s, polls={poll_count}, "
|
|
147
|
+
f"status={status_code}, size={len(final_response.content)} bytes"
|
|
148
|
+
)
|
|
8
149
|
|
|
9
|
-
|
|
10
|
-
NVCF_SUBMIT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/functions"
|
|
11
|
-
NVCF_RESULT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/status"
|
|
12
|
-
NVCF_POLL_INTERVAL_SECONDS = 1
|
|
13
|
-
COMMON_HEADERS = {
|
|
14
|
-
"accept": "application/json",
|
|
15
|
-
"Content-Type": "application/json",
|
|
16
|
-
"nvcf-feature-enable-gateway-timeout": "true",
|
|
17
|
-
}
|
|
150
|
+
return response_data
|
|
18
151
|
|
|
19
152
|
|
|
20
153
|
class NimMetadata(object):
|
|
21
154
|
def __init__(self):
|
|
22
155
|
self._nvcf_chat_completion_models = []
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
conf = init_config()
|
|
26
|
-
|
|
27
|
-
if "OBP_AUTH_SERVER" in conf:
|
|
28
|
-
auth_host = conf["OBP_AUTH_SERVER"]
|
|
29
|
-
else:
|
|
30
|
-
auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]
|
|
156
|
+
ngc_response = get_ngc_response()
|
|
31
157
|
|
|
32
|
-
|
|
158
|
+
self.ngc_api_key = ngc_response["nvcf"]["api_key"]
|
|
33
159
|
|
|
34
|
-
|
|
35
|
-
headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
|
|
36
|
-
res = requests.get(nim_info_url, headers=headers)
|
|
37
|
-
else:
|
|
38
|
-
headers = json.loads(os.environ.get("METAFLOW_SERVICE_HEADERS"))
|
|
39
|
-
res = requests.get(nim_info_url, headers=headers)
|
|
40
|
-
|
|
41
|
-
res.raise_for_status()
|
|
42
|
-
self._ngc_api_key = res.json()["nvcf"]["api_key"]
|
|
43
|
-
|
|
44
|
-
for model in res.json()["nvcf"]["functions"]:
|
|
160
|
+
for model in ngc_response["nvcf"]["functions"]:
|
|
45
161
|
self._nvcf_chat_completion_models.append(
|
|
46
162
|
{
|
|
47
163
|
"name": model["model_key"],
|
|
@@ -49,64 +165,48 @@ class NimMetadata(object):
|
|
|
49
165
|
"version-id": model["version"],
|
|
50
166
|
}
|
|
51
167
|
)
|
|
52
|
-
for model in res.json()["coreweave"]["containers"]:
|
|
53
|
-
self._coreweave_chat_completion_models.append(
|
|
54
|
-
{"name": model["nim_name"], "ip-address": model["ip_addr"]}
|
|
55
|
-
)
|
|
56
168
|
|
|
57
169
|
def get_nvcf_chat_completion_models(self):
|
|
58
170
|
return self._nvcf_chat_completion_models
|
|
59
171
|
|
|
60
172
|
def get_headers_for_nvcf_request(self):
|
|
61
|
-
return {
|
|
173
|
+
return {
|
|
174
|
+
"accept": "application/json",
|
|
175
|
+
"content-type": "application/json",
|
|
176
|
+
"Authorization": f"Bearer {self.ngc_api_key}",
|
|
177
|
+
"NVCF-POLL-SECONDS": "5",
|
|
178
|
+
}
|
|
62
179
|
|
|
63
180
|
|
|
64
181
|
class NimManager(object):
|
|
65
|
-
def __init__(self, models,
|
|
66
|
-
|
|
182
|
+
def __init__(self, models, flow, step_name, monitor):
|
|
67
183
|
nim_metadata = NimMetadata()
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
|
|
72
|
-
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
f"\nYou can choose from these options: {nvcf_models}\n\n"
|
|
87
|
-
"Reach out to Outerbounds if there are other models you'd like supported."
|
|
88
|
-
)
|
|
89
|
-
else:
|
|
90
|
-
raise ValueError(
|
|
91
|
-
f"Backend {backend} not supported by the Outerbounds @nim offering. Please reach out to Outerbounds."
|
|
92
|
-
)
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
class JobStatus(object):
|
|
96
|
-
SUBMITTED = "SUBMITTED"
|
|
97
|
-
RUNNING = "RUNNING"
|
|
98
|
-
SUCCESSFUL = "SUCCESSFUL"
|
|
99
|
-
FAILED = "FAILED"
|
|
184
|
+
nvcf_models = [
|
|
185
|
+
m["name"] for m in nim_metadata.get_nvcf_chat_completion_models()
|
|
186
|
+
]
|
|
187
|
+
self.models = {}
|
|
188
|
+
|
|
189
|
+
for each_model in models:
|
|
190
|
+
if each_model in nvcf_models:
|
|
191
|
+
self.models[each_model] = NimChatCompletion(
|
|
192
|
+
model=each_model,
|
|
193
|
+
nim_metadata=nim_metadata,
|
|
194
|
+
monitor=monitor,
|
|
195
|
+
)
|
|
196
|
+
else:
|
|
197
|
+
raise ValueError(
|
|
198
|
+
f"Model {each_model} not supported by the Outerbounds @nim offering."
|
|
199
|
+
f"\nYou can choose from these options: {nvcf_models}\n\n"
|
|
200
|
+
"Reach out to Outerbounds if there are other models you'd like supported."
|
|
201
|
+
)
|
|
100
202
|
|
|
101
203
|
|
|
102
204
|
class NimChatCompletion(object):
|
|
103
205
|
def __init__(
|
|
104
206
|
self,
|
|
105
|
-
model="meta/llama3-8b-instruct",
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
monitor=False,
|
|
109
|
-
queue_timeout=None,
|
|
207
|
+
model: str = "meta/llama3-8b-instruct",
|
|
208
|
+
nim_metadata: NimMetadata = None,
|
|
209
|
+
monitor: bool = False,
|
|
110
210
|
**kwargs,
|
|
111
211
|
):
|
|
112
212
|
if nim_metadata is None:
|
|
@@ -114,79 +214,54 @@ class NimChatCompletion(object):
|
|
|
114
214
|
"NimMetadata object is required to initialize NimChatCompletion object."
|
|
115
215
|
)
|
|
116
216
|
|
|
117
|
-
self.
|
|
118
|
-
self.
|
|
119
|
-
self.invocations = []
|
|
120
|
-
self.max_request_retries = int(
|
|
121
|
-
os.environ.get("METAFLOW_EXT_HTTP_MAX_RETRIES", "10")
|
|
122
|
-
)
|
|
217
|
+
self.model_name = model
|
|
218
|
+
self.nim_metadata = nim_metadata
|
|
123
219
|
self.monitor = monitor
|
|
124
220
|
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
self.model = model
|
|
130
|
-
self.function_id = self._nim_metadata.get_nvcf_chat_completion_models()[
|
|
131
|
-
nvcf_model_names.index(model)
|
|
132
|
-
]["function-id"]
|
|
133
|
-
self.version_id = self._nim_metadata.get_nvcf_chat_completion_models()[
|
|
134
|
-
nvcf_model_names.index(model)
|
|
135
|
-
]["version-id"]
|
|
136
|
-
else:
|
|
221
|
+
all_nvcf_models = self.nim_metadata.get_nvcf_chat_completion_models()
|
|
222
|
+
all_nvcf_model_names = [m["name"] for m in all_nvcf_models]
|
|
223
|
+
|
|
224
|
+
if self.model_name not in all_nvcf_model_names:
|
|
137
225
|
raise ValueError(
|
|
138
|
-
f"
|
|
226
|
+
f"Model {self.model_name} not found in available NVCF models"
|
|
139
227
|
)
|
|
140
228
|
|
|
141
|
-
|
|
229
|
+
self.model = all_nvcf_models[all_nvcf_model_names.index(self.model_name)]
|
|
230
|
+
self.function_id = self.model["function-id"]
|
|
231
|
+
self.version_id = self.model["version-id"]
|
|
232
|
+
|
|
142
233
|
self.first_request = True
|
|
143
234
|
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
147
|
-
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
def _log_stats(self, response, e2e_time):
|
|
168
|
-
stats = {}
|
|
169
|
-
if response.status_code == 200:
|
|
170
|
-
stats["success"] = 1
|
|
171
|
-
stats["error"] = 0
|
|
235
|
+
def log_stats(self, response_data, duration, status_code, poll_count):
|
|
236
|
+
if not self.monitor:
|
|
237
|
+
return
|
|
238
|
+
|
|
239
|
+
stats = {
|
|
240
|
+
"status_code": status_code,
|
|
241
|
+
"success": 1 if status_code == 200 else 0,
|
|
242
|
+
"error": 0 if status_code == 200 else 1,
|
|
243
|
+
"e2e_time": duration,
|
|
244
|
+
"model": self.model_name,
|
|
245
|
+
"poll_count": poll_count,
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
if status_code == 200 and response_data:
|
|
249
|
+
try:
|
|
250
|
+
stats["prompt_tokens"] = response_data["usage"]["prompt_tokens"]
|
|
251
|
+
except (KeyError, TypeError):
|
|
252
|
+
stats["prompt_tokens"] = None
|
|
253
|
+
|
|
254
|
+
try:
|
|
255
|
+
stats["completion_tokens"] = response_data["usage"]["completion_tokens"]
|
|
256
|
+
except (KeyError, TypeError):
|
|
257
|
+
stats["completion_tokens"] = None
|
|
172
258
|
else:
|
|
173
|
-
stats["success"] = 0
|
|
174
|
-
stats["error"] = 1
|
|
175
|
-
stats["status_code"] = response.status_code
|
|
176
|
-
try:
|
|
177
|
-
stats["prompt_tokens"] = response.json()["usage"]["prompt_tokens"]
|
|
178
|
-
except KeyError:
|
|
179
259
|
stats["prompt_tokens"] = None
|
|
180
|
-
try:
|
|
181
|
-
stats["completion_tokens"] = response.json()["usage"]["completion_tokens"]
|
|
182
|
-
except KeyError:
|
|
183
260
|
stats["completion_tokens"] = None
|
|
184
|
-
stats["e2e_time"] = e2e_time
|
|
185
|
-
stats["provider"] = self.compute_provider
|
|
186
|
-
stats["model"] = self.model
|
|
187
261
|
|
|
188
262
|
conn = sqlite3.connect(self.file_name)
|
|
189
263
|
cursor = conn.cursor()
|
|
264
|
+
|
|
190
265
|
try:
|
|
191
266
|
cursor.execute(
|
|
192
267
|
"""
|
|
@@ -207,112 +282,37 @@ class NimChatCompletion(object):
|
|
|
207
282
|
finally:
|
|
208
283
|
conn.close()
|
|
209
284
|
|
|
210
|
-
@retry_on_status(status_codes=[500], max_retries=3, delay=5)
|
|
211
|
-
@retry_on_status(status_codes=[504])
|
|
212
285
|
def __call__(self, **kwargs):
|
|
213
|
-
|
|
214
286
|
if self.first_request:
|
|
215
|
-
# Put here to guarantee self.file_name is set after task_id exists.
|
|
216
287
|
from metaflow import current
|
|
217
288
|
|
|
218
289
|
self.file_name = get_storage_path(current.task_id)
|
|
290
|
+
self.first_request = False
|
|
219
291
|
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
)
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
"[@nim ERROR] The OpenAI-compatible returned a 400 status code. "
|
|
248
|
-
+ "Known causes include improper requests or prompts with too many tokens for the selected model. "
|
|
249
|
-
+ "Please contact Outerbounds if you need assistance resolving the issue."
|
|
250
|
-
)
|
|
251
|
-
print(msg, file=sys.stderr)
|
|
252
|
-
self._result = {"ERROR": msg}
|
|
253
|
-
return self._result
|
|
254
|
-
except (
|
|
255
|
-
requests.exceptions.ConnectionError,
|
|
256
|
-
requests.exceptions.ReadTimeout,
|
|
257
|
-
) as e:
|
|
258
|
-
# ConnectionErrors are generally temporary errors like DNS resolution failures,
|
|
259
|
-
# timeouts etc.
|
|
260
|
-
print(
|
|
261
|
-
"received error of type {}. Retrying...".format(type(e)),
|
|
262
|
-
e,
|
|
263
|
-
file=sys.stderr,
|
|
264
|
-
)
|
|
265
|
-
time.sleep(retry_delay)
|
|
266
|
-
retry_delay *= 2 # Double the delay for the next attempt
|
|
267
|
-
retry_delay += random.uniform(0, 1) # Add jitter
|
|
268
|
-
retry_delay = min(retry_delay, 10)
|
|
269
|
-
|
|
270
|
-
def _poll():
|
|
271
|
-
poll_request_url = f"{NVCF_RESULT_ENDPOINT}/{invocation_id}"
|
|
272
|
-
attempts = 0
|
|
273
|
-
retry_delay = 1
|
|
274
|
-
while attempts < self.max_request_retries:
|
|
275
|
-
try:
|
|
276
|
-
attempts += 1
|
|
277
|
-
poll_response = requests.get(
|
|
278
|
-
poll_request_url,
|
|
279
|
-
headers=self._nim_metadata.get_headers_for_nvcf_request(),
|
|
280
|
-
)
|
|
281
|
-
if poll_response.status_code == 200:
|
|
282
|
-
tf = time.time()
|
|
283
|
-
self._log_stats(response, tf - t0)
|
|
284
|
-
self._status = JobStatus.SUCCESSFUL
|
|
285
|
-
self._result = poll_response.json()
|
|
286
|
-
return self._result
|
|
287
|
-
elif poll_response.status_code == 202:
|
|
288
|
-
self._status = JobStatus.SUBMITTED
|
|
289
|
-
return 202
|
|
290
|
-
elif poll_response.status_code == 400:
|
|
291
|
-
self._status = JobStatus.FAILED
|
|
292
|
-
msg = (
|
|
293
|
-
"[@nim ERROR] The OpenAI-compatible API returned a 400 status code. "
|
|
294
|
-
+ "Known causes include improper requests or prompts with too many tokens for the selected model. "
|
|
295
|
-
+ "Please contact Outerbounds if you need assistance resolving the issue."
|
|
296
|
-
)
|
|
297
|
-
print(msg, file=sys.stderr)
|
|
298
|
-
self._result = {"@nim ERROR": msg}
|
|
299
|
-
return self._result
|
|
300
|
-
except (
|
|
301
|
-
requests.exceptions.ConnectionError,
|
|
302
|
-
requests.exceptions.ReadTimeout,
|
|
303
|
-
) as e:
|
|
304
|
-
print(
|
|
305
|
-
"received error of type {}. Retrying...".format(type(e)),
|
|
306
|
-
e,
|
|
307
|
-
file=sys.stderr,
|
|
308
|
-
)
|
|
309
|
-
time.sleep(retry_delay)
|
|
310
|
-
retry_delay *= 2 # Double the delay for the next attempt
|
|
311
|
-
retry_delay += random.uniform(0, 1) # Add jitter
|
|
312
|
-
retry_delay = min(retry_delay, 10)
|
|
313
|
-
|
|
314
|
-
while True:
|
|
315
|
-
data = _poll()
|
|
316
|
-
if data and data != 202:
|
|
317
|
-
return data
|
|
318
|
-
time.sleep(NVCF_POLL_INTERVAL_SECONDS)
|
|
292
|
+
# Create log callback if monitoring is enabled
|
|
293
|
+
log_callback = self.log_stats if self.monitor else None
|
|
294
|
+
|
|
295
|
+
request_data = {"model": self.model_name, **kwargs}
|
|
296
|
+
request_url = (
|
|
297
|
+
f"https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/{self.function_id}"
|
|
298
|
+
)
|
|
299
|
+
|
|
300
|
+
try:
|
|
301
|
+
response_data = nvcf_submit_helper(
|
|
302
|
+
url=request_url,
|
|
303
|
+
payload=request_data,
|
|
304
|
+
headers=self.nim_metadata.get_headers_for_nvcf_request(),
|
|
305
|
+
log_callback=log_callback,
|
|
306
|
+
)
|
|
307
|
+
|
|
308
|
+
return response_data
|
|
309
|
+
|
|
310
|
+
except requests.exceptions.HTTPError as e:
|
|
311
|
+
error_msg = f"[@nim ERROR] NVCF API request failed: {str(e)}"
|
|
312
|
+
print(error_msg, file=sys.stderr)
|
|
313
|
+
raise
|
|
314
|
+
|
|
315
|
+
except Exception as e:
|
|
316
|
+
error_msg = f"[@nim ERROR] Unexpected error: {str(e)}"
|
|
317
|
+
print(error_msg, file=sys.stderr)
|
|
318
|
+
raise
|
|
@@ -0,0 +1,36 @@
|
|
|
1
|
+
import os
|
|
2
|
+
import json
|
|
3
|
+
import requests
|
|
4
|
+
from urllib.parse import urlparse
|
|
5
|
+
from metaflow.metaflow_config import SERVICE_URL
|
|
6
|
+
from metaflow.metaflow_config_funcs import init_config
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
NIM_MONITOR_LOCAL_STORAGE_ROOT = ".nim-monitor"
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def get_storage_path(task_id):
|
|
13
|
+
return f"{NIM_MONITOR_LOCAL_STORAGE_ROOT}/" + task_id + ".sqlite"
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def get_ngc_response():
|
|
17
|
+
conf = init_config()
|
|
18
|
+
if "OBP_AUTH_SERVER" in conf:
|
|
19
|
+
auth_host = conf["OBP_AUTH_SERVER"]
|
|
20
|
+
else:
|
|
21
|
+
auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]
|
|
22
|
+
|
|
23
|
+
# NOTE: reusing the same auth_host as the one used in NimMetadata,
|
|
24
|
+
# however, user should not need to use nim container to use @nvct.
|
|
25
|
+
# May want to refactor this to a common endpoint.
|
|
26
|
+
nim_info_url = "https://" + auth_host + "/generate/nim"
|
|
27
|
+
|
|
28
|
+
if "METAFLOW_SERVICE_AUTH_KEY" in conf:
|
|
29
|
+
headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
|
|
30
|
+
res = requests.get(nim_info_url, headers=headers)
|
|
31
|
+
else:
|
|
32
|
+
headers = json.loads(os.environ.get("METAFLOW_SERVICE_HEADERS"))
|
|
33
|
+
res = requests.get(nim_info_url, headers=headers)
|
|
34
|
+
|
|
35
|
+
res.raise_for_status()
|
|
36
|
+
return res.json()
|
|
@@ -187,11 +187,18 @@ class OuterboundsSecretsProvider(SecretsProvider):
|
|
|
187
187
|
raise OuterboundsSecretsException(
|
|
188
188
|
f"Server error: {response.text}. Please reach out to your Outerbounds support team."
|
|
189
189
|
)
|
|
190
|
-
|
|
190
|
+
|
|
191
|
+
body = response.json()
|
|
192
|
+
status_code = body.get("error", {}).get("statusCode", response.status_code)
|
|
191
193
|
if status_code == 404:
|
|
192
|
-
raise OuterboundsSecretsException(f"Secret not found: {
|
|
194
|
+
raise OuterboundsSecretsException(f"Secret not found: {body}")
|
|
193
195
|
|
|
194
196
|
if status_code >= 400:
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
|
|
197
|
+
try:
|
|
198
|
+
raise OuterboundsSecretsException(
|
|
199
|
+
f"status_code={status_code}\t*{body['error']['details']['kind']}*\n{body['error']['details']['message']}"
|
|
200
|
+
)
|
|
201
|
+
except KeyError:
|
|
202
|
+
raise OuterboundsSecretsException(
|
|
203
|
+
f"status_code={status_code} Unexpected error: {body}"
|
|
204
|
+
)
|
{ob_metaflow_extensions-1.1.155rc0.dist-info → ob_metaflow_extensions-1.1.156.dist-info}/METADATA
RENAMED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
Metadata-Version: 2.1
|
|
2
2
|
Name: ob-metaflow-extensions
|
|
3
|
-
Version: 1.1.
|
|
3
|
+
Version: 1.1.156
|
|
4
4
|
Summary: Outerbounds Platform Extensions for Metaflow
|
|
5
5
|
Author: Outerbounds, Inc.
|
|
6
6
|
License: Commercial
|
|
7
7
|
Description-Content-Type: text/markdown
|
|
8
8
|
Requires-Dist: boto3
|
|
9
9
|
Requires-Dist: kubernetes
|
|
10
|
-
Requires-Dist: ob-metaflow (==2.15.
|
|
10
|
+
Requires-Dist: ob-metaflow (==2.15.14.1)
|
|
11
11
|
|
|
12
12
|
# Outerbounds platform package
|
|
13
13
|
|
{ob_metaflow_extensions-1.1.155rc0.dist-info → ob_metaflow_extensions-1.1.156.dist-info}/RECORD
RENAMED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
metaflow_extensions/outerbounds/__init__.py,sha256=Gb8u06s9ClQsA_vzxmkCzuMnigPy7kKcDnLfb7eB-64,514
|
|
2
2
|
metaflow_extensions/outerbounds/remote_config.py,sha256=pEFJuKDYs98eoB_-ryPjVi9b_c4gpHMdBHE14ltoxIU,4672
|
|
3
3
|
metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
|
|
4
|
-
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=
|
|
4
|
+
metaflow_extensions/outerbounds/plugins/__init__.py,sha256=gytuNt3lNabirHLEYzrmHFMp-JWh8dA2AZPK11HmaNw,13242
|
|
5
5
|
metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=_Q9_2EL0Xy77bCRphkwT1aSu8gQXRDOH-Z-RxTUO8N4,2202
|
|
6
6
|
metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
|
|
7
7
|
metaflow_extensions/outerbounds/plugins/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -23,10 +23,10 @@ metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py,sha256=kq
|
|
|
23
23
|
metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py,sha256=MXSIp05-jvt8Q2uGaLKjtuM_ToLeRLxhtMbfHc9Kcko,1515
|
|
24
24
|
metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
25
25
|
metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py,sha256=fx_XUkgR4r6hF2ilDfT5LubRyVrYMVIv5f6clHkCaEk,5988
|
|
26
|
-
metaflow_extensions/outerbounds/plugins/nim/
|
|
27
|
-
metaflow_extensions/outerbounds/plugins/nim/
|
|
28
|
-
metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=
|
|
29
|
-
metaflow_extensions/outerbounds/plugins/nim/
|
|
26
|
+
metaflow_extensions/outerbounds/plugins/nim/card.py,sha256=dXOJvsZed5NyYyxYLPDvtwg9z_X4azL9HTJGYaiNriY,4690
|
|
27
|
+
metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py,sha256=50YVvC7mcZYlPluM0Wq1UtufhzlQb-RxzZkTOJJ3LkM,3439
|
|
28
|
+
metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=5YkohM-vfoDHPUMWb19sY0HErORoKOKf4jexERJTO80,10912
|
|
29
|
+
metaflow_extensions/outerbounds/plugins/nim/utils.py,sha256=nU-v1sheBjmITXfHiJx2ucm_Tq_nGb5BcuAm5c235cQ,1164
|
|
30
30
|
metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
31
31
|
metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=aGHdNw_hqBu8i0zWXcatQM6e769wUXox0l8g0f6fNZ8,146
|
|
32
32
|
metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256=-Pm9cOWUzpv94TvVUeq-FenAWdfLBJd5N7WPqIGZVqU,3671
|
|
@@ -47,7 +47,7 @@ metaflow_extensions/outerbounds/plugins/ollama/ollama.py,sha256=KlP8_EmnUoi8-Pid
|
|
|
47
47
|
metaflow_extensions/outerbounds/plugins/profilers/deco_injector.py,sha256=oI_C3c64XBm7n88FILqHwn-Nnc5DeT_68I67lM9rXaI,2434
|
|
48
48
|
metaflow_extensions/outerbounds/plugins/profilers/gpu_profile_decorator.py,sha256=gDHQ2sMIp4NuZSzUspbSd8RGdFAoO5mgZAyFcZ2a51Y,2619
|
|
49
49
|
metaflow_extensions/outerbounds/plugins/secrets/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
50
|
-
metaflow_extensions/outerbounds/plugins/secrets/secrets.py,sha256=
|
|
50
|
+
metaflow_extensions/outerbounds/plugins/secrets/secrets.py,sha256=3s98hO_twKkM22tKyDdcUjGQNfYpSXW_jLKISV9ju_U,8433
|
|
51
51
|
metaflow_extensions/outerbounds/plugins/snowflake/__init__.py,sha256=RG4ixt3jwqcK1_tt0QxLcUbNmf7wWAMnZhBx-ZMGgLk,114
|
|
52
52
|
metaflow_extensions/outerbounds/plugins/snowflake/snowflake.py,sha256=zoWSHM4CJSfUmJSP-_i4zREWyQOW4USBlgjhQnEhlTE,13669
|
|
53
53
|
metaflow_extensions/outerbounds/plugins/snowpark/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
@@ -68,7 +68,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3u
|
|
|
68
68
|
metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
|
|
69
69
|
metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2zwqkvlmFS6bcfYD_CX6CMko9DHQokMaH1iBshA,47
|
|
70
70
|
metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
|
|
71
|
-
ob_metaflow_extensions-1.1.
|
|
72
|
-
ob_metaflow_extensions-1.1.
|
|
73
|
-
ob_metaflow_extensions-1.1.
|
|
74
|
-
ob_metaflow_extensions-1.1.
|
|
71
|
+
ob_metaflow_extensions-1.1.156.dist-info/METADATA,sha256=G9c19j9g0v8dDQU5sP5Zaaub2fot__EMCJ6iBQBb4Qo,521
|
|
72
|
+
ob_metaflow_extensions-1.1.156.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
|
|
73
|
+
ob_metaflow_extensions-1.1.156.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
|
|
74
|
+
ob_metaflow_extensions-1.1.156.dist-info/RECORD,,
|
{ob_metaflow_extensions-1.1.155rc0.dist-info → ob_metaflow_extensions-1.1.156.dist-info}/WHEEL
RENAMED
|
File without changes
|
|
File without changes
|