ob-metaflow-extensions 1.1.155__py2.py3-none-any.whl → 1.1.156__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -326,7 +326,7 @@ STEP_DECORATORS_DESC = [
326
326
  ("snowpark", ".snowpark.snowpark_decorator.SnowparkDecorator"),
327
327
  ("tensorboard", ".tensorboard.TensorboardDecorator"),
328
328
  ("gpu_profile", ".profilers.gpu_profile_decorator.GPUProfileDecorator"),
329
- ("nim", ".nim.NimDecorator"),
329
+ ("nim", ".nim.nim_decorator.NimDecorator"),
330
330
  ("ollama", ".ollama.OllamaDecorator"),
331
331
  ("app_deploy", ".apps.deploy_decorator.WorkstationAppDeployDecorator"),
332
332
  ]
@@ -1,8 +1,7 @@
1
- import sqlite3
2
1
  from metaflow.cards import Markdown, Table
3
2
  from metaflow.metaflow_current import current
4
3
 
5
- from .utilities import get_storage_path
4
+ from .utils import get_storage_path
6
5
  from ..card_utilities.async_cards import CardRefresher
7
6
  from ..card_utilities.extra_components import BarPlot, ViolinPlot
8
7
 
@@ -17,9 +16,7 @@ class NimMetricsRefresher(CardRefresher):
17
16
  self._file_name = get_storage_path(current.task_id)
18
17
 
19
18
  def sqlite_fetch_func(self, conn):
20
- cursor = conn.cursor()
21
19
  try:
22
- conn = sqlite3.connect(self._file_name)
23
20
  cursor = conn.cursor()
24
21
  cursor.execute(
25
22
  "SELECT error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model FROM metrics"
@@ -85,7 +82,6 @@ class NimMetricsRefresher(CardRefresher):
85
82
  current_card.refresh()
86
83
 
87
84
  def on_error(self, current_card, error_message):
88
-
89
85
  if isinstance(error_message, FileNotFoundError):
90
86
  return
91
87
 
@@ -99,7 +95,6 @@ class NimMetricsRefresher(CardRefresher):
99
95
  current_card.refresh()
100
96
 
101
97
  def update_only_components(self, current_card, data_object):
102
-
103
98
  # update request success data
104
99
  self._metrics_charts["request_success"].spec["data"][0]["values"] = [
105
100
  {
@@ -1,64 +1,31 @@
1
- from functools import partial
2
- from uuid import uuid4
3
- import os, time
4
- from metaflow.decorators import StepDecorator
1
+ import os
2
+ import time
5
3
  from metaflow import current
6
-
4
+ from .utils import get_storage_path, NIM_MONITOR_LOCAL_STORAGE_ROOT
7
5
  from .nim_manager import NimManager
6
+ from metaflow.decorators import StepDecorator
8
7
  from .card import NimMetricsRefresher
9
- from .utilities import get_storage_path, NIM_MONITOR_LOCAL_STORAGE_ROOT
10
- from ..card_utilities.async_cards import AsyncPeriodicRefresher
11
8
  from ..card_utilities.injector import CardDecoratorInjector
9
+ from ..card_utilities.async_cards import AsyncPeriodicRefresher
12
10
 
13
11
 
14
12
  class NimDecorator(StepDecorator, CardDecoratorInjector):
15
- """
16
- This decorator is used to run NIM containers in Metaflow tasks as sidecars.
17
-
18
- User code call
19
- -----------
20
- @nim(
21
- models=['meta/llama3-8b-instruct', 'meta/llama3-70b-instruct'],
22
- backend='managed'
23
- )
24
-
25
- Valid backend options
26
- ---------------------
27
- - 'managed': Outerbounds selects a compute provider based on the model.
28
-
29
- Valid model options
30
- ----------------
31
- - 'meta/llama3-8b-instruct': 8B parameter model
32
- - 'meta/llama3-70b-instruct': 70B parameter model
33
- - any model here: https://nvcf.ngc.nvidia.com/functions?filter=nvidia-functions
34
-
35
- Parameters
36
- ----------
37
- models: list[NIM]
38
- List of NIM containers running models in sidecars.
39
- backend: str
40
- Compute provider to run the NIM container.
41
- queue_timeout : int
42
- Time to keep the job in NVCF's queue.
43
- """
44
-
45
13
  name = "nim"
14
+
46
15
  defaults = {
47
16
  "models": [],
48
- "backend": "managed",
49
17
  "monitor": True,
50
18
  "persist_db": False,
51
- "queue_timeout": 5 * 24 * 3600, # Default 5 days in seconds
52
19
  }
53
20
 
54
- def step_init(
55
- self, flow, graph, step_name, decorators, environment, flow_datastore, logger
56
- ):
57
-
21
+ # Refer https://github.com/Netflix/metaflow/blob/master/docs/lifecycle.png
22
+ # to understand where these functions are invoked in the lifecycle of a
23
+ # Metaflow flow.
24
+ def step_init(self, flow, graph, step, decos, environment, flow_datastore, logger):
58
25
  if self.attributes["monitor"]:
59
26
  self.attach_card_decorator(
60
27
  flow,
61
- step_name,
28
+ step,
62
29
  NimMetricsRefresher.CARD_ID,
63
30
  "blank",
64
31
  refresh_interval=4.0,
@@ -68,11 +35,9 @@ class NimDecorator(StepDecorator, CardDecoratorInjector):
68
35
  {
69
36
  "nim": NimManager(
70
37
  models=self.attributes["models"],
71
- backend=self.attributes["backend"],
72
38
  flow=flow,
73
- step_name=step_name,
39
+ step_name=step,
74
40
  monitor=self.attributes["monitor"],
75
- queue_timeout=self.attributes["queue_timeout"],
76
41
  )
77
42
  }
78
43
  )
@@ -81,15 +46,14 @@ class NimDecorator(StepDecorator, CardDecoratorInjector):
81
46
  self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
82
47
  ):
83
48
  if self.attributes["monitor"]:
84
-
85
49
  import sqlite3
86
- from metaflow import current
87
50
 
88
51
  file_path = get_storage_path(current.task_id)
89
52
  if os.path.exists(file_path):
90
53
  os.remove(file_path)
91
54
  os.makedirs(NIM_MONITOR_LOCAL_STORAGE_ROOT, exist_ok=True)
92
55
  conn = sqlite3.connect(file_path)
56
+
93
57
  cursor = conn.cursor()
94
58
  cursor.execute(
95
59
  """
@@ -1,47 +1,163 @@
1
- import os, sys, time, json, random, requests, sqlite3
2
- from urllib.parse import urlparse
3
- from metaflow.metaflow_config import SERVICE_URL
4
- from metaflow.metaflow_config_funcs import init_config
5
- from .utilities import get_storage_path
6
- from ..nvcf.nvcf import retry_on_status
1
+ import sys
2
+ import time
3
+ import requests
4
+ import sqlite3
5
+ from urllib3.util.retry import Retry
6
+ from requests.adapters import HTTPAdapter
7
+ from typing import Dict, Optional, Any
8
+ from .utils import get_ngc_response, get_storage_path
9
+
10
+
11
+ def nvcf_submit_helper(
12
+ url: str,
13
+ payload: Dict[str, Any],
14
+ headers: Optional[Dict[str, str]] = None,
15
+ timeout: int = 30,
16
+ max_retries: int = 300,
17
+ backoff_factor: float = 0.3,
18
+ request_delay: float = 1.1,
19
+ log_callback: Optional[callable] = None,
20
+ ) -> Dict[str, Any]:
21
+ def _log_error(start_time: float, status_code: int, poll_count: int):
22
+ if log_callback:
23
+ end_time = time.time()
24
+ try:
25
+ log_callback({}, end_time - start_time, status_code, poll_count)
26
+ except Exception as log_error:
27
+ print(f"Warning: Logging callback failed: {log_error}")
28
+
29
+ # use default headers
30
+ if not headers:
31
+ headers = {"accept": "application/json", "content-type": "application/json"}
32
+ print(f"Using Default Headers: {headers}")
33
+
34
+ # Configure session with retry strategy
35
+ session = requests.Session()
36
+ status_forcelist = [429, 500, 502, 503, 504, 404]
37
+ retry_strategy = Retry(
38
+ total=max_retries,
39
+ backoff_factor=backoff_factor,
40
+ status_forcelist=status_forcelist,
41
+ allowed_methods=["GET", "POST"],
42
+ )
43
+ adapter = HTTPAdapter(max_retries=retry_strategy)
44
+ session.mount("http://", adapter)
45
+ session.mount("https://", adapter)
46
+
47
+ # Add artificial delay if specified
48
+ time.sleep(request_delay)
49
+
50
+ start_time = time.time()
51
+ poll_count = 0
52
+ status_code = 0
53
+ response_data = {}
54
+
55
+ try:
56
+ # Make initial request
57
+ response = session.post(url, json=payload, headers=headers, timeout=timeout)
58
+ time.sleep(request_delay)
59
+
60
+ # Handle initial response
61
+ response.raise_for_status()
62
+ request_id = response.headers.get("NVCF-REQID")
63
+ polling_url = f"https://api.nvcf.nvidia.com/v2/nvcf/pexec/status/{request_id}"
64
+
65
+ print(f"Polling NVCF Request ID: {request_id}")
66
+
67
+ # Initial response status
68
+ status_code = response.status_code
69
+ print(f"Initial response status: {status_code}")
70
+
71
+ # Create a variable to store the final response
72
+ final_response = response
73
+
74
+ # Continue polling while we get 202 (Accepted/Processing)
75
+ while status_code == 202:
76
+ poll_count += 1
77
+ print(f"Polling attempt #{poll_count} to {polling_url}")
78
+
79
+ # Wait before next poll
80
+ time.sleep(request_delay)
81
+
82
+ # Make a new poll request
83
+ poll_response = session.get(polling_url, headers=headers, timeout=timeout)
84
+ status_code = poll_response.status_code
85
+ print(f"Poll #{poll_count} status: {status_code}")
86
+
87
+ # Check for errors
88
+ try:
89
+ poll_response.raise_for_status()
90
+ except requests.exceptions.HTTPError as e:
91
+ print(f"Poll request failed: {str(e)}")
92
+ poll_response.close()
93
+ # Log the error before re-raising
94
+ _log_error(start_time, poll_response.status_code, poll_count)
95
+ raise
96
+
97
+ # If status is 200, the job is complete
98
+ if status_code == 200:
99
+ print("Polling complete - job finished successfully")
100
+ # Update our final response to be this poll response
101
+ final_response = poll_response
102
+ break
103
+
104
+ # Close this poll response if we're going to loop again
105
+ if status_code == 202:
106
+ poll_response.close()
107
+
108
+ # If we exited the loop without a 200 status, something went wrong
109
+ if status_code != 200:
110
+ print(f"Polling ended with unexpected status: {status_code}")
111
+ # Log the error before raising
112
+ _log_error(start_time, status_code, poll_count)
113
+ raise Exception(f"Unexpected status code after polling: {status_code}")
114
+
115
+ # Get the response data for logging
116
+ response_data = final_response.json()
117
+
118
+ except requests.exceptions.HTTPError as e:
119
+ # Handle HTTP errors (4xx, 5xx status codes)
120
+ status_code = e.response.status_code if e.response else 0
121
+ print(f"HTTP Error: {str(e)}", file=sys.stderr)
122
+ # Log the error
123
+ _log_error(start_time, status_code, poll_count)
124
+ raise
125
+
126
+ except Exception as e:
127
+ # Handle other errors (connection errors, timeouts, etc.)
128
+ print(f"Request Error: {str(e)}", file=sys.stderr)
129
+ # Log the error with status_code 0 to indicate non-HTTP error
130
+ _log_error(start_time, 0, poll_count)
131
+ raise
132
+
133
+ # Calculate final duration and log successful requests
134
+ end_time = time.time()
135
+ duration = end_time - start_time
136
+
137
+ # Call the logging callback if provided
138
+ if log_callback:
139
+ try:
140
+ log_callback(response_data, duration, status_code, poll_count)
141
+ except Exception as e:
142
+ print(f"Warning: Logging callback failed: {e}")
7
143
 
144
+ # Log metrics
145
+ print(
146
+ f"Request completed: duration={duration:.2f}s, polls={poll_count}, "
147
+ f"status={status_code}, size={len(final_response.content)} bytes"
148
+ )
8
149
 
9
- NVCF_URL = "https://api.nvcf.nvidia.com"
10
- NVCF_SUBMIT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/functions"
11
- NVCF_RESULT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/status"
12
- NVCF_POLL_INTERVAL_SECONDS = 1
13
- COMMON_HEADERS = {
14
- "accept": "application/json",
15
- "Content-Type": "application/json",
16
- "nvcf-feature-enable-gateway-timeout": "true",
17
- }
150
+ return response_data
18
151
 
19
152
 
20
153
  class NimMetadata(object):
21
154
  def __init__(self):
22
155
  self._nvcf_chat_completion_models = []
23
- self._coreweave_chat_completion_models = []
24
-
25
- conf = init_config()
26
-
27
- if "OBP_AUTH_SERVER" in conf:
28
- auth_host = conf["OBP_AUTH_SERVER"]
29
- else:
30
- auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]
156
+ ngc_response = get_ngc_response()
31
157
 
32
- nim_info_url = "https://" + auth_host + "/generate/nim"
158
+ self.ngc_api_key = ngc_response["nvcf"]["api_key"]
33
159
 
34
- if "METAFLOW_SERVICE_AUTH_KEY" in conf:
35
- headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
36
- res = requests.get(nim_info_url, headers=headers)
37
- else:
38
- headers = json.loads(os.environ.get("METAFLOW_SERVICE_HEADERS"))
39
- res = requests.get(nim_info_url, headers=headers)
40
-
41
- res.raise_for_status()
42
- self._ngc_api_key = res.json()["nvcf"]["api_key"]
43
-
44
- for model in res.json()["nvcf"]["functions"]:
160
+ for model in ngc_response["nvcf"]["functions"]:
45
161
  self._nvcf_chat_completion_models.append(
46
162
  {
47
163
  "name": model["model_key"],
@@ -49,64 +165,48 @@ class NimMetadata(object):
49
165
  "version-id": model["version"],
50
166
  }
51
167
  )
52
- for model in res.json()["coreweave"]["containers"]:
53
- self._coreweave_chat_completion_models.append(
54
- {"name": model["nim_name"], "ip-address": model["ip_addr"]}
55
- )
56
168
 
57
169
  def get_nvcf_chat_completion_models(self):
58
170
  return self._nvcf_chat_completion_models
59
171
 
60
172
  def get_headers_for_nvcf_request(self):
61
- return {**COMMON_HEADERS, "Authorization": f"Bearer {self._ngc_api_key}"}
173
+ return {
174
+ "accept": "application/json",
175
+ "content-type": "application/json",
176
+ "Authorization": f"Bearer {self.ngc_api_key}",
177
+ "NVCF-POLL-SECONDS": "5",
178
+ }
62
179
 
63
180
 
64
181
  class NimManager(object):
65
- def __init__(self, models, backend, flow, step_name, monitor, queue_timeout):
66
-
182
+ def __init__(self, models, flow, step_name, monitor):
67
183
  nim_metadata = NimMetadata()
68
- if backend == "managed":
69
- nvcf_models = [
70
- m["name"] for m in nim_metadata.get_nvcf_chat_completion_models()
71
- ]
72
-
73
- self.models = {}
74
- for m in models:
75
- if m in nvcf_models:
76
- self.models[m] = NimChatCompletion(
77
- model=m,
78
- provider="NVCF",
79
- nim_metadata=nim_metadata,
80
- monitor=monitor,
81
- queue_timeout=queue_timeout,
82
- )
83
- else:
84
- raise ValueError(
85
- f"Model {m} not supported by the Outerbounds @nim offering."
86
- f"\nYou can choose from these options: {nvcf_models}\n\n"
87
- "Reach out to Outerbounds if there are other models you'd like supported."
88
- )
89
- else:
90
- raise ValueError(
91
- f"Backend {backend} not supported by the Outerbounds @nim offering. Please reach out to Outerbounds."
92
- )
93
-
94
-
95
- class JobStatus(object):
96
- SUBMITTED = "SUBMITTED"
97
- RUNNING = "RUNNING"
98
- SUCCESSFUL = "SUCCESSFUL"
99
- FAILED = "FAILED"
184
+ nvcf_models = [
185
+ m["name"] for m in nim_metadata.get_nvcf_chat_completion_models()
186
+ ]
187
+ self.models = {}
188
+
189
+ for each_model in models:
190
+ if each_model in nvcf_models:
191
+ self.models[each_model] = NimChatCompletion(
192
+ model=each_model,
193
+ nim_metadata=nim_metadata,
194
+ monitor=monitor,
195
+ )
196
+ else:
197
+ raise ValueError(
198
+ f"Model {each_model} not supported by the Outerbounds @nim offering."
199
+ f"\nYou can choose from these options: {nvcf_models}\n\n"
200
+ "Reach out to Outerbounds if there are other models you'd like supported."
201
+ )
100
202
 
101
203
 
102
204
  class NimChatCompletion(object):
103
205
  def __init__(
104
206
  self,
105
- model="meta/llama3-8b-instruct",
106
- provider="NVCF",
107
- nim_metadata=None,
108
- monitor=False,
109
- queue_timeout=None,
207
+ model: str = "meta/llama3-8b-instruct",
208
+ nim_metadata: NimMetadata = None,
209
+ monitor: bool = False,
110
210
  **kwargs,
111
211
  ):
112
212
  if nim_metadata is None:
@@ -114,79 +214,54 @@ class NimChatCompletion(object):
114
214
  "NimMetadata object is required to initialize NimChatCompletion object."
115
215
  )
116
216
 
117
- self._nim_metadata = nim_metadata
118
- self.compute_provider = provider
119
- self.invocations = []
120
- self.max_request_retries = int(
121
- os.environ.get("METAFLOW_EXT_HTTP_MAX_RETRIES", "10")
122
- )
217
+ self.model_name = model
218
+ self.nim_metadata = nim_metadata
123
219
  self.monitor = monitor
124
220
 
125
- if self.compute_provider == "NVCF":
126
- nvcf_model_names = [
127
- m["name"] for m in self._nim_metadata.get_nvcf_chat_completion_models()
128
- ]
129
- self.model = model
130
- self.function_id = self._nim_metadata.get_nvcf_chat_completion_models()[
131
- nvcf_model_names.index(model)
132
- ]["function-id"]
133
- self.version_id = self._nim_metadata.get_nvcf_chat_completion_models()[
134
- nvcf_model_names.index(model)
135
- ]["version-id"]
136
- else:
221
+ all_nvcf_models = self.nim_metadata.get_nvcf_chat_completion_models()
222
+ all_nvcf_model_names = [m["name"] for m in all_nvcf_models]
223
+
224
+ if self.model_name not in all_nvcf_model_names:
137
225
  raise ValueError(
138
- f"Backend compute provider {self.compute_provider} not yet supported for @nim."
226
+ f"Model {self.model_name} not found in available NVCF models"
139
227
  )
140
228
 
141
- # to know whether to set file_name
229
+ self.model = all_nvcf_models[all_nvcf_model_names.index(self.model_name)]
230
+ self.function_id = self.model["function-id"]
231
+ self.version_id = self.model["version-id"]
232
+
142
233
  self.first_request = True
143
234
 
144
- # TODO (Eddie) - this may make more sense in a base class.
145
- # @nim arch needs redesign if customers start using it in more creative ways.
146
- self._poll_seconds = "3600"
147
- self._queue_timeout = queue_timeout
148
- self._status = None
149
- self._result = {}
150
-
151
- @property
152
- def status(self):
153
- return self._status
154
-
155
- @property
156
- def has_failed(self):
157
- return self._status == JobStatus.FAILED
158
-
159
- @property
160
- def is_running(self):
161
- return self._status == JobStatus.SUBMITTED
162
-
163
- @property
164
- def result(self):
165
- return self._result
166
-
167
- def _log_stats(self, response, e2e_time):
168
- stats = {}
169
- if response.status_code == 200:
170
- stats["success"] = 1
171
- stats["error"] = 0
235
+ def log_stats(self, response_data, duration, status_code, poll_count):
236
+ if not self.monitor:
237
+ return
238
+
239
+ stats = {
240
+ "status_code": status_code,
241
+ "success": 1 if status_code == 200 else 0,
242
+ "error": 0 if status_code == 200 else 1,
243
+ "e2e_time": duration,
244
+ "model": self.model_name,
245
+ "poll_count": poll_count,
246
+ }
247
+
248
+ if status_code == 200 and response_data:
249
+ try:
250
+ stats["prompt_tokens"] = response_data["usage"]["prompt_tokens"]
251
+ except (KeyError, TypeError):
252
+ stats["prompt_tokens"] = None
253
+
254
+ try:
255
+ stats["completion_tokens"] = response_data["usage"]["completion_tokens"]
256
+ except (KeyError, TypeError):
257
+ stats["completion_tokens"] = None
172
258
  else:
173
- stats["success"] = 0
174
- stats["error"] = 1
175
- stats["status_code"] = response.status_code
176
- try:
177
- stats["prompt_tokens"] = response.json()["usage"]["prompt_tokens"]
178
- except KeyError:
179
259
  stats["prompt_tokens"] = None
180
- try:
181
- stats["completion_tokens"] = response.json()["usage"]["completion_tokens"]
182
- except KeyError:
183
260
  stats["completion_tokens"] = None
184
- stats["e2e_time"] = e2e_time
185
- stats["provider"] = self.compute_provider
186
- stats["model"] = self.model
187
261
 
188
262
  conn = sqlite3.connect(self.file_name)
189
263
  cursor = conn.cursor()
264
+
190
265
  try:
191
266
  cursor.execute(
192
267
  """
@@ -207,112 +282,37 @@ class NimChatCompletion(object):
207
282
  finally:
208
283
  conn.close()
209
284
 
210
- @retry_on_status(status_codes=[500], max_retries=3, delay=5)
211
- @retry_on_status(status_codes=[504])
212
285
  def __call__(self, **kwargs):
213
-
214
286
  if self.first_request:
215
- # Put here to guarantee self.file_name is set after task_id exists.
216
287
  from metaflow import current
217
288
 
218
289
  self.file_name = get_storage_path(current.task_id)
290
+ self.first_request = False
219
291
 
220
- request_data = {"model": self.model, **kwargs}
221
- request_url = f"{NVCF_SUBMIT_ENDPOINT}/{self.function_id}"
222
- retry_delay = 1
223
- attempts = 0
224
- t0 = time.time()
225
- while attempts < self.max_request_retries:
226
- try:
227
- attempts += 1
228
- response = requests.post(
229
- request_url,
230
- headers=self._nim_metadata.get_headers_for_nvcf_request(),
231
- json=request_data,
232
- )
233
- if response.status_code == 202:
234
- invocation_id = response.headers.get("NVCF-REQID")
235
- self.invocations.append(invocation_id)
236
- self._status = JobStatus.SUBMITTED
237
- elif response.status_code == 200:
238
- tf = time.time()
239
- if self.monitor:
240
- self._log_stats(response, tf - t0)
241
- self._status = JobStatus.SUCCESSFUL
242
- self._result = response.json()
243
- return self._result
244
- elif response.status_code == 400:
245
- self._status = JobStatus.FAILED
246
- msg = (
247
- "[@nim ERROR] The OpenAI-compatible returned a 400 status code. "
248
- + "Known causes include improper requests or prompts with too many tokens for the selected model. "
249
- + "Please contact Outerbounds if you need assistance resolving the issue."
250
- )
251
- print(msg, file=sys.stderr)
252
- self._result = {"ERROR": msg}
253
- return self._result
254
- except (
255
- requests.exceptions.ConnectionError,
256
- requests.exceptions.ReadTimeout,
257
- ) as e:
258
- # ConnectionErrors are generally temporary errors like DNS resolution failures,
259
- # timeouts etc.
260
- print(
261
- "received error of type {}. Retrying...".format(type(e)),
262
- e,
263
- file=sys.stderr,
264
- )
265
- time.sleep(retry_delay)
266
- retry_delay *= 2 # Double the delay for the next attempt
267
- retry_delay += random.uniform(0, 1) # Add jitter
268
- retry_delay = min(retry_delay, 10)
269
-
270
- def _poll():
271
- poll_request_url = f"{NVCF_RESULT_ENDPOINT}/{invocation_id}"
272
- attempts = 0
273
- retry_delay = 1
274
- while attempts < self.max_request_retries:
275
- try:
276
- attempts += 1
277
- poll_response = requests.get(
278
- poll_request_url,
279
- headers=self._nim_metadata.get_headers_for_nvcf_request(),
280
- )
281
- if poll_response.status_code == 200:
282
- tf = time.time()
283
- self._log_stats(response, tf - t0)
284
- self._status = JobStatus.SUCCESSFUL
285
- self._result = poll_response.json()
286
- return self._result
287
- elif poll_response.status_code == 202:
288
- self._status = JobStatus.SUBMITTED
289
- return 202
290
- elif poll_response.status_code == 400:
291
- self._status = JobStatus.FAILED
292
- msg = (
293
- "[@nim ERROR] The OpenAI-compatible API returned a 400 status code. "
294
- + "Known causes include improper requests or prompts with too many tokens for the selected model. "
295
- + "Please contact Outerbounds if you need assistance resolving the issue."
296
- )
297
- print(msg, file=sys.stderr)
298
- self._result = {"@nim ERROR": msg}
299
- return self._result
300
- except (
301
- requests.exceptions.ConnectionError,
302
- requests.exceptions.ReadTimeout,
303
- ) as e:
304
- print(
305
- "received error of type {}. Retrying...".format(type(e)),
306
- e,
307
- file=sys.stderr,
308
- )
309
- time.sleep(retry_delay)
310
- retry_delay *= 2 # Double the delay for the next attempt
311
- retry_delay += random.uniform(0, 1) # Add jitter
312
- retry_delay = min(retry_delay, 10)
313
-
314
- while True:
315
- data = _poll()
316
- if data and data != 202:
317
- return data
318
- time.sleep(NVCF_POLL_INTERVAL_SECONDS)
292
+ # Create log callback if monitoring is enabled
293
+ log_callback = self.log_stats if self.monitor else None
294
+
295
+ request_data = {"model": self.model_name, **kwargs}
296
+ request_url = (
297
+ f"https://api.nvcf.nvidia.com/v2/nvcf/pexec/functions/{self.function_id}"
298
+ )
299
+
300
+ try:
301
+ response_data = nvcf_submit_helper(
302
+ url=request_url,
303
+ payload=request_data,
304
+ headers=self.nim_metadata.get_headers_for_nvcf_request(),
305
+ log_callback=log_callback,
306
+ )
307
+
308
+ return response_data
309
+
310
+ except requests.exceptions.HTTPError as e:
311
+ error_msg = f"[@nim ERROR] NVCF API request failed: {str(e)}"
312
+ print(error_msg, file=sys.stderr)
313
+ raise
314
+
315
+ except Exception as e:
316
+ error_msg = f"[@nim ERROR] Unexpected error: {str(e)}"
317
+ print(error_msg, file=sys.stderr)
318
+ raise
@@ -0,0 +1,36 @@
1
+ import os
2
+ import json
3
+ import requests
4
+ from urllib.parse import urlparse
5
+ from metaflow.metaflow_config import SERVICE_URL
6
+ from metaflow.metaflow_config_funcs import init_config
7
+
8
+
9
+ NIM_MONITOR_LOCAL_STORAGE_ROOT = ".nim-monitor"
10
+
11
+
12
+ def get_storage_path(task_id):
13
+ return f"{NIM_MONITOR_LOCAL_STORAGE_ROOT}/" + task_id + ".sqlite"
14
+
15
+
16
+ def get_ngc_response():
17
+ conf = init_config()
18
+ if "OBP_AUTH_SERVER" in conf:
19
+ auth_host = conf["OBP_AUTH_SERVER"]
20
+ else:
21
+ auth_host = "auth." + urlparse(SERVICE_URL).hostname.split(".", 1)[1]
22
+
23
+ # NOTE: reusing the same auth_host as the one used in NimMetadata,
24
+ # however, user should not need to use nim container to use @nvct.
25
+ # May want to refactor this to a common endpoint.
26
+ nim_info_url = "https://" + auth_host + "/generate/nim"
27
+
28
+ if "METAFLOW_SERVICE_AUTH_KEY" in conf:
29
+ headers = {"x-api-key": conf["METAFLOW_SERVICE_AUTH_KEY"]}
30
+ res = requests.get(nim_info_url, headers=headers)
31
+ else:
32
+ headers = json.loads(os.environ.get("METAFLOW_SERVICE_HEADERS"))
33
+ res = requests.get(nim_info_url, headers=headers)
34
+
35
+ res.raise_for_status()
36
+ return res.json()
@@ -1,13 +1,13 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.155
3
+ Version: 1.1.156
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial
7
7
  Description-Content-Type: text/markdown
8
8
  Requires-Dist: boto3
9
9
  Requires-Dist: kubernetes
10
- Requires-Dist: ob-metaflow (==2.15.13.1)
10
+ Requires-Dist: ob-metaflow (==2.15.14.1)
11
11
 
12
12
  # Outerbounds platform package
13
13
 
@@ -1,7 +1,7 @@
1
1
  metaflow_extensions/outerbounds/__init__.py,sha256=Gb8u06s9ClQsA_vzxmkCzuMnigPy7kKcDnLfb7eB-64,514
2
2
  metaflow_extensions/outerbounds/remote_config.py,sha256=pEFJuKDYs98eoB_-ryPjVi9b_c4gpHMdBHE14ltoxIU,4672
3
3
  metaflow_extensions/outerbounds/config/__init__.py,sha256=JsQGRuGFz28fQWjUvxUgR8EKBLGRdLUIk_buPLJplJY,1225
4
- metaflow_extensions/outerbounds/plugins/__init__.py,sha256=eHcM_t2Mzlge7B9Dv3VGVM5x8qNZYdLyqBOAC6uRxec,13228
4
+ metaflow_extensions/outerbounds/plugins/__init__.py,sha256=gytuNt3lNabirHLEYzrmHFMp-JWh8dA2AZPK11HmaNw,13242
5
5
  metaflow_extensions/outerbounds/plugins/auth_server.py,sha256=_Q9_2EL0Xy77bCRphkwT1aSu8gQXRDOH-Z-RxTUO8N4,2202
6
6
  metaflow_extensions/outerbounds/plugins/perimeters.py,sha256=QXh3SFP7GQbS-RAIxUOPbhPzQ7KDFVxZkTdKqFKgXjI,2697
7
7
  metaflow_extensions/outerbounds/plugins/apps/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
@@ -23,10 +23,10 @@ metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_cli.py,sha256=kq
23
23
  metaflow_extensions/outerbounds/plugins/fast_bakery/fast_bakery_decorator.py,sha256=MXSIp05-jvt8Q2uGaLKjtuM_ToLeRLxhtMbfHc9Kcko,1515
24
24
  metaflow_extensions/outerbounds/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
25
25
  metaflow_extensions/outerbounds/plugins/kubernetes/kubernetes_client.py,sha256=fx_XUkgR4r6hF2ilDfT5LubRyVrYMVIv5f6clHkCaEk,5988
26
- metaflow_extensions/outerbounds/plugins/nim/__init__.py,sha256=MEdX6TPdY9POflCiaYbVmwT-nUNeqwregZBzBZ5CNz0,4487
27
- metaflow_extensions/outerbounds/plugins/nim/card.py,sha256=EM6QtevpJmXpeCesKDk2L6ts6M2qLSYUbajaEEU_yys,4794
28
- metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=fCFdRuvagzgPSVQfhy5bzbxfVBeO562h-cDz6CCLQLw,12118
29
- metaflow_extensions/outerbounds/plugins/nim/utilities.py,sha256=jSdNP3tSCrDjxD2E9bIzxVqDDu6S14femlxSjsMv57o,151
26
+ metaflow_extensions/outerbounds/plugins/nim/card.py,sha256=dXOJvsZed5NyYyxYLPDvtwg9z_X4azL9HTJGYaiNriY,4690
27
+ metaflow_extensions/outerbounds/plugins/nim/nim_decorator.py,sha256=50YVvC7mcZYlPluM0Wq1UtufhzlQb-RxzZkTOJJ3LkM,3439
28
+ metaflow_extensions/outerbounds/plugins/nim/nim_manager.py,sha256=5YkohM-vfoDHPUMWb19sY0HErORoKOKf4jexERJTO80,10912
29
+ metaflow_extensions/outerbounds/plugins/nim/utils.py,sha256=nU-v1sheBjmITXfHiJx2ucm_Tq_nGb5BcuAm5c235cQ,1164
30
30
  metaflow_extensions/outerbounds/plugins/nvcf/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
31
31
  metaflow_extensions/outerbounds/plugins/nvcf/constants.py,sha256=aGHdNw_hqBu8i0zWXcatQM6e769wUXox0l8g0f6fNZ8,146
32
32
  metaflow_extensions/outerbounds/plugins/nvcf/exceptions.py,sha256=-Pm9cOWUzpv94TvVUeq-FenAWdfLBJd5N7WPqIGZVqU,3671
@@ -68,7 +68,7 @@ metaflow_extensions/outerbounds/toplevel/plugins/gcp/__init__.py,sha256=BbZiaH3u
68
68
  metaflow_extensions/outerbounds/toplevel/plugins/kubernetes/__init__.py,sha256=5zG8gShSj8m7rgF4xgWBZFuY3GDP5n1T0ktjRpGJLHA,69
69
69
  metaflow_extensions/outerbounds/toplevel/plugins/ollama/__init__.py,sha256=GRSz2zwqkvlmFS6bcfYD_CX6CMko9DHQokMaH1iBshA,47
70
70
  metaflow_extensions/outerbounds/toplevel/plugins/snowflake/__init__.py,sha256=LptpH-ziXHrednMYUjIaosS1SXD3sOtF_9_eRqd8SJw,50
71
- ob_metaflow_extensions-1.1.155.dist-info/METADATA,sha256=1SCxq4RN5keRpny8BRcQn9FJ71aB58cUeIkk2yytPI8,521
72
- ob_metaflow_extensions-1.1.155.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
73
- ob_metaflow_extensions-1.1.155.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
74
- ob_metaflow_extensions-1.1.155.dist-info/RECORD,,
71
+ ob_metaflow_extensions-1.1.156.dist-info/METADATA,sha256=G9c19j9g0v8dDQU5sP5Zaaub2fot__EMCJ6iBQBb4Qo,521
72
+ ob_metaflow_extensions-1.1.156.dist-info/WHEEL,sha256=bb2Ot9scclHKMOLDEHY6B2sicWOgugjFKaJsT7vwMQo,110
73
+ ob_metaflow_extensions-1.1.156.dist-info/top_level.txt,sha256=NwG0ukwjygtanDETyp_BUdtYtqIA_lOjzFFh1TsnxvI,20
74
+ ob_metaflow_extensions-1.1.156.dist-info/RECORD,,
@@ -1,5 +0,0 @@
1
- NIM_MONITOR_LOCAL_STORAGE_ROOT = ".nim-monitor"
2
-
3
-
4
- def get_storage_path(task_id):
5
- return f"{NIM_MONITOR_LOCAL_STORAGE_ROOT}/" + task_id + ".sqlite"