ob-metaflow-extensions 1.1.128__py2.py3-none-any.whl → 1.1.130__py2.py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -1,10 +1,17 @@
1
1
  from functools import partial
2
- from metaflow.decorators import FlowDecorator
2
+ from uuid import uuid4
3
+ import os, time
4
+ from metaflow.decorators import StepDecorator
3
5
  from metaflow import current
6
+
4
7
  from .nim_manager import NimManager
8
+ from .card import NimMetricsRefresher
9
+ from .utilities import get_storage_path, NIM_MONITOR_LOCAL_STORAGE_ROOT
10
+ from ..card_utilities.async_cards import AsyncPeriodicRefresher
11
+ from ..card_utilities.injector import CardDecoratorInjector
5
12
 
6
13
 
7
- class NimDecorator(FlowDecorator):
14
+ class NimDecorator(StepDecorator, CardDecoratorInjector):
8
15
  """
9
16
  This decorator is used to run NIM containers in Metaflow tasks as sidecars.
10
17
 
@@ -18,13 +25,12 @@ class NimDecorator(FlowDecorator):
18
25
  Valid backend options
19
26
  ---------------------
20
27
  - 'managed': Outerbounds selects a compute provider based on the model.
21
- - 🚧 'dataplane': Run in your account.
22
28
 
23
29
  Valid model options
24
30
  ----------------
25
31
  - 'meta/llama3-8b-instruct': 8B parameter model
26
32
  - 'meta/llama3-70b-instruct': 70B parameter model
27
- - Upon request, any model here: https://nvcf.ngc.nvidia.com/functions?filter=nvidia-functions
33
+ - any model here: https://nvcf.ngc.nvidia.com/functions?filter=nvidia-functions
28
34
 
29
35
  Parameters
30
36
  ----------
@@ -32,21 +38,100 @@ class NimDecorator(FlowDecorator):
32
38
  List of NIM containers running models in sidecars.
33
39
  backend: str
34
40
  Compute provider to run the NIM container.
41
+ queue_timeout : int
42
+ Time, in seconds, to keep the job in NVCF's queue (defaults to 5 days).
35
43
  """
36
44
 
37
45
  name = "nim"
38
46
  defaults = {
39
47
  "models": [],
40
48
  "backend": "managed",
49
+ "monitor": True,
50
+ "persist_db": False,
51
+ "queue_timeout": 5 * 24 * 3600, # Default 5 days in seconds
41
52
  }
42
53
 
43
- def flow_init(
44
- self, flow, graph, environment, flow_datastore, metadata, logger, echo, options
54
def step_init(
    self, flow, graph, step_name, decorators, environment, flow_datastore, logger
):
    """Prepare the step: optionally attach the metrics card, then expose the manager.

    When the ``monitor`` attribute is truthy, a "blank" card identified by
    ``NimMetricsRefresher.CARD_ID`` is attached to the step. In every case a
    ``NimManager`` built from this decorator's attributes is registered under
    the ``"nim"`` key through ``current._update_env``.
    """
    attrs = self.attributes
    monitoring_enabled = attrs["monitor"]

    if monitoring_enabled:
        self.attach_card_decorator(
            flow,
            step_name,
            NimMetricsRefresher.CARD_ID,
            "blank",
            refresh_interval=4.0,
        )

    manager = NimManager(
        models=attrs["models"],
        backend=attrs["backend"],
        flow=flow,
        step_name=step_name,
        monitor=monitoring_enabled,
        queue_timeout=attrs["queue_timeout"],
    )
    current._update_env({"nim": manager})
79
+
80
def task_decorate(
    self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
):
    """Wrap the step function so NIM metrics are collected while it runs.

    When the ``monitor`` attribute is falsy, ``step_func`` is returned
    unchanged. Otherwise a fresh SQLite file for the current task is created
    with an empty ``metrics`` table, and the returned wrapper runs
    ``step_func`` while an ``AsyncPeriodicRefresher`` is active.
    """
    if not self.attributes["monitor"]:
        return step_func

    import sqlite3
    from contextlib import closing
    from metaflow import current

    file_path = get_storage_path(current.task_id)

    # Start from a clean database; EAFP avoids the exists()/remove() race.
    try:
        os.remove(file_path)
    except FileNotFoundError:
        pass
    os.makedirs(NIM_MONITOR_LOCAL_STORAGE_ROOT, exist_ok=True)

    # Close the setup connection as soon as the schema exists so the handle
    # is not leaked for the lifetime of the task.
    with closing(sqlite3.connect(file_path)) as conn:
        conn.execute(
            """
            CREATE TABLE metrics (
                error INTEGER,
                success INTEGER,
                status_code INTEGER,
                prompt_tokens INTEGER,
                completion_tokens INTEGER,
                e2e_time NUMERIC,
                model TEXT
            )
            """
        )
        conn.commit()

    def _wrapped_step_func(*args, **kwargs):
        async_refresher_metrics = AsyncPeriodicRefresher(
            NimMetricsRefresher(),
            updater_interval=4.0,
            collector_interval=2.0,
            file_name=file_path,
        )
        try:
            async_refresher_metrics.start()
            return step_func(*args, **kwargs)
        finally:
            time.sleep(5.0)  # buffer for the last update to synchronize
            async_refresher_metrics.stop()

    return _wrapped_step_func
125
+
126
def task_post_step(
    self, step_name, flow, graph, retry_count, max_user_code_retries
):
    """Delete this task's metrics database unless ``persist_db`` is set.

    The task's SQLite file is removed if present, and the storage root is
    removed only when it is empty. Both steps tolerate the file or the
    directory being absent, so the hook does not fail when no database was
    ever created for this task.
    """
    if self.attributes["persist_db"]:
        return

    try:
        os.remove(get_storage_path(current.task_id))
    except FileNotFoundError:
        pass

    # If this task is the last one, delete the storage root as well.
    # os.rmdir only succeeds on an empty directory, so files written
    # concurrently by other tasks are never removed; a missing or non-empty
    # directory raises OSError, which is expected here.
    try:
        os.rmdir(NIM_MONITOR_LOCAL_STORAGE_ROOT)
    except OSError:
        pass
@@ -0,0 +1,154 @@
1
+ import os, sqlite3
2
+ from metaflow.cards import (
3
+ Markdown,
4
+ Table,
5
+ ProgressBar,
6
+ )
7
+ from metaflow.decorators import StepDecorator
8
+ from metaflow.metaflow_current import current
9
+
10
+ from .utilities import get_storage_path
11
+ from ..card_utilities.async_cards import CardRefresher
12
+ from ..card_utilities.extra_components import BarPlot, ViolinPlot
13
+
14
+
15
def json_to_artifact_table(data):
    """Wrap ``data`` in an ``ArtifactTable`` and return it.

    NOTE(review): ``ArtifactTable`` is not among the names imported at the top
    of this module, so calling this function looks like it would raise
    ``NameError`` unless the name is provided some other way -- confirm and
    add the missing import.
    """
    return ArtifactTable(data)
17
+
18
+
19
class NimMetricsRefresher(CardRefresher):
    """Card refresher that charts NIM request metrics stored in a task-local SQLite file."""

    CARD_ID = "nim_metrics"

    def __init__(self) -> None:
        # Chart components keyed by name; empty until the first full render.
        self._metrics_charts = {}
        self._last_updated_on = None
        self._already_rendered = False
        # Path of the SQLite file holding the current task's metrics.
        self._file_name = get_storage_path(current.task_id)

    def sqlite_fetch_func(self, conn):
        """Read every row of the ``metrics`` table and aggregate it.

        Parameters
        ----------
        conn
            Accepted for interface compatibility but not used: the data is
            read through a dedicated connection to ``self._file_name``, and
            the caller's connection is never touched or closed.

        Returns
        -------
        dict
            ``error`` and ``success`` are summed counters; ``status_code``,
            ``prompt_tokens``, ``completion_tokens``, ``e2e_time`` and
            ``model`` are lists with one entry per row.
        """
        own_conn = sqlite3.connect(self._file_name)
        try:
            rows = own_conn.execute(
                "SELECT error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model FROM metrics"
            ).fetchall()
        finally:
            own_conn.close()

        data = {
            "error": 0,
            "success": 0,
            "status_code": [],
            "prompt_tokens": [],
            "completion_tokens": [],
            "e2e_time": [],
            "model": [],
        }
        for (
            error,
            success,
            status_code,
            prompt_tokens,
            completion_tokens,
            e2e_time,
            model,
        ) in rows:
            data["error"] += error
            data["success"] += success
            data["status_code"].append(status_code)
            data["prompt_tokens"].append(prompt_tokens)
            data["completion_tokens"].append(completion_tokens)
            data["e2e_time"].append(e2e_time)
            data["model"].append(model)
        return data

    def render_card_fresh(self, current_card, data):
        """Clear the card and lay out the (still empty) chart components."""
        self._already_rendered = True
        current_card.clear()
        current_card.append(Markdown("## Metrics"))

        self._metrics_charts["request_success"] = BarPlot(
            title="Request success",
            category_name="category",
            value_name="amount",
            orientation="horizontal",
        )
        self._metrics_charts["latency_distribution"] = ViolinPlot(
            title="Latency distribution (s)",
            category_col_name="model",
            value_col_name="e2e_time",
        )

        current_card.append(
            Table(
                data=[
                    [self._metrics_charts["request_success"]],
                    [self._metrics_charts["latency_distribution"]],
                ]
            )
        )
        current_card.refresh()

    def on_startup(self, current_card):
        """Show a placeholder until the first metrics arrive."""
        current_card.append(Markdown("# Task-level NIM API metrics"))
        current_card.append(
            Markdown(
                "_waiting for data to appear_",
            )
        )
        current_card.refresh()

    def on_error(self, current_card, error_message):
        """Display an error on the card unless charts have already been rendered.

        ``FileNotFoundError`` instances are ignored.
        """
        if isinstance(error_message, FileNotFoundError):
            return

        if not self._already_rendered:
            current_card.clear()
            current_card.append(
                Markdown(
                    f"## Error: {str(error_message)}",
                )
            )
            current_card.refresh()

    def update_only_components(self, current_card, data_object):
        """Push the latest metrics into the existing charts and refresh the card."""
        metrics = data_object["metrics"]

        # update request success data
        self._metrics_charts["request_success"].spec["data"][0]["values"] = [
            {
                "category": "Successful requests",
                "amount": metrics["success"],
            },
            {"category": "Errors", "amount": metrics["error"]},
        ]

        times = list(metrics["e2e_time"])
        latency_data = [
            {"model": m, "e2e_time": e} for m, e in zip(metrics["model"], times)
        ]

        # update latency data
        self._metrics_charts["latency_distribution"].spec["data"][0][
            "values"
        ] = latency_data

        # update domain for latency plot; skipped when there are no samples,
        # because min()/max() raise ValueError on an empty sequence.
        if times:
            min_time = min(times)
            max_time = max(times)
            for scale in self._metrics_charts["latency_distribution"].spec["scales"]:
                if scale["name"] == "xscale":
                    scale["domain"] = [
                        min_time - max_time * 0.1,
                        max_time + max_time * 0.1,
                    ]

        current_card.refresh()

    def on_update(self, current_card, data_object):
        """Render the card on the first non-empty update, then update charts in place.

        Updates that are empty, lack a ``"metrics"`` entry, or contain no
        recorded requests are ignored.
        """
        if not data_object:
            return
        if not self._metrics_charts:
            self.render_card_fresh(current_card, data_object)
            return
        metrics = data_object.get("metrics")
        if not metrics or not metrics.get("status_code"):
            return
        self.update_only_components(current_card, data_object)
@@ -1,19 +1,20 @@
1
- import os
2
- import time
3
- import json
4
- import requests
1
+ import os, sys, time, json, random, requests, sqlite3
5
2
  from urllib.parse import urlparse
6
3
  from metaflow.metaflow_config import SERVICE_URL
7
4
  from metaflow.metaflow_config_funcs import init_config
8
- import sys
9
- import random
5
+ from .utilities import get_storage_path
6
+ from ..nvcf.nvcf import retry_on_status
7
+
10
8
 
11
9
  NVCF_URL = "https://api.nvcf.nvidia.com"
12
10
  NVCF_SUBMIT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/functions"
13
11
  NVCF_RESULT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/status"
14
-
15
- COMMON_HEADERS = {"accept": "application/json", "Content-Type": "application/json"}
16
- POLL_INTERVAL = 1
12
+ NVCF_POLL_INTERVAL_SECONDS = 1
13
+ COMMON_HEADERS = {
14
+ "accept": "application/json",
15
+ "Content-Type": "application/json",
16
+ "nvcf-feature-enable-gateway-timeout": "true",
17
+ }
17
18
 
18
19
 
19
20
  class NimMetadata(object):
@@ -56,41 +57,33 @@ class NimMetadata(object):
56
57
  def get_nvcf_chat_completion_models(self):
57
58
  return self._nvcf_chat_completion_models
58
59
 
59
- def get_coreweave_chat_completion_models(self):
60
- return self._coreweave_chat_completion_models
61
-
62
60
  def get_headers_for_nvcf_request(self):
63
61
  return {**COMMON_HEADERS, "Authorization": f"Bearer {self._ngc_api_key}"}
64
62
 
65
- def get_headers_for_coreweave_request(self):
66
- return COMMON_HEADERS
67
-
68
63
 
69
64
  class NimManager(object):
70
- def __init__(self, models, backend):
65
+ def __init__(self, models, backend, flow, step_name, monitor, queue_timeout):
66
+
71
67
  nim_metadata = NimMetadata()
72
68
  if backend == "managed":
73
69
  nvcf_models = [
74
70
  m["name"] for m in nim_metadata.get_nvcf_chat_completion_models()
75
71
  ]
76
- cw_models = [
77
- m["name"] for m in nim_metadata.get_coreweave_chat_completion_models()
78
- ]
79
72
 
80
73
  self.models = {}
81
74
  for m in models:
82
75
  if m in nvcf_models:
83
76
  self.models[m] = NimChatCompletion(
84
- model=m, provider="NVCF", nim_metadata=nim_metadata
85
- )
86
- elif m in cw_models:
87
- self.models[m] = NimChatCompletion(
88
- model=m, provider="CoreWeave", nim_metadata=nim_metadata
77
+ model=m,
78
+ provider="NVCF",
79
+ nim_metadata=nim_metadata,
80
+ monitor=monitor,
81
+ queue_timeout=queue_timeout,
89
82
  )
90
83
  else:
91
84
  raise ValueError(
92
85
  f"Model {m} not supported by the Outerbounds @nim offering."
93
- f"\nYou can choose from these options: {nvcf_models + cw_models}\n\n"
86
+ f"\nYou can choose from these options: {nvcf_models}\n\n"
94
87
  "Reach out to Outerbounds if there are other models you'd like supported."
95
88
  )
96
89
  else:
@@ -99,12 +92,21 @@ class NimManager(object):
99
92
  )
100
93
 
101
94
 
95
class JobStatus:
    """Namespace of string constants naming the possible job states."""

    SUBMITTED, RUNNING, SUCCESSFUL, FAILED = (
        "SUBMITTED",
        "RUNNING",
        "SUCCESSFUL",
        "FAILED",
    )
100
+
101
+
102
102
  class NimChatCompletion(object):
103
103
  def __init__(
104
104
  self,
105
105
  model="meta/llama3-8b-instruct",
106
- provider="CoreWeave",
106
+ provider="NVCF",
107
107
  nim_metadata=None,
108
+ monitor=False,
109
+ queue_timeout=None,
108
110
  **kwargs,
109
111
  ):
110
112
  if nim_metadata is None:
@@ -118,19 +120,9 @@ class NimChatCompletion(object):
118
120
  self.max_request_retries = int(
119
121
  os.environ.get("METAFLOW_EXT_HTTP_MAX_RETRIES", "10")
120
122
  )
123
+ self.monitor = monitor
121
124
 
122
- if self.compute_provider == "CoreWeave":
123
- cw_model_names = [
124
- m["name"]
125
- for m in self._nim_metadata.get_coreweave_chat_completion_models()
126
- ]
127
- self.model = model
128
- self.ip_address = self._nim_metadata.get_coreweave_chat_completion_models()[
129
- cw_model_names.index(model)
130
- ]["ip-address"]
131
- self.endpoint = f"http://{self.ip_address}:8000/v1/chat/completions"
132
-
133
- elif self.compute_provider == "NVCF":
125
+ if self.compute_provider == "NVCF":
134
126
  nvcf_model_names = [
135
127
  m["name"] for m in self._nim_metadata.get_nvcf_chat_completion_models()
136
128
  ]
@@ -141,45 +133,174 @@ class NimChatCompletion(object):
141
133
  self.version_id = self._nim_metadata.get_nvcf_chat_completion_models()[
142
134
  nvcf_model_names.index(model)
143
135
  ]["version-id"]
136
+ else:
137
+ raise ValueError(
138
+ f"Backend compute provider {self.compute_provider} not yet supported for @nim."
139
+ )
144
140
 
145
- def __call__(self, **kwargs):
141
+ # to know whether to set file_name
142
+ self.first_request = True
143
+
144
+ # TODO (Eddie) - this may make more sense in a base class.
145
+ # @nim arch needs redesign if customers start using it in more creative ways.
146
+ self._poll_seconds = "3600"
147
+ self._queue_timeout = queue_timeout
148
+ self._status = None
149
+ self._result = {}
150
+
151
@property
def status(self):
    """Most recently recorded job status (``None`` until one is set)."""
    current_status = self._status
    return current_status
154
+
155
@property
def has_failed(self):
    """``True`` when the recorded status equals ``JobStatus.FAILED``."""
    failed_marker = JobStatus.FAILED
    return self._status == failed_marker
158
+
159
@property
def is_running(self):
    """``True`` while the job is in flight.

    Both ``JobStatus.SUBMITTED`` and ``JobStatus.RUNNING`` count as in flight,
    so the property stays correct if the ``RUNNING`` state is ever recorded.
    """
    return self._status in (JobStatus.SUBMITTED, JobStatus.RUNNING)
162
+
163
@property
def result(self):
    """Payload stored by the most recent request (an empty dict initially)."""
    latest = self._result
    return latest
166
+
167
+ def _log_stats(self, response, e2e_time):
168
+ stats = {}
169
+ if response.status_code == 200:
170
+ stats["success"] = 1
171
+ stats["error"] = 0
172
+ else:
173
+ stats["success"] = 0
174
+ stats["error"] = 1
175
+ stats["status_code"] = response.status_code
176
+ try:
177
+ stats["prompt_tokens"] = response.json()["usage"]["prompt_tokens"]
178
+ except KeyError:
179
+ stats["prompt_tokens"] = None
180
+ try:
181
+ stats["completion_tokens"] = response.json()["usage"]["completion_tokens"]
182
+ except KeyError:
183
+ stats["completion_tokens"] = None
184
+ stats["e2e_time"] = e2e_time
185
+ stats["provider"] = self.compute_provider
186
+ stats["model"] = self.model
146
187
 
147
- if self.compute_provider == "CoreWeave":
148
- request_data = {"model": self.model, **kwargs}
149
- response = requests.post(
150
- self.endpoint,
151
- headers=self._nim_metadata.get_headers_for_coreweave_request(),
152
- json=request_data,
188
+ conn = sqlite3.connect(self.file_name)
189
+ cursor = conn.cursor()
190
+ try:
191
+ cursor.execute(
192
+ """
193
+ INSERT INTO metrics (error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model)
194
+ VALUES (?, ?, ?, ?, ?, ?, ?)
195
+ """,
196
+ (
197
+ stats["error"],
198
+ stats["success"],
199
+ stats["status_code"],
200
+ stats["prompt_tokens"],
201
+ stats["completion_tokens"],
202
+ stats["e2e_time"],
203
+ stats["model"],
204
+ ),
153
205
  )
154
- response.raise_for_status()
155
- return response.json()
206
+ conn.commit()
207
+ finally:
208
+ conn.close()
156
209
 
157
- elif self.compute_provider == "NVCF":
210
+ @retry_on_status(status_codes=[500], max_retries=3, delay=5)
211
+ @retry_on_status(status_codes=[504])
212
+ def __call__(self, **kwargs):
213
+
214
+ if self.first_request:
215
+ # Put here to guarantee self.file_name is set after task_id exists.
216
+ from metaflow import current
158
217
 
159
- request_data = {"model": self.model, **kwargs}
160
- request_url = f"{NVCF_SUBMIT_ENDPOINT}/{self.function_id}"
218
+ self.file_name = get_storage_path(current.task_id)
219
+
220
+ request_data = {"model": self.model, **kwargs}
221
+ request_url = f"{NVCF_SUBMIT_ENDPOINT}/{self.function_id}"
222
+ retry_delay = 1
223
+ attempts = 0
224
+ t0 = time.time()
225
+ while attempts < self.max_request_retries:
226
+ try:
227
+ attempts += 1
228
+ response = requests.post(
229
+ request_url,
230
+ headers=self._nim_metadata.get_headers_for_nvcf_request(),
231
+ json=request_data,
232
+ )
233
+ if response.status_code == 202:
234
+ invocation_id = response.headers.get("NVCF-REQID")
235
+ self.invocations.append(invocation_id)
236
+ self._status = JobStatus.SUBMITTED
237
+ elif response.status_code == 200:
238
+ tf = time.time()
239
+ if self.monitor:
240
+ self._log_stats(response, tf - t0)
241
+ self._status = JobStatus.SUCCESSFUL
242
+ self._result = response.json()
243
+ return self._result
244
+ elif response.status_code == 400:
245
+ self._status = JobStatus.FAILED
246
+ msg = (
247
+ "[@nim ERROR] The OpenAI-compatible returned a 400 status code. "
248
+ + "Known causes include improper requests or prompts with too many tokens for the selected model. "
249
+ + "Please contact Outerbounds if you need assistance resolving the issue."
250
+ )
251
+ print(msg, file=sys.stderr)
252
+ self._result = {"ERROR": msg}
253
+ return self._result
254
+ except (
255
+ requests.exceptions.ConnectionError,
256
+ requests.exceptions.ReadTimeout,
257
+ ) as e:
258
+ # ConnectionErrors are generally temporary errors like DNS resolution failures,
259
+ # timeouts etc.
260
+ print(
261
+ "received error of type {}. Retrying...".format(type(e)),
262
+ e,
263
+ file=sys.stderr,
264
+ )
265
+ time.sleep(retry_delay)
266
+ retry_delay *= 2 # Double the delay for the next attempt
267
+ retry_delay += random.uniform(0, 1) # Add jitter
268
+ retry_delay = min(retry_delay, 10)
161
269
 
270
+ def _poll():
271
+ poll_request_url = f"{NVCF_RESULT_ENDPOINT}/{invocation_id}"
162
272
  attempts = 0
273
+ retry_delay = 1
163
274
  while attempts < self.max_request_retries:
164
275
  try:
165
276
  attempts += 1
166
- response = requests.post(
167
- request_url,
277
+ poll_response = requests.get(
278
+ poll_request_url,
168
279
  headers=self._nim_metadata.get_headers_for_nvcf_request(),
169
- json=request_data,
170
280
  )
171
- response.raise_for_status()
172
- if response.status_code == 202:
173
- invocation_id = response.headers.get("NVCF-REQID")
174
- self.invocations.append(invocation_id)
175
- elif response.status_code == 200:
176
- return response.json()
281
+ if poll_response.status_code == 200:
282
+ tf = time.time()
283
+ self._log_stats(response, tf - t0)
284
+ self._status = JobStatus.SUCCESSFUL
285
+ self._result = poll_response.json()
286
+ return self._result
287
+ elif poll_response.status_code == 202:
288
+ self._status = JobStatus.SUBMITTED
289
+ return 202
290
+ elif poll_response.status_code == 400:
291
+ self._status = JobStatus.FAILED
292
+ msg = (
293
+ "[@nim ERROR] The OpenAI-compatible API returned a 400 status code. "
294
+ + "Known causes include improper requests or prompts with too many tokens for the selected model. "
295
+ + "Please contact Outerbounds if you need assistance resolving the issue."
296
+ )
297
+ print(msg, file=sys.stderr)
298
+ self._result = {"@nim ERROR": msg}
299
+ return self._result
177
300
  except (
178
301
  requests.exceptions.ConnectionError,
179
302
  requests.exceptions.ReadTimeout,
180
303
  ) as e:
181
- # ConnectionErrors are generally temporary errors like DNS resolution failures,
182
- # timeouts etc.
183
304
  print(
184
305
  "received error of type {}. Retrying...".format(type(e)),
185
306
  e,
@@ -190,44 +311,8 @@ class NimChatCompletion(object):
190
311
  retry_delay += random.uniform(0, 1) # Add jitter
191
312
  retry_delay = min(retry_delay, 10)
192
313
 
193
- def _poll():
194
- poll_request_url = f"{NVCF_RESULT_ENDPOINT}/{invocation_id}"
195
- attempts = 0
196
-
197
- while attempts < self.max_request_retries:
198
- try:
199
- attempts += 1
200
- poll_response = requests.get(
201
- poll_request_url,
202
- headers=self._nim_metadata.get_headers_for_nvcf_request(),
203
- )
204
- poll_response.raise_for_status()
205
- if poll_response.status_code == 200:
206
- return poll_response.json()
207
- elif poll_response.status_code == 202:
208
- return 202
209
- else:
210
- raise Exception(
211
- f"NVCF returned {poll_response.status_code} status code. Please contact Outerbounds."
212
- )
213
- except (
214
- requests.exceptions.ConnectionError,
215
- requests.exceptions.ReadTimeout,
216
- ) as e:
217
- # ConnectionErrors are generally temporary errors like DNS resolution failures,
218
- # timeouts etc.
219
- print(
220
- "received error of type {}. Retrying...".format(type(e)),
221
- e,
222
- file=sys.stderr,
223
- )
224
- time.sleep(retry_delay)
225
- retry_delay *= 2 # Double the delay for the next attempt
226
- retry_delay += random.uniform(0, 1) # Add jitter
227
- retry_delay = min(retry_delay, 10)
228
-
229
- while True:
230
- data = _poll()
231
- if data and data != 202:
232
- return data
233
- time.sleep(POLL_INTERVAL)
314
+ while True:
315
+ data = _poll()
316
+ if data and data != 202:
317
+ return data
318
+ time.sleep(NVCF_POLL_INTERVAL_SECONDS)
@@ -0,0 +1,5 @@
1
# Directory, relative to the working directory, holding per-task metric databases.
NIM_MONITOR_LOCAL_STORAGE_ROOT = ".nim-monitor"


def get_storage_path(task_id):
    """Return the path of the SQLite metrics file for ``task_id``.

    ``task_id`` is formatted into the path, so any value with a string
    representation is accepted, not only ``str``.
    """
    return f"{NIM_MONITOR_LOCAL_STORAGE_ROOT}/{task_id}.sqlite"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.128
3
+ Version: 1.1.130
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial