ob-metaflow-extensions 1.1.128__py2.py3-none-any.whl → 1.1.129__py2.py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of ob-metaflow-extensions might be problematic. Click here for more details.

@@ -1,10 +1,17 @@
1
1
  from functools import partial
2
- from metaflow.decorators import FlowDecorator
2
+ from uuid import uuid4
3
+ import os, time
4
+ from metaflow.decorators import StepDecorator
3
5
  from metaflow import current
6
+
4
7
  from .nim_manager import NimManager
8
+ from .card import NimMetricsRefresher
9
+ from .utilities import get_storage_path, NIM_MONITOR_LOCAL_STORAGE_ROOT
10
+ from ..card_utilities.async_cards import AsyncPeriodicRefresher
11
+ from ..card_utilities.injector import CardDecoratorInjector
5
12
 
6
13
 
7
- class NimDecorator(FlowDecorator):
14
+ class NimDecorator(StepDecorator, CardDecoratorInjector):
8
15
  """
9
16
  This decorator is used to run NIM containers in Metaflow tasks as sidecars.
10
17
 
@@ -18,13 +25,12 @@ class NimDecorator(FlowDecorator):
18
25
  Valid backend options
19
26
  ---------------------
20
27
  - 'managed': Outerbounds selects a compute provider based on the model.
21
- - 🚧 'dataplane': Run in your account.
22
28
 
23
29
  Valid model options
24
30
  ----------------
25
31
  - 'meta/llama3-8b-instruct': 8B parameter model
26
32
  - 'meta/llama3-70b-instruct': 70B parameter model
27
- - Upon request, any model here: https://nvcf.ngc.nvidia.com/functions?filter=nvidia-functions
33
+ - any model here: https://nvcf.ngc.nvidia.com/functions?filter=nvidia-functions
28
34
 
29
35
  Parameters
30
36
  ----------
@@ -32,21 +38,100 @@ class NimDecorator(FlowDecorator):
32
38
  List of NIM containers running models in sidecars.
33
39
  backend: str
34
40
  Compute provider to run the NIM container.
41
+ queue_timeout : int
42
+ Time to keep the job in NVCF's queue.
35
43
  """
36
44
 
37
45
  name = "nim"
38
46
  defaults = {
39
47
  "models": [],
40
48
  "backend": "managed",
49
+ "monitor": True,
50
+ "persist_db": False,
51
+ "queue_timeout": 5 * 24 * 3600, # Default 5 days in seconds
41
52
  }
42
53
 
43
- def flow_init(
44
- self, flow, graph, environment, flow_datastore, metadata, logger, echo, options
54
+ def step_init(
55
+ self, flow, graph, step_name, decorators, environment, flow_datastore, logger
45
56
  ):
57
+
58
+ if self.attributes["monitor"]:
59
+ self.attach_card_decorator(
60
+ flow,
61
+ step_name,
62
+ NimMetricsRefresher.CARD_ID,
63
+ "blank",
64
+ refresh_interval=4.0,
65
+ )
66
+
46
67
  current._update_env(
47
68
  {
48
69
  "nim": NimManager(
49
- models=self.attributes["models"], backend=self.attributes["backend"]
70
+ models=self.attributes["models"],
71
+ backend=self.attributes["backend"],
72
+ flow=flow,
73
+ step_name=step_name,
74
+ monitor=self.attributes["monitor"],
75
+ queue_timeout=self.attributes["queue_timeout"],
50
76
  )
51
77
  }
52
78
  )
79
+
80
+ def task_decorate(
81
+ self, step_func, flow, graph, retry_count, max_user_code_retries, ubf_context
82
+ ):
83
+ if self.attributes["monitor"]:
84
+
85
+ import sqlite3
86
+ from metaflow import current
87
+
88
+ file_path = get_storage_path(current.task_id)
89
+ if os.path.exists(file_path):
90
+ os.remove(file_path)
91
+ os.makedirs(NIM_MONITOR_LOCAL_STORAGE_ROOT, exist_ok=True)
92
+ conn = sqlite3.connect(file_path)
93
+ cursor = conn.cursor()
94
+ cursor.execute(
95
+ """
96
+ CREATE TABLE metrics (
97
+ error INTEGER,
98
+ success INTEGER,
99
+ status_code INTEGER,
100
+ prompt_tokens INTEGER,
101
+ completion_tokens INTEGER,
102
+ e2e_time NUMERIC,
103
+ model TEXT
104
+ )
105
+ """
106
+ )
107
+
108
+ def _wrapped_step_func(*args, **kwargs):
109
+ async_refresher_metrics = AsyncPeriodicRefresher(
110
+ NimMetricsRefresher(),
111
+ updater_interval=4.0,
112
+ collector_interval=2.0,
113
+ file_name=file_path,
114
+ )
115
+ try:
116
+ async_refresher_metrics.start()
117
+ return step_func(*args, **kwargs)
118
+ finally:
119
+ time.sleep(5.0) # buffer for the last update to synchronize
120
+ async_refresher_metrics.stop()
121
+
122
+ return _wrapped_step_func
123
+ else:
124
+ return step_func
125
+
126
+ def task_post_step(
127
+ self, step_name, flow, graph, retry_count, max_user_code_retries
128
+ ):
129
+ if not self.attributes["persist_db"]:
130
+ import shutil
131
+
132
+ file_path = get_storage_path(current.task_id)
133
+ if os.path.exists(file_path):
134
+ os.remove(file_path)
135
+ # if this task is the last one, delete the whole enchilada.
136
+ if not os.listdir(NIM_MONITOR_LOCAL_STORAGE_ROOT):
137
+ shutil.rmtree(NIM_MONITOR_LOCAL_STORAGE_ROOT, ignore_errors=True)
@@ -0,0 +1,154 @@
1
+ import os, sqlite3
2
+ from metaflow.cards import (
3
+ Markdown,
4
+ Table,
5
+ ProgressBar,
6
+ )
7
+ from metaflow.decorators import StepDecorator
8
+ from metaflow.metaflow_current import current
9
+
10
+ from .utilities import get_storage_path
11
+ from ..card_utilities.async_cards import CardRefresher
12
+ from ..card_utilities.extra_components import BarPlot, ViolinPlot
13
+
14
+
15
+ def json_to_artifact_table(data):
16
+ return ArtifactTable(data)
17
+
18
+
19
+ class NimMetricsRefresher(CardRefresher):
20
+ CARD_ID = "nim_metrics"
21
+
22
+ def __init__(self) -> None:
23
+ self._metrics_charts = {}
24
+ self._last_updated_on = None
25
+ self._already_rendered = False
26
+ self._file_name = get_storage_path(current.task_id)
27
+
28
+ def sqlite_fetch_func(self, conn):
29
+ cursor = conn.cursor()
30
+ try:
31
+ conn = sqlite3.connect(self._file_name)
32
+ cursor = conn.cursor()
33
+ cursor.execute(
34
+ "SELECT error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model FROM metrics"
35
+ )
36
+ rows = cursor.fetchall()
37
+ data = {
38
+ "error": 0,
39
+ "success": 0,
40
+ "status_code": [],
41
+ "prompt_tokens": [],
42
+ "completion_tokens": [],
43
+ "e2e_time": [],
44
+ "model": [],
45
+ }
46
+ for row in rows:
47
+ data["error"] += row[0]
48
+ data["success"] += row[1]
49
+ data["status_code"].append(row[2])
50
+ data["prompt_tokens"].append(row[3])
51
+ data["completion_tokens"].append(row[4])
52
+ data["e2e_time"].append(row[5])
53
+ data["model"].append(row[6])
54
+ return data
55
+ finally:
56
+ conn.close()
57
+
58
+ def render_card_fresh(self, current_card, data):
59
+ self._already_rendered = True
60
+ current_card.clear()
61
+ current_card.append(Markdown("## Metrics"))
62
+
63
+ self._metrics_charts["request_success"] = BarPlot(
64
+ title="Request success",
65
+ category_name="category",
66
+ value_name="amount",
67
+ orientation="horizontal",
68
+ )
69
+ self._metrics_charts["latency_distribution"] = ViolinPlot(
70
+ title="Latency distribution (s)",
71
+ category_col_name="model",
72
+ value_col_name="e2e_time",
73
+ )
74
+
75
+ current_card.append(
76
+ Table(
77
+ data=[
78
+ [
79
+ self._metrics_charts["request_success"],
80
+ ],
81
+ [self._metrics_charts["latency_distribution"]],
82
+ ]
83
+ )
84
+ )
85
+ current_card.refresh()
86
+
87
+ def on_startup(self, current_card):
88
+ current_card.append(Markdown("# Task-level NIM API metrics"))
89
+ current_card.append(
90
+ Markdown(
91
+ "_waiting for data to appear_",
92
+ )
93
+ )
94
+ current_card.refresh()
95
+
96
+ def on_error(self, current_card, error_message):
97
+
98
+ if isinstance(error_message, FileNotFoundError):
99
+ return
100
+
101
+ if not self._already_rendered:
102
+ current_card.clear()
103
+ current_card.append(
104
+ Markdown(
105
+ f"## Error: {str(error_message)}",
106
+ )
107
+ )
108
+ current_card.refresh()
109
+
110
+ def update_only_components(self, current_card, data_object):
111
+
112
+ # update request success data
113
+ self._metrics_charts["request_success"].spec["data"][0]["values"] = [
114
+ {
115
+ "category": "Successful requests",
116
+ "amount": data_object["metrics"]["success"],
117
+ },
118
+ {"category": "Errors", "amount": data_object["metrics"]["error"]},
119
+ ]
120
+
121
+ latency_data = []
122
+ times = []
123
+ for m, e in zip(
124
+ data_object["metrics"]["model"], data_object["metrics"]["e2e_time"]
125
+ ):
126
+ latency_data.append({"model": m, "e2e_time": e})
127
+ times.append(e)
128
+
129
+ # update latency data
130
+ self._metrics_charts["latency_distribution"].spec["data"][0][
131
+ "values"
132
+ ] = latency_data
133
+
134
+ # update domain for latency plot
135
+ min_time = min(times)
136
+ max_time = max(times)
137
+ for scale in self._metrics_charts["latency_distribution"].spec["scales"]:
138
+ if scale["name"] == "xscale":
139
+ scale["domain"] = [min_time - max_time * 0.1, max_time + max_time * 0.1]
140
+
141
+ current_card.refresh()
142
+
143
+ def on_update(self, current_card, data_object):
144
+ data_object_keys = set(data_object.keys())
145
+ if len(data_object_keys) == 0:
146
+ return
147
+ if len(self._metrics_charts) == 0:
148
+ self.render_card_fresh(current_card, data_object)
149
+ return
150
+ elif len(data_object["metrics"]["status_code"]) == 0:
151
+ return
152
+ else:
153
+ self.update_only_components(current_card, data_object)
154
+ return
@@ -1,19 +1,20 @@
1
- import os
2
- import time
3
- import json
4
- import requests
1
+ import os, sys, time, json, random, requests, sqlite3
5
2
  from urllib.parse import urlparse
6
3
  from metaflow.metaflow_config import SERVICE_URL
7
4
  from metaflow.metaflow_config_funcs import init_config
8
- import sys
9
- import random
5
+ from .utilities import get_storage_path
6
+ from ..nvcf.nvcf import retry_on_status
7
+
10
8
 
11
9
  NVCF_URL = "https://api.nvcf.nvidia.com"
12
10
  NVCF_SUBMIT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/functions"
13
11
  NVCF_RESULT_ENDPOINT = f"{NVCF_URL}/v2/nvcf/pexec/status"
14
-
15
- COMMON_HEADERS = {"accept": "application/json", "Content-Type": "application/json"}
16
- POLL_INTERVAL = 1
12
+ NVCF_POLL_INTERVAL_SECONDS = 1
13
+ COMMON_HEADERS = {
14
+ "accept": "application/json",
15
+ "Content-Type": "application/json",
16
+ "nvcf-feature-enable-gateway-timeout": "true",
17
+ }
17
18
 
18
19
 
19
20
  class NimMetadata(object):
@@ -56,41 +57,33 @@ class NimMetadata(object):
56
57
  def get_nvcf_chat_completion_models(self):
57
58
  return self._nvcf_chat_completion_models
58
59
 
59
- def get_coreweave_chat_completion_models(self):
60
- return self._coreweave_chat_completion_models
61
-
62
60
  def get_headers_for_nvcf_request(self):
63
61
  return {**COMMON_HEADERS, "Authorization": f"Bearer {self._ngc_api_key}"}
64
62
 
65
- def get_headers_for_coreweave_request(self):
66
- return COMMON_HEADERS
67
-
68
63
 
69
64
  class NimManager(object):
70
- def __init__(self, models, backend):
65
+ def __init__(self, models, backend, flow, step_name, monitor, queue_timeout):
66
+
71
67
  nim_metadata = NimMetadata()
72
68
  if backend == "managed":
73
69
  nvcf_models = [
74
70
  m["name"] for m in nim_metadata.get_nvcf_chat_completion_models()
75
71
  ]
76
- cw_models = [
77
- m["name"] for m in nim_metadata.get_coreweave_chat_completion_models()
78
- ]
79
72
 
80
73
  self.models = {}
81
74
  for m in models:
82
75
  if m in nvcf_models:
83
76
  self.models[m] = NimChatCompletion(
84
- model=m, provider="NVCF", nim_metadata=nim_metadata
85
- )
86
- elif m in cw_models:
87
- self.models[m] = NimChatCompletion(
88
- model=m, provider="CoreWeave", nim_metadata=nim_metadata
77
+ model=m,
78
+ provider="NVCF",
79
+ nim_metadata=nim_metadata,
80
+ monitor=monitor,
81
+ queue_timeout=queue_timeout,
89
82
  )
90
83
  else:
91
84
  raise ValueError(
92
85
  f"Model {m} not supported by the Outerbounds @nim offering."
93
- f"\nYou can choose from these options: {nvcf_models + cw_models}\n\n"
86
+ f"\nYou can choose from these options: {nvcf_models}\n\n"
94
87
  "Reach out to Outerbounds if there are other models you'd like supported."
95
88
  )
96
89
  else:
@@ -99,12 +92,21 @@ class NimManager(object):
99
92
  )
100
93
 
101
94
 
95
+ class JobStatus(object):
96
+ SUBMITTED = "SUBMITTED"
97
+ RUNNING = "RUNNING"
98
+ SUCCESSFUL = "SUCCESSFUL"
99
+ FAILED = "FAILED"
100
+
101
+
102
102
  class NimChatCompletion(object):
103
103
  def __init__(
104
104
  self,
105
105
  model="meta/llama3-8b-instruct",
106
- provider="CoreWeave",
106
+ provider="NVCF",
107
107
  nim_metadata=None,
108
+ monitor=False,
109
+ queue_timeout=None,
108
110
  **kwargs,
109
111
  ):
110
112
  if nim_metadata is None:
@@ -118,19 +120,9 @@ class NimChatCompletion(object):
118
120
  self.max_request_retries = int(
119
121
  os.environ.get("METAFLOW_EXT_HTTP_MAX_RETRIES", "10")
120
122
  )
123
+ self.monitor = monitor
121
124
 
122
- if self.compute_provider == "CoreWeave":
123
- cw_model_names = [
124
- m["name"]
125
- for m in self._nim_metadata.get_coreweave_chat_completion_models()
126
- ]
127
- self.model = model
128
- self.ip_address = self._nim_metadata.get_coreweave_chat_completion_models()[
129
- cw_model_names.index(model)
130
- ]["ip-address"]
131
- self.endpoint = f"http://{self.ip_address}:8000/v1/chat/completions"
132
-
133
- elif self.compute_provider == "NVCF":
125
+ if self.compute_provider == "NVCF":
134
126
  nvcf_model_names = [
135
127
  m["name"] for m in self._nim_metadata.get_nvcf_chat_completion_models()
136
128
  ]
@@ -141,45 +133,175 @@ class NimChatCompletion(object):
141
133
  self.version_id = self._nim_metadata.get_nvcf_chat_completion_models()[
142
134
  nvcf_model_names.index(model)
143
135
  ]["version-id"]
136
+ else:
137
+ raise ValueError(
138
+ f"Backend compute provider {self.compute_provider} not yet supported for @nim."
139
+ )
144
140
 
145
- def __call__(self, **kwargs):
141
+ # to know whether to set file_name
142
+ self.first_request = True
143
+
144
+ # TODO (Eddie) - this may make more sense in a base class.
145
+ # @nim arch needs redesign if customers start using it in more creative ways.
146
+ self._poll_seconds = "3600"
147
+ self._queue_timeout = queue_timeout
148
+ self._status = None
149
+ self._result = {}
150
+
151
+ @property
152
+ def status(self):
153
+ return self._status
154
+
155
+ @property
156
+ def has_failed(self):
157
+ return self._status == JobStatus.FAILED
158
+
159
+ @property
160
+ def is_running(self):
161
+ return self._status == JobStatus.SUBMITTED
162
+
163
+ @property
164
+ def result(self):
165
+ return self._result
166
+
167
+ def _log_stats(self, response, e2e_time):
168
+ stats = {}
169
+ if response.status_code == 200:
170
+ stats["success"] = 1
171
+ stats["error"] = 0
172
+ else:
173
+ stats["success"] = 0
174
+ stats["error"] = 1
175
+ stats["status_code"] = response.status_code
176
+ try:
177
+ stats["prompt_tokens"] = response.json()["usage"]["prompt_tokens"]
178
+ except KeyError:
179
+ stats["prompt_tokens"] = None
180
+ try:
181
+ stats["completion_tokens"] = response.json()["usage"]["completion_tokens"]
182
+ except KeyError:
183
+ stats["completion_tokens"] = None
184
+ stats["e2e_time"] = e2e_time
185
+ stats["provider"] = self.compute_provider
186
+ stats["model"] = self.model
146
187
 
147
- if self.compute_provider == "CoreWeave":
148
- request_data = {"model": self.model, **kwargs}
149
- response = requests.post(
150
- self.endpoint,
151
- headers=self._nim_metadata.get_headers_for_coreweave_request(),
152
- json=request_data,
188
+ conn = sqlite3.connect(self.file_name)
189
+ cursor = conn.cursor()
190
+ try:
191
+ cursor.execute(
192
+ """
193
+ INSERT INTO metrics (error, success, status_code, prompt_tokens, completion_tokens, e2e_time, model)
194
+ VALUES (?, ?, ?, ?, ?, ?, ?)
195
+ """,
196
+ (
197
+ stats["error"],
198
+ stats["success"],
199
+ stats["status_code"],
200
+ stats["prompt_tokens"],
201
+ stats["completion_tokens"],
202
+ stats["e2e_time"],
203
+ stats["model"],
204
+ ),
153
205
  )
154
- response.raise_for_status()
155
- return response.json()
206
+ conn.commit()
207
+ finally:
208
+ conn.close()
156
209
 
157
- elif self.compute_provider == "NVCF":
210
+ @retry_on_status(status_codes=[504])
211
+ def __call__(self, **kwargs):
212
+
213
+ if self.first_request:
214
+ # Put here to guarantee self.file_name is set after task_id exists.
215
+ from metaflow import current
158
216
 
159
- request_data = {"model": self.model, **kwargs}
160
- request_url = f"{NVCF_SUBMIT_ENDPOINT}/{self.function_id}"
217
+ self.file_name = get_storage_path(current.task_id)
218
+
219
+ request_data = {"model": self.model, **kwargs}
220
+ request_url = f"{NVCF_SUBMIT_ENDPOINT}/{self.function_id}"
221
+ retry_delay = 1
222
+ attempts = 0
223
+ t0 = time.time()
224
+ while attempts < self.max_request_retries:
225
+ try:
226
+ attempts += 1
227
+ response = requests.post(
228
+ request_url,
229
+ headers=self._nim_metadata.get_headers_for_nvcf_request(),
230
+ json=request_data,
231
+ )
232
+ if response.status_code == 202:
233
+ invocation_id = response.headers.get("NVCF-REQID")
234
+ self.invocations.append(invocation_id)
235
+ self._status = JobStatus.SUBMITTED
236
+ elif response.status_code == 200:
237
+ tf = time.time()
238
+ if self.monitor:
239
+ self._log_stats(response, tf - t0)
240
+ self._status = JobStatus.SUCCESSFUL
241
+ self._result = response.json()
242
+ return self._result
243
+ elif response.status_code == 400:
244
+ self._status = JobStatus.FAILED
245
+ msg = (
246
+ "[@nim ERROR] The OpenAI-compatible returned a 400 status code. "
247
+ + "Known causes include improper requests or prompts with too many tokens for the selected model. "
248
+ + "Please contact Outerbounds if you need assistance resolving the issue."
249
+ )
250
+ print(msg, file=sys.stderr)
251
+ self._result = {"ERROR": msg}
252
+ return self._result
253
+ except (
254
+ requests.exceptions.ConnectionError,
255
+ requests.exceptions.ReadTimeout,
256
+ ) as e:
257
+ # ConnectionErrors are generally temporary errors like DNS resolution failures,
258
+ # timeouts etc.
259
+ print(
260
+ "received error of type {}. Retrying...".format(type(e)),
261
+ e,
262
+ file=sys.stderr,
263
+ )
264
+ time.sleep(retry_delay)
265
+ retry_delay *= 2 # Double the delay for the next attempt
266
+ retry_delay += random.uniform(0, 1) # Add jitter
267
+ retry_delay = min(retry_delay, 10)
161
268
 
269
+ @retry_on_status(status_codes=[500], max_retries=3, delay=5)
270
+ @retry_on_status(status_codes=[504])
271
+ def _poll():
272
+ poll_request_url = f"{NVCF_RESULT_ENDPOINT}/{invocation_id}"
162
273
  attempts = 0
274
+ retry_delay = 1
163
275
  while attempts < self.max_request_retries:
164
276
  try:
165
277
  attempts += 1
166
- response = requests.post(
167
- request_url,
278
+ poll_response = requests.get(
279
+ poll_request_url,
168
280
  headers=self._nim_metadata.get_headers_for_nvcf_request(),
169
- json=request_data,
170
281
  )
171
- response.raise_for_status()
172
- if response.status_code == 202:
173
- invocation_id = response.headers.get("NVCF-REQID")
174
- self.invocations.append(invocation_id)
175
- elif response.status_code == 200:
176
- return response.json()
282
+ if poll_response.status_code == 200:
283
+ tf = time.time()
284
+ self._log_stats(response, tf - t0)
285
+ self._status = JobStatus.SUCCESSFUL
286
+ self._result = poll_response.json()
287
+ return self._result
288
+ elif poll_response.status_code == 202:
289
+ self._status = JobStatus.SUBMITTED
290
+ return 202
291
+ elif poll_response.status_code == 400:
292
+ self._status = JobStatus.FAILED
293
+ msg = (
294
+ "[@nim ERROR] The OpenAI-compatible API returned a 400 status code. "
295
+ + "Known causes include improper requests or prompts with too many tokens for the selected model. "
296
+ + "Please contact Outerbounds if you need assistance resolving the issue."
297
+ )
298
+ print(msg, file=sys.stderr)
299
+ self._result = {"@nim ERROR": msg}
300
+ return self._result
177
301
  except (
178
302
  requests.exceptions.ConnectionError,
179
303
  requests.exceptions.ReadTimeout,
180
304
  ) as e:
181
- # ConnectionErrors are generally temporary errors like DNS resolution failures,
182
- # timeouts etc.
183
305
  print(
184
306
  "received error of type {}. Retrying...".format(type(e)),
185
307
  e,
@@ -190,44 +312,8 @@ class NimChatCompletion(object):
190
312
  retry_delay += random.uniform(0, 1) # Add jitter
191
313
  retry_delay = min(retry_delay, 10)
192
314
 
193
- def _poll():
194
- poll_request_url = f"{NVCF_RESULT_ENDPOINT}/{invocation_id}"
195
- attempts = 0
196
-
197
- while attempts < self.max_request_retries:
198
- try:
199
- attempts += 1
200
- poll_response = requests.get(
201
- poll_request_url,
202
- headers=self._nim_metadata.get_headers_for_nvcf_request(),
203
- )
204
- poll_response.raise_for_status()
205
- if poll_response.status_code == 200:
206
- return poll_response.json()
207
- elif poll_response.status_code == 202:
208
- return 202
209
- else:
210
- raise Exception(
211
- f"NVCF returned {poll_response.status_code} status code. Please contact Outerbounds."
212
- )
213
- except (
214
- requests.exceptions.ConnectionError,
215
- requests.exceptions.ReadTimeout,
216
- ) as e:
217
- # ConnectionErrors are generally temporary errors like DNS resolution failures,
218
- # timeouts etc.
219
- print(
220
- "received error of type {}. Retrying...".format(type(e)),
221
- e,
222
- file=sys.stderr,
223
- )
224
- time.sleep(retry_delay)
225
- retry_delay *= 2 # Double the delay for the next attempt
226
- retry_delay += random.uniform(0, 1) # Add jitter
227
- retry_delay = min(retry_delay, 10)
228
-
229
- while True:
230
- data = _poll()
231
- if data and data != 202:
232
- return data
233
- time.sleep(POLL_INTERVAL)
315
+ while True:
316
+ data = _poll()
317
+ if data and data != 202:
318
+ return data
319
+ time.sleep(NVCF_POLL_INTERVAL_SECONDS)
@@ -0,0 +1,5 @@
1
+ NIM_MONITOR_LOCAL_STORAGE_ROOT = ".nim-monitor"
2
+
3
+
4
+ def get_storage_path(task_id):
5
+ return f"{NIM_MONITOR_LOCAL_STORAGE_ROOT}/" + task_id + ".sqlite"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: ob-metaflow-extensions
3
- Version: 1.1.128
3
+ Version: 1.1.129
4
4
  Summary: Outerbounds Platform Extensions for Metaflow
5
5
  Author: Outerbounds, Inc.
6
6
  License: Commercial