nv-ingest-api 2025.4.17.dev20250417__py3-none-any.whl → 2025.4.18.dev20250418__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

@@ -2,14 +2,12 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
- # pylint: skip-file
6
-
7
5
  import logging
8
6
  import re
9
7
  import time
10
- from typing import Any
8
+ from typing import Any, Union, Tuple, Optional, Dict, Callable
9
+ from urllib.parse import urlparse
11
10
 
12
- import httpx
13
11
  import requests
14
12
 
15
13
  from nv_ingest_api.internal.schemas.message_brokers.response_schema import ResponseSchema
@@ -64,28 +62,11 @@ _TERMINAL_RESPONSE_STATUSES = [
64
62
 
65
63
  class RestClient(MessageBrokerClientBase):
66
64
  """
67
- A client for interfacing with the nv-ingest HTTP endpoint, providing mechanisms for sending and receiving messages
68
- with retry logic and connection management.
69
-
70
- Parameters
71
- ----------
72
- host : str
73
- The hostname of the HTTP server.
74
- port : int
75
- The port number of the HTTP server.
76
- max_retries : int, optional
77
- The maximum number of retry attempts for operations. Default is 0 (no retries).
78
- max_backoff : int, optional
79
- The maximum backoff delay between retries in seconds. Default is 32 seconds.
80
- connection_timeout : int, optional
81
- The timeout in seconds for connecting to the HTTP server. Default is 300 seconds.
82
- http_allocator : Any, optional
83
- The HTTP client allocator.
84
-
85
- Attributes
86
- ----------
87
- client : Any
88
- The HTTP client instance used for operations.
65
+ A client for interfacing with an HTTP endpoint (e.g., nv-ingest), providing mechanisms for sending
66
+ and receiving messages with retry logic using the `requests` library by default, but allowing a custom
67
+ HTTP client allocator.
68
+
69
+ Extends MessageBrokerClientBase for interface compatibility.
89
70
  """
90
71
 
91
72
  def __init__(
@@ -94,305 +75,457 @@ class RestClient(MessageBrokerClientBase):
94
75
  port: int,
95
76
  max_retries: int = 0,
96
77
  max_backoff: int = 32,
97
- connection_timeout: int = 300,
98
- http_allocator: Any = httpx.AsyncClient,
78
+ default_connect_timeout: float = 300.0,
79
+ default_read_timeout: Optional[float] = None,
80
+ http_allocator: Optional[Callable[[], Any]] = None,
99
81
  **kwargs,
100
- ):
101
- self._host = host
102
- self._port = port
103
- self._max_retries = max_retries
104
- self._max_backoff = max_backoff
105
- self._connection_timeout = connection_timeout
106
- self._http_allocator = http_allocator
107
- self._client = self._http_allocator()
108
- self._retries = 0
109
-
110
- self._submit_endpoint = "/v1/submit_job"
111
- self._fetch_endpoint = "/v1/fetch_job"
112
-
113
- if "base_url" in kwargs:
114
- logger.debug("Using custom base_url; ignoring host and port")
115
-
116
- self._base_url = kwargs.get("base_url") or self.generate_url(self._host, self._port)
82
+ ) -> None:
83
+ """
84
+ Initializes the RestClient.
85
+
86
+ By default, uses `requests.Session`. If `http_allocator` is provided, it will be called to instantiate
87
+ the client. If a custom allocator is used, the internal methods (`fetch_message`, `submit_message`)
88
+ might need adjustments if the allocated client's API differs significantly from `requests.Session`.
89
+
90
+ Parameters
91
+ ----------
92
+ host : str
93
+ The hostname or IP address of the HTTP server.
94
+ port : int
95
+ The port number of the HTTP server.
96
+ max_retries : int, optional
97
+ Maximum number of retry attempts for connection errors or specific retryable HTTP statuses. Default is 0.
98
+ max_backoff : int, optional
99
+ Maximum backoff delay between retries, in seconds. Default is 32.
100
+ default_connect_timeout : float, optional
101
+ Default timeout in seconds for establishing a connection. Default is 300.0.
102
+ default_read_timeout : float, optional
103
+ Default timeout in seconds for waiting for data after connection. Default is None.
104
+ http_allocator : Optional[Callable[[], Any]], optional
105
+ A callable that returns an HTTP client instance. If None, `requests.Session()` is used.
106
+
107
+ Returns
108
+ -------
109
+ None
110
+ """
111
+ self._host: str = host
112
+ self._port: int = port
113
+ self._max_retries: int = max_retries
114
+ self._max_backoff: int = max_backoff
115
+ self._default_connect_timeout: float = default_connect_timeout
116
+ self._default_read_timeout: Optional[float] = default_read_timeout
117
+ self._http_allocator: Optional[Callable[[], Any]] = http_allocator
118
+
119
+ self._timeout: Tuple[float, Optional[float]] = (self._default_connect_timeout, default_read_timeout)
120
+
121
+ if self._http_allocator is None:
122
+ self._client: Any = requests.Session()
123
+ logger.debug("RestClient initialized using default requests.Session.")
124
+ else:
125
+ try:
126
+ self._client = self._http_allocator()
127
+ logger.debug(f"RestClient initialized using provided http_allocator: {self._http_allocator.__name__}")
128
+ if not isinstance(self._client, requests.Session):
129
+ logger.warning(
130
+ "Provided http_allocator does not create a requests.Session. "
131
+ "Internal HTTP calls may fail if the client API is incompatible."
132
+ )
133
+ except Exception as e:
134
+ logger.exception(
135
+ f"Failed to instantiate client using provided http_allocator: {e}. "
136
+ f"Falling back to requests.Session."
137
+ )
138
+ self._client = requests.Session()
139
+
140
+ self._submit_endpoint: str = "/v1/submit_job"
141
+ self._fetch_endpoint: str = "/v1/fetch_job"
142
+ self._base_url: str = kwargs.get("base_url") or self._generate_url(self._host, self._port)
117
143
  self._headers = kwargs.get("headers", {})
118
144
  self._auth = kwargs.get("auth", None)
119
145
 
120
- def _connect(self) -> None:
146
+ logger.debug(f"RestClient base URL set to: {self._base_url}")
147
+
148
+ @staticmethod
149
+ def _generate_url(host: str, port: int) -> str:
121
150
  """
122
- Attempts to reconnect to the HTTP server if the current connection is not responsive.
151
+ Constructs a base URL from host and port, intelligently handling schemes and existing ports.
152
+
153
+ Parameters
154
+ ----------
155
+ host : str
156
+ Hostname, IP address, or full URL (e.g., "localhost", "192.168.1.100",
157
+ "http://example.com", "https://api.example.com:8443/v1").
158
+ port : int
159
+ The default port number to use if the host string does not explicitly specify one.
160
+
161
+ Returns
162
+ -------
163
+ str
164
+ A fully constructed base URL string, including scheme, hostname, port,
165
+ and any original path, without a trailing slash.
166
+
167
+ Raises
168
+ ------
169
+ ValueError
170
+ If the host string appears to be a URL but lacks a valid hostname.
123
171
  """
124
- ping_result = self.ping()
172
+ url_str: str = str(host).strip()
173
+ scheme: str = "http"
174
+ parsed_path: Optional[str] = None
175
+ effective_port: int = port
176
+ hostname: Optional[str] = None
177
+
178
+ if re.match(r"^https?://", url_str, re.IGNORECASE):
179
+ parsed_url = urlparse(url_str)
180
+ hostname = parsed_url.hostname
181
+ if hostname is None:
182
+ raise ValueError(f"Invalid URL provided in host string: '{url_str}'. Could not parse a valid hostname.")
183
+ scheme = parsed_url.scheme
184
+ if parsed_url.port is not None:
185
+ effective_port = parsed_url.port
186
+ else:
187
+ effective_port = port
188
+ if parsed_url.path and parsed_url.path.strip("/"):
189
+ parsed_path = parsed_url.path
190
+ else:
191
+ hostname = url_str
192
+ effective_port = port
193
+
194
+ if not hostname:
195
+ raise ValueError(f"Could not determine a valid hostname from input: '{host}'")
196
+
197
+ base_url: str = f"{scheme}://{hostname}:{effective_port}"
198
+ if parsed_path:
199
+ if not parsed_path.startswith("/"):
200
+ parsed_path = "/" + parsed_path
201
+ base_url += parsed_path
125
202
 
126
- if ping_result.response_code != 0:
127
- logger.debug("Reconnecting to HTTP server")
128
- self._client = self._http_allocator()
203
+ final_url: str = base_url.rstrip("/")
204
+ logger.debug(f"Generated base URL: {final_url}")
205
+ return final_url
129
206
 
130
207
  @property
131
208
  def max_retries(self) -> int:
209
+ """
210
+ Maximum number of retry attempts configured for operations.
211
+
212
+ Returns
213
+ -------
214
+ int
215
+ The maximum number of retries.
216
+ """
132
217
  return self._max_retries
133
218
 
134
219
  @max_retries.setter
135
220
  def max_retries(self, value: int) -> None:
221
+ """
222
+ Sets the maximum number of retry attempts.
223
+
224
+ Parameters
225
+ ----------
226
+ value : int
227
+ The new maximum number of retries. Must be a non-negative integer.
228
+
229
+ Raises
230
+ ------
231
+ ValueError
232
+ If value is not a non-negative integer.
233
+ """
234
+ if not isinstance(value, int) or value < 0:
235
+ raise ValueError("max_retries must be a non-negative integer.")
136
236
  self._max_retries = value
137
237
 
138
238
  def get_client(self) -> Any:
139
239
  """
140
- Returns a HTTP client instance, reconnecting if necessary.
240
+ Returns the underlying HTTP client instance.
141
241
 
142
242
  Returns
143
243
  -------
144
244
  Any
145
- The HTTP client instance.
245
+ The active HTTP client instance.
146
246
  """
147
- if self._client is None:
148
- self._connect()
149
247
  return self._client
150
248
 
151
- def ping(self) -> ResponseSchema:
249
+ def ping(self) -> "ResponseSchema":
152
250
  """
153
- Checks if the HTTP server is responsive.
251
+ Checks if the HTTP server endpoint is responsive using an HTTP GET request.
154
252
 
155
253
  Returns
156
254
  -------
157
- bool
158
- True if the server responds to a ping, False otherwise.
255
+ ResponseSchema
256
+ An object encapsulating the outcome:
257
+ - response_code = 0 indicates success (HTTP status code < 400).
258
+ - response_code = 1 indicates failure, with details in response_reason.
159
259
  """
260
+ ping_timeout: Tuple[float, float] = (min(self._default_connect_timeout, 5.0), 10.0)
261
+ logger.debug(f"Attempting to ping server at {self._base_url} with timeout {ping_timeout}")
160
262
  try:
161
- # Implement a simple GET request to a health endpoint or root
162
- self._client.ping()
163
- return ResponseSchema(response_code=0)
164
- except (httpx.HTTPError, AttributeError):
165
- return ResponseSchema(response_code=1, response_reason="Failed to ping HTTP server")
166
-
167
- @staticmethod
168
- def generate_url(user_provided_url, user_provided_port) -> str:
169
- """Examines the user defined URL for http*://. If that
170
- pattern is detected the URL is used as provided by the user.
171
- If that pattern does not exist then the assumption is made that
172
- the endpoint is simply `http://` and that is prepended
173
- to the user supplied endpoint.
174
-
175
- Args:
176
- user_provided_url str: Endpoint where the Rest service is running
177
-
178
- Returns:
179
- str: Fully validated URL
263
+ if isinstance(self._client, requests.Session):
264
+ response: requests.Response = self._client.get(self._base_url, timeout=ping_timeout)
265
+ response.raise_for_status()
266
+ logger.debug(f"Ping successful to {self._base_url} (Status: {response.status_code})")
267
+ return ResponseSchema(response_code=0, response_reason="Ping OK")
268
+ except requests.exceptions.RequestException as e:
269
+ error_reason: str = f"Ping failed due to RequestException for {self._base_url}: {e}"
270
+ logger.warning(error_reason)
271
+ return ResponseSchema(response_code=1, response_reason=error_reason)
272
+ except Exception as e:
273
+ error_reason: str = f"Unexpected error during ping to {self._base_url}: {e}"
274
+ logger.exception(error_reason)
275
+ return ResponseSchema(response_code=1, response_reason=error_reason)
276
+
277
+ def fetch_message(
278
+ self, job_id: str, timeout: Optional[Union[float, Tuple[float, float]]] = None
279
+ ) -> "ResponseSchema":
180
280
  """
181
- if not re.match(r"^https?://", user_provided_url):
182
- # Add the default `http://` if it's not already present in the URL
183
- user_provided_url = f"http://{user_provided_url}:{user_provided_port}"
184
- else:
185
- user_provided_url = f"{user_provided_url}:{user_provided_port}"
186
- return user_provided_url
281
+ Fetches a job result message from the server's fetch endpoint.
187
282
 
188
- def fetch_message(self, job_id: str, timeout: float = (10, 600)) -> ResponseSchema:
189
- """
190
- Fetches a message from the specified queue with retries on failure, handling streaming HTTP responses.
283
+ Handles retries for connection errors and non-terminal HTTP errors based on the max_retries configuration.
284
+ Specific HTTP statuses are treated as immediate failures (terminal) or as job not ready (HTTP 202).
191
285
 
192
286
  Parameters
193
287
  ----------
194
288
  job_id : str
195
- The server-side job identifier.
196
- timeout : float
197
- The timeout in seconds for blocking until a message is available.
289
+ The server-assigned identifier of the job to fetch.
290
+ timeout : float or tuple of float, optional
291
+ Specific timeout override for this request.
198
292
 
199
293
  Returns
200
294
  -------
201
295
  ResponseSchema
202
- The fetched message wrapped in a ResponseSchema object.
203
- """
204
- retries = 0
205
- url = f"{self._base_url}{self._fetch_endpoint}/{job_id}"
296
+ - response_code = 0: Success (HTTP 200) with the job result.
297
+ - response_code = 1: Terminal failure (e.g., 404, 400, 5xx, or max retries exceeded).
298
+ - response_code = 2: Job not ready (HTTP 202).
206
299
 
300
+ Raises
301
+ ------
302
+ TypeError
303
+ If the configured client does not support the required HTTP GET method.
304
+ """
207
305
  # Ensure headers are included
208
306
  headers = {"Content-Type": "application/json"}
209
307
  headers.update(self._headers)
210
308
 
211
- while True:
212
- try:
213
- logger.debug(f"Invoking fetch_message http endpoint @ '{url}'")
214
-
215
- # Fetch using streaming response
216
- with requests.get(
217
- url,
218
- timeout=(30, 600),
219
- stream=True,
220
- headers=headers,
221
- auth=self._auth,
222
- ) as result:
223
- response_code = result.status_code
224
-
225
- if response_code in _TERMINAL_RESPONSE_STATUSES:
226
- # Terminal response code; return error ResponseSchema
227
- return ResponseSchema(
228
- response_code=1,
229
- response_reason=(
230
- f"Terminal response code {response_code} received when fetching JobSpec: {job_id}"
231
- ),
232
- response=result.text,
233
- )
234
-
235
- if response_code == 200:
236
- # Handle streaming response, reconstructing payload incrementally
237
- response_chunks = []
238
- for chunk in result.iter_content(chunk_size=1024 * 1024): # 1MB chunks
239
- if chunk:
240
- response_chunks.append(chunk)
241
- full_response = b"".join(response_chunks).decode("utf-8")
309
+ retries: int = 0
310
+ url: str = f"{self._base_url}{self._fetch_endpoint}/{job_id}"
311
+ req_timeout: Tuple[float, Optional[float]] = self._timeout
242
312
 
243
- return ResponseSchema(
244
- response_code=0,
245
- response_reason="OK",
246
- response=full_response,
247
- )
248
-
249
- elif response_code == 202:
250
- # Job is not ready yet
251
- return ResponseSchema(
252
- response_code=1,
253
- response_reason="Job is not ready yet. Retry later.",
254
- )
313
+ while True:
314
+ result: Optional[Any] = None
315
+ trace_id: Optional[str] = None
316
+ response_code: int = -1
255
317
 
256
- else:
257
- try:
258
- # Retry the operation
259
- retries = self.perform_retry_backoff(retries)
260
- except RuntimeError as rte:
261
- raise rte
262
-
263
- except (ConnectionError, requests.HTTPError, requests.exceptions.ConnectionError) as err:
264
- logger.error(f"Error during fetching, retrying... Error: {err}")
265
- self._client = None # Invalidate client to force reconnection
266
- if "Connection refused" in str(err):
267
- logger.debug(
268
- "Connection refused encountered during fetch; sleeping for 10 seconds before retrying."
318
+ try:
319
+ if isinstance(self._client, requests.Session):
320
+ with self._client.get(
321
+ url, timeout=req_timeout, headers=headers, stream=True, auth=self._auth
322
+ ) as result:
323
+ response_code = result.status_code
324
+ response_text = result.text
325
+
326
+ if response_code in _TERMINAL_RESPONSE_STATUSES:
327
+ error_reason: str = f"Terminal response code {response_code} fetching {job_id}."
328
+ logger.error(f"{error_reason} Response: {response_text[:200]}")
329
+ return ResponseSchema(
330
+ response_code=1, response_reason=error_reason, response=response_text, trace_id=trace_id
331
+ )
332
+ elif response_code == 200:
333
+ try:
334
+ full_response: str = b"".join(c for c in result.iter_content(1024 * 1024) if c).decode(
335
+ "utf-8"
336
+ )
337
+ return ResponseSchema(
338
+ response_code=0, response_reason="OK", response=full_response, trace_id=trace_id
339
+ )
340
+ except Exception as e:
341
+ logger.error(f"Stream processing error for {job_id}: {e}")
342
+ return ResponseSchema(
343
+ response_code=1, response_reason=f"Stream processing error: {e}", trace_id=trace_id
344
+ )
345
+ elif response_code == 202:
346
+ logger.debug(f"Job {job_id} not ready (202)")
347
+ return ResponseSchema(
348
+ response_code=2, response_reason="Job not ready yet. Retry later.", trace_id=trace_id
349
+ )
350
+ else:
351
+ logger.warning(f"Unexpected status {response_code} for {job_id}. Retrying if possible.")
352
+ else:
353
+ raise TypeError(
354
+ f"Unsupported client type for fetch_message: {type(self._client)}. "
355
+ f"Requires a requests.Session compatible API."
269
356
  )
270
- time.sleep(10)
357
+ except requests.exceptions.RequestException as err:
358
+ logger.debug(
359
+ f"RequestException fetching {job_id}: {err}. "
360
+ f"Attempting retry ({retries + 1}/{self._max_retries})..."
361
+ )
271
362
  try:
272
363
  retries = self.perform_retry_backoff(retries)
364
+ continue
273
365
  except RuntimeError as rte:
274
- # Max retries reached
366
+ logger.error(f"Max retries hit fetching {job_id} after RequestException: {rte}")
275
367
  return ResponseSchema(response_code=1, response_reason=str(rte), response=str(err))
276
- except TimeoutError:
277
- raise
278
368
  except Exception as e:
279
- # Handle non-http specific exceptions
280
- logger.error(f"Unexpected error during fetch from {url}: {e}")
369
+ logger.exception(f"Unexpected error fetching {job_id}: {e}")
370
+ return ResponseSchema(response_code=1, response_reason=f"Unexpected fetch error: {e}")
371
+
372
+ try:
373
+ retries = self.perform_retry_backoff(retries)
374
+ continue
375
+ except RuntimeError as rte:
376
+ logger.error(f"Max retries hit fetching {job_id} after HTTP {response_code}: {rte}")
377
+ resp_text_snippet: Optional[str] = response_text[:500] if "response_text" in locals() else None
281
378
  return ResponseSchema(
282
- response_code=1, response_reason=f"Unexpected error during fetch: {e}", response=None
379
+ response_code=1,
380
+ response_reason=f"Max retries after HTTP {response_code}: {rte}",
381
+ response=resp_text_snippet,
382
+ trace_id=trace_id,
283
383
  )
284
384
 
285
- def submit_message(self, channel_name: str, message: str, for_nv_ingest: bool = False) -> ResponseSchema:
385
+ def submit_message(
386
+ self,
387
+ channel_name: str,
388
+ message: str,
389
+ for_nv_ingest: bool = False,
390
+ timeout: Optional[Union[float, Tuple[float, float]]] = None,
391
+ ) -> "ResponseSchema":
286
392
  """
287
- Submits a JobSpec to a specified HTTP endpoint with retries on failure.
393
+ Submits a job message payload to the server's submit endpoint.
394
+
395
+ Handles retries for connection errors and non-terminal HTTP errors based on the max_retries configuration.
396
+ Specific HTTP statuses are treated as immediate failures.
288
397
 
289
398
  Parameters
290
399
  ----------
291
400
  channel_name : str
292
- Not used as part of RestClient but defined in MessageClientBase.
401
+ Not used by RestClient; included for interface compatibility.
293
402
  message : str
294
- The message to submit.
295
- for_nv_ingest : bool
296
- Not used as part of RestClient but defined in MessageClientBase.
403
+ The JSON string representing the job specification payload.
404
+ for_nv_ingest : bool, optional
405
+ Not used by RestClient. Default is False.
406
+ timeout : float or tuple of float, optional
407
+ Specific timeout override for this request.
297
408
 
298
409
  Returns
299
410
  -------
300
411
  ResponseSchema
301
- The response from the server wrapped in a ResponseSchema object.
412
+ - response_code = 0: Success (HTTP 200) with a successful job submission.
413
+ - response_code = 1: Terminal failure (e.g., 422, 400, 5xx, or max retries exceeded).
414
+
415
+ Raises
416
+ ------
417
+ TypeError
418
+ If the configured client does not support the required HTTP POST method.
302
419
  """
303
- retries = 0
304
- url = f"{self._base_url}{self._submit_endpoint}"
420
+ retries: int = 0
421
+ url: str = f"{self._base_url}{self._submit_endpoint}"
422
+ headers: Dict[str, str] = {"Content-Type": "application/json"}
423
+ request_payload: Dict[str, str] = {"payload": message}
424
+ req_timeout: Tuple[float, Optional[float]] = self._timeout
305
425
 
306
426
  # Ensure content-type is present
307
427
  headers = {"Content-Type": "application/json"}
308
428
  headers.update(self._headers)
309
429
 
310
430
  while True:
311
- try:
312
- # Submit via HTTP
313
- result = requests.post(
314
- url,
315
- json={"payload": message},
316
- headers=headers,
317
- auth=self._auth,
318
- timeout=self._connection_timeout,
319
- )
431
+ result: Optional[Any] = None
432
+ trace_id: Optional[str] = None
433
+ response_code: int = -1
320
434
 
321
- response_code = result.status_code
322
- if response_code in _TERMINAL_RESPONSE_STATUSES:
323
- # Terminal response code; return error ResponseSchema
324
- return ResponseSchema(
325
- response_code=1,
326
- response_reason=f"Terminal response code {response_code} received when submitting JobSpec",
327
- trace_id=result.headers.get("x-trace-id"),
435
+ try:
436
+ if isinstance(self._client, requests.Session):
437
+ result = self._client.post(
438
+ url,
439
+ json=request_payload,
440
+ headers=headers,
441
+ auth=self._auth,
442
+ timeout=req_timeout,
328
443
  )
329
- else:
330
- # If 200 we are good, otherwise let's try again
331
- if response_code == 200:
332
- logger.debug(f"JobSpec successfully submitted to http endpoint {self._submit_endpoint}")
333
- # The REST interface returns a JobId, so we capture that here
334
- x_trace_id = result.headers.get("x-trace-id")
444
+ response_code = result.status_code
445
+ trace_id = result.headers.get("x-trace-id")
446
+ response_text: str = result.text
447
+
448
+ if response_code in _TERMINAL_RESPONSE_STATUSES:
449
+ error_reason: str = f"Terminal response code {response_code} submitting job."
450
+ logger.error(f"{error_reason} Response: {response_text[:200]}")
451
+ return ResponseSchema(
452
+ response_code=1, response_reason=error_reason, response=response_text, trace_id=trace_id
453
+ )
454
+ elif response_code == 200:
455
+ server_job_id_raw: str = response_text
456
+ cleaned_job_id: str = server_job_id_raw.strip('"')
457
+ logger.debug(f"Submit successful. Server Job ID: {cleaned_job_id}, Trace: {trace_id}")
335
458
  return ResponseSchema(
336
459
  response_code=0,
337
460
  response_reason="OK",
338
- response=result.text,
339
- transaction_id=result.text,
340
- trace_id=x_trace_id,
461
+ response=server_job_id_raw,
462
+ transaction_id=cleaned_job_id,
463
+ trace_id=trace_id,
341
464
  )
342
465
  else:
343
- # Retry the operation
344
- retries = self.perform_retry_backoff(retries)
345
- except requests.RequestException as e:
346
- logger.error(f"Failed to submit job, retrying... Error: {e}")
347
- self._client = None # Invalidate client to force reconnection
348
- if "Connection refused" in str(e):
349
- logger.debug(
350
- "Connection refused encountered during submission; sleeping for 10 seconds before retrying."
466
+ logger.warning(f"Unexpected status {response_code} on submit. Retrying if possible.")
467
+ else:
468
+ raise TypeError(
469
+ f"Unsupported client type for submit_message: {type(self._client)}. "
470
+ f"Requires a requests.Session compatible API."
351
471
  )
352
- time.sleep(10)
472
+ except requests.exceptions.RequestException as err:
473
+ logger.warning(
474
+ f"RequestException submitting job: {err}. Attempting retry ({retries + 1}/{self._max_retries})..."
475
+ )
353
476
  try:
354
477
  retries = self.perform_retry_backoff(retries)
478
+ continue
355
479
  except RuntimeError as rte:
356
- # Max retries reached
357
- return ResponseSchema(response_code=1, response_reason=str(rte), response=str(e))
480
+ logger.error(f"Max retries hit submitting job after RequestException: {rte}")
481
+ return ResponseSchema(response_code=1, response_reason=str(rte), response=str(err))
358
482
  except Exception as e:
359
- # Handle non-http specific exceptions
360
- logger.error(f"Unexpected error during submission of JobSpec to {url}: {e}")
483
+ logger.exception(f"Unexpected error submitting job: {e}")
484
+ return ResponseSchema(response_code=1, response_reason=f"Unexpected submit error: {e}")
485
+
486
+ try:
487
+ retries = self.perform_retry_backoff(retries)
488
+ continue
489
+ except RuntimeError as rte:
490
+ logger.error(f"Max retries hit submitting job after HTTP {response_code}: {rte}")
491
+ resp_text_snippet: Optional[str] = response_text[:500] if "response_text" in locals() else None
361
492
  return ResponseSchema(
362
- response_code=1, response_reason=f"Unexpected error during JobSpec submission: {e}", response=None
493
+ response_code=1,
494
+ response_reason=f"Max retries after HTTP {response_code}: {rte}",
495
+ response=resp_text_snippet,
496
+ trace_id=trace_id,
363
497
  )
364
498
 
365
- def perform_retry_backoff(self, existing_retries) -> int:
499
+ def perform_retry_backoff(self, existing_retries: int) -> int:
366
500
  """
367
- Attempts to perform a backoff retry delay. This function accepts the
368
- current number of retries that have been attempted and compares
369
- that with the maximum number of retries allowed. If the current
370
- number of retries exceeds the max then a RuntimeError is raised.
501
+ Performs exponential backoff sleep if retries are permitted.
502
+
503
+ Calculates the delay using exponential backoff (2^existing_retries) capped by self._max_backoff.
504
+ Sleeps for the calculated delay if the number of existing_retries is less than max_retries.
371
505
 
372
506
  Parameters
373
507
  ----------
374
508
  existing_retries : int
375
- The number of retries that have been attempted for this operation thus far
509
+ The number of retries already attempted for the current operation.
376
510
 
377
511
  Returns
378
512
  -------
379
513
  int
380
- The updated number of retry attempts that have been made for this operation
514
+ The incremented retry count (existing_retries + 1).
381
515
 
382
516
  Raises
383
517
  ------
384
518
  RuntimeError
385
- Raised if the maximum number of retry attempts has been reached.
519
+ If existing_retries is greater than or equal to max_retries (when max_retries > 0).
386
520
  """
387
- backoff_delay = min(2**existing_retries, self._max_backoff)
521
+ if self._max_retries > 0 and existing_retries >= self._max_retries:
522
+ raise RuntimeError(f"Max retry attempts ({self._max_retries}) reached")
523
+ backoff_delay: int = min(2**existing_retries, self._max_backoff)
524
+ retry_attempt_num: int = existing_retries + 1
388
525
  logger.debug(
389
- f"Retry #: {existing_retries} of max_retries: {self.max_retries} | "
390
- f"current backoff_delay: {backoff_delay}s of max_backoff: {self._max_backoff}s"
526
+ f"Operation failed. Retrying attempt "
527
+ f"{retry_attempt_num}/{self._max_retries if self._max_retries > 0 else 'infinite'} "
528
+ f"in {backoff_delay:.2f}s..."
391
529
  )
392
-
393
- if self.max_retries > 0 and existing_retries < self.max_retries:
394
- logger.error(f"Operation failed, retrying in {backoff_delay}s...")
395
- time.sleep(backoff_delay)
396
- return existing_retries + 1
397
- else:
398
- raise RuntimeError(f"Max retry attempts of {self.max_retries} reached")
530
+ time.sleep(backoff_delay)
531
+ return retry_attempt_num