nv-ingest-api 2025.4.17.dev20250417__py3-none-any.whl → 2025.4.18.dev20250418__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of nv-ingest-api might be problematic.

@@ -5,53 +5,40 @@
 import json
 import logging
 import time
-from typing import Any
+import random
+from typing import Any, Callable, Union
 from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Tuple
-from typing import Union

 import redis
-from redis.exceptions import RedisError

-from nv_ingest_api.util.service_clients.client_base import MessageBrokerClientBase
+
+from nv_ingest_api.util.service_clients.client_base import MessageBrokerClientBase, FetchMode
+
+try:
+    from diskcache import Cache
+
+    DISKCACHE_AVAILABLE = True
+except ImportError:
+    DISKCACHE_AVAILABLE = False

 # pylint: skip-file

 logger = logging.getLogger(__name__)

+# Default cache path and TTL (adjust as needed)
+DEFAULT_CACHE_DIR = "/tmp/.fetch_cache"
+DEFAULT_CACHE_TTL_SECONDS = 3600  # 1 hour
+

 class RedisClient(MessageBrokerClientBase):
     """
     A client for interfacing with Redis, providing mechanisms for sending and receiving messages
-    with retry logic and connection management.
-
-    Parameters
-    ----------
-    host : str
-        The hostname of the Redis server.
-    port : int
-        The port number of the Redis server.
-    db : int, optional
-        The database number to connect to. Default is 0.
-    max_retries : int, optional
-        The maximum number of retry attempts for operations. Default is 0 (no retries).
-    max_backoff : int, optional
-        The maximum backoff delay between retries in seconds. Default is 32 seconds.
-    connection_timeout : int, optional
-        The timeout in seconds for connecting to the Redis server. Default is 300 seconds.
-    max_pool_size : int, optional
-        The maximum number of connections in the Redis connection pool. Default is 128.
-    use_ssl : bool, optional
-        Specifies if SSL should be used for the connection. Default is False.
-    redis_allocator : Any, optional
-        The Redis client allocator, allowing for custom Redis client instances. Default is redis.Redis.
-
-    Attributes
-    ----------
-    client : Any
-        The Redis client instance used for operations.
+    with retry logic, connection management, configurable fetch modes, and optional local caching.
+
+    Handles message fragmentation transparently during fetch operations.
     """

     def __init__(
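
Note: FetchMode is imported from client_base above but is not defined in this file. Judging from the members referenced throughout this diff, it is presumably an enum along these lines (an illustrative sketch, not the package's actual definition):

    from enum import Enum, auto

    class FetchMode(Enum):
        # Presumed members, inferred from usage elsewhere in this diff.
        DESTRUCTIVE = auto()          # BLPOP: the message is removed as it is read
        NON_DESTRUCTIVE = auto()      # LINDEX/LLEN/LRANGE polling: the list is left intact
        CACHE_BEFORE_DELETE = auto()  # destructive read, with the result cached to disk first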
@@ -59,105 +46,248 @@ class RedisClient(MessageBrokerClientBase):
         host: str,
         port: int,
         db: int = 0,
-        max_retries: int = 0,
+        max_retries: int = 3,
         max_backoff: int = 32,
         connection_timeout: int = 300,
         max_pool_size: int = 128,
         use_ssl: bool = False,
-        redis_allocator: Any = redis.Redis,  # Type hint as 'Any' due to dynamic nature
-    ):
-        self._host = host
-        self._port = port
-        self._db = db
-        self._max_retries = max_retries
-        self._max_backoff = max_backoff
-        self._connection_timeout = connection_timeout
-        self._use_ssl = use_ssl
-        self._pool = redis.ConnectionPool(
-            host=self._host,
-            port=self._port,
-            db=self._db,
-            socket_connect_timeout=self._connection_timeout,
-            max_connections=max_pool_size,
-        )
-        self._redis_allocator = redis_allocator
-        self._client = self._redis_allocator(connection_pool=self._pool)
-        self._retries = 0
+        redis_allocator: Callable[..., redis.Redis] = redis.Redis,
+        fetch_mode: "FetchMode" = None,  # Replace with appropriate default if FetchMode.DESTRUCTIVE is available.
+        cache_config: Optional[Dict[str, Any]] = None,
+        message_ttl_seconds: Optional[int] = 600,
+    ) -> None:
+        """
+        Initializes the Redis client with connection pooling, retry/backoff configuration,
+        and optional caching for non-destructive or hybrid fetch modes.
+
+        Parameters
+        ----------
+        host : str
+            The Redis server hostname or IP address.
+        port : int
+            The Redis server port.
+        db : int, optional
+            The Redis logical database to use. Default is 0.
+        max_retries : int, optional
+            Maximum number of retries allowed for operations. Default is 3.
+        max_backoff : int, optional
+            Maximum backoff in seconds for retry delays. Default is 32.
+        connection_timeout : int, optional
+            Timeout in seconds for establishing a Redis connection. Default is 300.
+        max_pool_size : int, optional
+            Maximum size of the Redis connection pool. Default is 128.
+        use_ssl : bool, optional
+            Whether to use SSL for the connection. Default is False.
+        redis_allocator : Callable[..., redis.Redis], optional
+            Callable that returns a Redis client instance. Default is redis.Redis.
+        fetch_mode : FetchMode, optional
+            Fetch mode configuration (e.g., DESTRUCTIVE, NON_DESTRUCTIVE, CACHE_BEFORE_DELETE).
+            Default should be set appropriately (e.g., FetchMode.DESTRUCTIVE).
+        cache_config : dict, optional
+            Configuration dictionary for local caching, e.g., {"directory": "/path/to/cache", "ttl": 7200}.
+        message_ttl_seconds : int, optional
+            TTL (in seconds) for messages in NON_DESTRUCTIVE mode. If not provided,
+            messages may persist indefinitely.
+
+        Returns
+        -------
+        None
+        """
+        self._host: str = host
+        self._port: int = port
+        self._db: int = db
+        self._max_retries: int = max_retries
+        self._max_backoff: int = max_backoff
+        self._connection_timeout: int = connection_timeout
+        self._use_ssl: bool = use_ssl  # TODO: Implement SSL specifics.
+        # If no fetch_mode is provided, assume a default value.
+        self._fetch_mode: "FetchMode" = fetch_mode if fetch_mode is not None else FetchMode.DESTRUCTIVE
+        self._message_ttl_seconds: Optional[int] = message_ttl_seconds
+        self._redis_allocator: Callable[..., redis.Redis] = redis_allocator
+
+        if self._fetch_mode == FetchMode.NON_DESTRUCTIVE and message_ttl_seconds is None:
+            logger.warning(
+                "FetchMode.NON_DESTRUCTIVE selected without setting message_ttl_seconds. "
+                "Messages fetched non-destructively may persist indefinitely in Redis."
+            )
+
+        # Configure Connection Pool
+        pool_kwargs: Dict[str, Any] = {
+            "host": self._host,
+            "port": self._port,
+            "db": self._db,
+            "socket_connect_timeout": self._connection_timeout,
+            "max_connections": max_pool_size,
+        }
+        if self._use_ssl:
+            pool_kwargs["ssl"] = True
+            pool_kwargs["ssl_cert_reqs"] = None  # Or specify requirements as needed.
+            logger.debug("Redis connection configured with SSL.")
+
+        self._pool: redis.ConnectionPool = redis.ConnectionPool(**pool_kwargs)
+
+        # Allocate initial client
+        self._client: Optional[redis.Redis] = self._redis_allocator(connection_pool=self._pool)
+
+        # Configure Cache if mode requires it
+        self._cache: Optional[Any] = None
+        if self._fetch_mode == FetchMode.CACHE_BEFORE_DELETE and DISKCACHE_AVAILABLE:
+            cache_dir: str = (cache_config or {}).get("directory", DEFAULT_CACHE_DIR)
+            self._cache_ttl: int = (cache_config or {}).get("ttl", DEFAULT_CACHE_TTL_SECONDS)
+            try:
+                # TODO: make size_limit configurable
+                self._cache = Cache(cache_dir, timeout=self._cache_ttl, size_limit=int(50e9))
+                logger.debug(f"Fetch cache enabled: mode={self._fetch_mode}, dir={cache_dir}, ttl={self._cache_ttl}s")
+            except Exception as e:
+                logger.exception(f"Failed to initialize disk cache at {cache_dir}. Caching disabled. Error: {e}")
+                self._fetch_mode = FetchMode.DESTRUCTIVE
+                logger.warning("Falling back to FetchMode.DESTRUCTIVE due to cache init failure.")
+
+        # Validate max_retries on init using setter
+        self.max_retries = max_retries

     def _connect(self) -> None:
         """
-        Attempts to reconnect to the Redis server if the current connection is not responsive.
+        Attempts to reconnect to the Redis server by allocating a new client from the pool.
+
+        Returns
+        -------
+        None
+
+        Raises
+        ------
+        ConnectionError
+            If the newly allocated client fails to respond to a ping.
         """
-        if not self.ping():
-            logger.debug("Reconnecting to Redis")
+        logger.debug("Attempting to reconnect to Redis by re-allocating client.")
+        try:
             self._client = self._redis_allocator(connection_pool=self._pool)
+            if not self.ping():
+                raise ConnectionError("Re-allocated client failed to ping.")
+            logger.info("Successfully reconnected to Redis.")
+        except Exception as e:
+            logger.error(f"Failed to reconnect to Redis: {e}")
+            self._client = None

     @property
     def max_retries(self) -> int:
+        """
+        Gets the maximum number of allowed retries for Redis operations.
+
+        Returns
+        -------
+        int
+            The maximum number of retries.
+        """
         return self._max_retries

     @max_retries.setter
     def max_retries(self, value: int) -> None:
+        """
+        Sets the maximum number of allowed retries for Redis operations.
+
+        Parameters
+        ----------
+        value : int
+            The new maximum retries value; must be a non-negative integer.
+
+        Raises
+        ------
+        ValueError
+            If the value is not a non-negative integer.
+        """
+        if not isinstance(value, int) or value < 0:
+            raise ValueError("max_retries must be a non-negative integer.")
         self._max_retries = value

-    def get_client(self) -> Any:
+    def get_client(self) -> redis.Redis:
         """
-        Returns a Redis client instance, reconnecting if necessary.
+        Returns a Redis client instance, attempting reconnection if the current client is invalid.

         Returns
         -------
-        Any
-            The Redis client instance.
+        redis.Redis
+            The active Redis client instance.
+
+        Raises
+        ------
+        RuntimeError
+            If no valid client can be established.
         """
-        if self._client is None or not self.ping():
-            self._connect()
+        if self._client is None:
+            logger.info("Redis client is None, attempting to connect.")
+            try:
+                self._connect()
+            except Exception as connect_err:
+                logger.error(f"Error during _connect attempt: {connect_err}")
+                self._client = None
+
+        if self._client is None:
+            raise RuntimeError("Failed to establish or re-establish connection to Redis.")
+
         return self._client

     def ping(self) -> bool:
         """
-        Checks if the Redis server is responsive.
+        Checks if the Redis client connection is alive by issuing a PING command.

         Returns
         -------
         bool
-            True if the server responds to a ping, False otherwise.
+            True if the ping is successful, False otherwise.
         """
+        if self._client is None:
+            logger.debug("Ping check: No client instance exists.")
+            return False
         try:
-            self._client.ping()
-            return True
-        except (RedisError, AttributeError):
+            is_alive: bool = self._client.ping()
+            if is_alive:
+                logger.debug("Ping successful.")
+                return True
+            else:
+                logger.warning("Ping command returned non-True value unexpectedly.")
+                self._client = None
+                return False
+        except (OSError, AttributeError) as e:
+            logger.warning(f"Ping failed, invalidating client connection: ({type(e).__name__}) {e}")
+            self._client = None
+            return False
+        except redis.RedisError as e:
+            logger.warning(f"Ping failed due to RedisError: {e}. Invalidating client.")
+            self._client = None
+            return False
+        except Exception as e:
+            logger.exception(f"Unexpected error during ping, invalidating client: {e}")
+            self._client = None
             return False

     def _check_response(
         self, channel_name: str, timeout: float
     ) -> Tuple[Optional[Dict[str, Any]], Optional[int], Optional[int]]:
         """
-        Checks for a response from the Redis queue and processes it into a message, fragment, and fragment count.
+        Checks for a response from a Redis queue and processes it into a message and its fragmentation metadata.

         Parameters
         ----------
         channel_name : str
-            The name of the Redis channel from which to receive the response.
+            The Redis channel from which to retrieve the response.
         timeout : float
-            The time in seconds to wait for a response from the Redis queue before timing out.
+            The time in seconds to wait for a response.

         Returns
         -------
-        Tuple[Optional[Dict[str, Any]], Optional[int], Optional[int]]
-            A tuple containing:
-            - message: A dictionary containing the decoded message if successful,
-              or None if no message was retrieved.
-            - fragment: An integer representing the fragment number of the message,
-              or None if no fragment was found.
-            - fragment_count: An integer representing the total number of message fragments,
-              or None if no fragment count was found.
+        tuple of (Optional[Dict[str, Any]], Optional[int], Optional[int])
+            - The decoded message as a dictionary, or None if not retrieved.
+            - The fragment number (default 0 if absent), or None.
+            - The total number of fragments, or None.

         Raises
         ------
+        TimeoutError
+            If no response is received within the specified timeout.
         ValueError
-            If the message retrieved from Redis cannot be decoded from JSON.
+            If the message cannot be decoded from JSON.
         """
-
         response = self.get_client().blpop([channel_name], timeout)
         if response is None:
             raise TimeoutError("No response was received in the specified timeout period")
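
The reworked constructor above wires together the fetch mode, message TTL, and optional disk cache. A hedged usage sketch (host, port, and the cache directory are placeholder values, not defaults mandated by the package):

    client = RedisClient(
        host="localhost",
        port=6379,
        max_retries=3,
        fetch_mode=FetchMode.CACHE_BEFORE_DELETE,
        cache_config={"directory": "/tmp/.fetch_cache", "ttl": 7200},
        message_ttl_seconds=600,
    )
    assert client.ping()  # verify the pooled connection is alive

Note that if diskcache is unavailable or the cache fails to initialize, the constructor falls back to FetchMode.DESTRUCTIVE rather than raising.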
@@ -165,9 +295,8 @@ class RedisClient(MessageBrokerClientBase):
         if len(response) > 1 and response[1]:
             try:
                 message = json.loads(response[1])
-                fragment = message.get("fragment", 0)
-                fragment_count = message.get("fragment_count", 1)
-
+                fragment: int = message.get("fragment", 0)
+                fragment_count: int = message.get("fragment_count", 1)
                 return message, fragment, fragment_count
             except json.JSONDecodeError as e:
                 logger.error(f"Failed to decode message: {e}")
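
_check_response and the new fetch helpers all read the same envelope keys ("fragment", "fragment_count", "data"). Inferred from this diff, a two-fragment message would look roughly like the following (values are illustrative):

    fragment_0 = {
        "fragment": 0, "fragment_count": 2,
        "status": "success", "description": "...",
        "data": ["item-1", "item-2"],
    }
    fragment_1 = {"fragment": 1, "fragment_count": 2, "data": ["item-3"]}
    # _combine_fragments (below) sorts by "fragment", copies the first fragment's
    # metadata ("status", "description", "trace", "annotations"), and concatenates
    # the "data" lists.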
@@ -175,160 +304,520 @@ class RedisClient(MessageBrokerClientBase):

         return None, None, None

-    def fetch_message(self, channel_name: str, timeout: float = 10) -> Optional[Union[str, Dict]]:
+    def _fetch_first_or_all_fragments_destructive(
+        self, channel_name: str, timeout: float
+    ) -> Union[Dict[str, Any], List[Dict[str, Any]]]:
         """
-        Fetches a message from the specified queue with retries on failure. If the message is fragmented, it will
-        continue fetching fragments until all parts have been collected.
+        Fetches message fragments destructively using BLPOP, returning either a single message
+        or a list of fragments if the message is split.

         Parameters
         ----------
-        channel_name: str
-            Channel to fetch the message from.
+        channel_name : str
+            The Redis list key from which to pop the message.
         timeout : float
-            The timeout in seconds for blocking until a message is available. If we receive a multi-part message,
-            this value will be temporarily extended in order to collect all fragments.
+            The timeout in seconds for the BLPOP command.

         Returns
         -------
-        Optional[str or Dict]
-            The full fetched message, or None if no message could be fetched after retries.
+        dict or list of dict
+            If the message is not fragmented, returns a single dictionary.
+            If fragmented, returns a list of dictionaries representing each fragment.

         Raises
         ------
+        TimeoutError
+            If the initial BLPOP times out or if subsequent fragments are not retrieved within the allotted time.
         ValueError
-            If fetching the message fails after the specified number of retries or due to other critical errors.
+            If JSON decoding fails or if fragment indices are inconsistent.
         """
-        accumulated_time = 0
-        collected_fragments = []
-        fragment_count = None
-        retries = 0
+        fragments: List[Dict[str, Any]] = []
+        expected_count: int = 1
+        first_message: Optional[Dict[str, Any]] = None
+        accumulated_fetch_time: float = 0.0

-        logger.debug(f"Starting fetch_message on channel '{channel_name}' with timeout {timeout}s.")
+        logger.debug(f"Destructive fetch: Popping first item from '{channel_name}' with timeout {timeout:.2f}s")
+        start_pop_time: float = time.monotonic()
+        response = self.get_client().blpop([channel_name], timeout=int(max(1, timeout)))
+        fetch_duration: float = time.monotonic() - start_pop_time

-        while True:
+        if response is None:
+            logger.debug(f"BLPOP timed out on '{channel_name}', no message available.")
+            raise TimeoutError("No message received within the initial timeout period")
+
+        if len(response) > 1 and response[1]:
+            message_bytes = response[1]
             try:
-                # Attempt to fetch a message from the Redis queue
-                message, fragment, fragment_count = self._check_response(channel_name, timeout)
-                logger.debug(f"Fetched fragment: {fragment} (fragment_count: {fragment_count}).")
-
-                if message is not None:
-                    if fragment_count == 1:
-                        return message
-
-                    collected_fragments.append(message)
-                    logger.debug(f"Collected {len(collected_fragments)} of {fragment_count} fragments so far.")
-
-                    # If we have collected all fragments, combine and return
-                    if len(collected_fragments) == fragment_count:
-                        logger.debug("All fragments received. Sorting and combining fragments.")
-                        # Sort fragments by the 'fragment' field to ensure correct order
-                        collected_fragments.sort(key=lambda x: x["fragment"])
-                        reconstructed_message = self._combine_fragments(collected_fragments)
-                        logger.debug("Message reconstructed successfully. Returning combined message.")
-                        return reconstructed_message
+                first_message = json.loads(message_bytes)
+                expected_count = first_message.get("fragment_count", 1)
+                fragment_idx: int = first_message.get("fragment", 0)
+                if expected_count == 1:
+                    logger.debug(f"Fetched single (non-fragmented) message from '{channel_name}'.")
+                    return first_message
+                logger.info(
+                    f"Fetched fragment {fragment_idx + 1}/{expected_count} from '{channel_name}'. "
+                    f"Need to fetch remaining."
+                )
+                if fragment_idx != 0:
+                    logger.error(
+                        f"Expected first fragment (index 0) but got {fragment_idx} from '{channel_name}'. "
+                        f"Aborting fetch."
+                    )
+                    raise ValueError(f"First fragment fetched was index {fragment_idx}, expected 0.")
+                fragments.append(first_message)
+                accumulated_fetch_time += fetch_duration
+
+                remaining_timeout: float = max(0.1, timeout - accumulated_fetch_time)
+                for i in range(1, expected_count):
+                    start_frag_pop_time: float = time.monotonic()
+                    frag_timeout: float = max(1, remaining_timeout / max(1, expected_count - i))
+                    logger.debug(f"Popping fragment {i + 1}/{expected_count} with timeout {frag_timeout:.2f}s")
+                    frag_response = self.get_client().blpop([channel_name], timeout=int(frag_timeout))
+                    frag_fetch_duration: float = time.monotonic() - start_frag_pop_time
+                    accumulated_fetch_time += frag_fetch_duration
+                    remaining_timeout = max(0, timeout - accumulated_fetch_time)
+                    if frag_response is None:
+                        logger.error(f"Timeout waiting for fragment {i + 1}/{expected_count} on '{channel_name}'.")
+                        raise TimeoutError(f"Timeout collecting fragments for {channel_name}")
+                    if len(frag_response) > 1 and frag_response[1]:
+                        frag_bytes = frag_response[1]
+                        try:
+                            frag_message = json.loads(frag_bytes)
+                            fragments.append(frag_message)
+                        except json.JSONDecodeError as e_frag:
+                            logger.error(
+                                f"Failed to decode fragment {i + 1} JSON from '{channel_name}': {e_frag}. "
+                                f"Data: {frag_bytes[:200]}"
+                            )
+                            raise ValueError(f"Failed to decode message fragment {i + 1}: {e_frag}")
+                    else:
+                        logger.error(
+                            f"Unexpected BLPOP response format for fragment {i + 1} "
+                            f"on '{channel_name}': {frag_response}"
+                        )
+                        raise ValueError(f"Unexpected BLPOP response format for fragment {i + 1}")
+                logger.debug(f"Successfully fetched all {expected_count} fragments destructively.")
+                return fragments
+            except json.JSONDecodeError as e:
+                logger.error(
+                    f"Failed to decode first message JSON from '{channel_name}': {e}. Data: {message_bytes[:200]}"
+                )
+                raise ValueError(f"Failed to decode first message: {e}") from e
+        else:
+            logger.warning(f"BLPOP for '{channel_name}' returned unexpected response format: {response}")
+            raise ValueError("Unexpected response format from BLPOP")
+
+    def _fetch_fragments_non_destructive(self, channel_name: str, timeout: float) -> List[Dict[str, Any]]:
+        """
+        Fetches all message fragments non-destructively by polling the Redis list. Uses LINDEX,
+        LLEN, and LRANGE to collect fragments, respecting a total timeout.
+
+        Parameters
+        ----------
+        channel_name : str
+            The Redis list key where fragments are stored.
+        timeout : float
+            The total allowed time in seconds for collecting all fragments.
+
+        Returns
+        -------
+        List[Dict[str, Any]]
+            A list of unique fragment dictionaries.
+
+        Raises
+        ------
+        TimeoutError
+            If the overall timeout is exceeded before all expected fragments are collected.
+        ValueError
+            If JSON decoding fails or inconsistent fragment counts are detected.
+        ConnectionError
+            If the Redis connection fails.
+        redis.RedisError
+            For other Redis-related errors.
+        """
+        start_time: float = time.monotonic()
+        polling_delay: float = 0.1
+        expected_count: Optional[int] = None
+        fragments_map: Dict[int, Dict[str, Any]] = {}
+
+        logger.debug(f"Starting non-destructive fetch for '{channel_name}' with total timeout {timeout:.2f}s.")
+
+        while True:
+            current_time: float = time.monotonic()
+            elapsed_time: float = current_time - start_time
+            if elapsed_time > timeout:
+                logger.warning(f"Overall timeout ({timeout}s) exceeded for non-destructive fetch of '{channel_name}'.")
+                if expected_count:
+                    raise TimeoutError(
+                        f"Timeout collecting fragments for {channel_name}. "
+                        f"Collected {len(fragments_map)}/{expected_count}."
+                    )
                 else:
-                    logger.debug("Received empty response; returning None.")
-                    return message
+                    raise TimeoutError(f"Timeout waiting for initial fragment 0 for {channel_name}.")

-            except TimeoutError:
-                # When fragments are expected but not all received before timeout
-                if fragment_count and fragment_count > 1:
-                    accumulated_time += timeout
+            client = self.get_client()
+            try:
+                if expected_count is None:
+                    logger.debug(f"Polling for fragment 0 on '{channel_name}'. Elapsed: {elapsed_time:.2f}s")
+                    frag0_bytes: Optional[bytes] = client.lindex(channel_name, 0)
+                    if frag0_bytes is not None:
+                        try:
+                            message = json.loads(frag0_bytes)
+                            fragment_idx: int = message.get("fragment", -1)
+                            current_expected: int = message.get("fragment_count", 1)
+                            if fragment_idx == 0:
+                                logger.debug(
+                                    f"Found fragment 0 for '{channel_name}'. "
+                                    f"Expecting {current_expected} total fragments."
+                                )
+                                expected_count = current_expected
+                                if fragment_idx not in fragments_map:
+                                    fragments_map[fragment_idx] = message
+                                if expected_count == 1:
+                                    logger.debug("Single fragment expected and found. Fetch complete.")
+                                    break
+                            else:
+                                logger.warning(
+                                    f"Expected fragment 0 but found index {fragment_idx} "
+                                    f"at LINDEX 0 for '{channel_name}'. List state potentially inconsistent. "
+                                    f"Will keep polling."
+                                )
+                        except json.JSONDecodeError as e:
+                            logger.error(
+                                f"Failed to decode JSON at index 0 for '{channel_name}': {e}. Data: {frag0_bytes[:200]}"
+                            )
+                            raise ValueError(f"Failed to decode potential fragment 0: {e}")
+
+                if expected_count is not None and len(fragments_map) < expected_count:
+                    current_len: int = client.llen(channel_name)
                     logger.debug(
-                        f"Timeout occurred waiting for fragments. "
-                        f"Accumulated timeout: {accumulated_time}s (Threshold: {timeout * fragment_count}s)."
+                        f"Polling '{channel_name}': Current length {current_len}, "
+                        f"have {len(fragments_map)}/{expected_count} fragments. Elapsed: {elapsed_time:.2f}s"
                     )
-                    if accumulated_time >= (timeout * fragment_count):
-                        err_msg = f"Failed to reconstruct message from {channel_name} after {accumulated_time} sec."
-                        logger.error(err_msg)
-                        raise ValueError(err_msg)
-                else:
-                    raise  # This is expected in many cases, so re-raise it
+                    if current_len >= expected_count:
+                        fetch_end_index: int = expected_count - 1
+                        logger.debug(f"Fetching full expected range: LRANGE 0 {fetch_end_index}")
+                        raw_potential_fragments: List[bytes] = client.lrange(channel_name, 0, fetch_end_index)
+                        processed_count_this_pass: int = 0
+                        for item_bytes in raw_potential_fragments:
+                            try:
+                                message = json.loads(item_bytes)
+                                fragment_idx: int = message.get("fragment", -1)
+                                current_expected_in_frag: int = message.get("fragment_count", 1)
+                                if current_expected_in_frag != expected_count:
+                                    logger.error(
+                                        f"Inconsistent fragment_count in fragment {fragment_idx} for '{channel_name}' "
+                                        f"({current_expected_in_frag} vs expected {expected_count})."
+                                    )
+                                    raise ValueError("Inconsistent fragment count detected in list")
+                                if 0 <= fragment_idx < expected_count and fragment_idx not in fragments_map:
+                                    fragments_map[fragment_idx] = message
+                                    processed_count_this_pass += 1
+                                    logger.debug(f"Processed fragment {fragment_idx + 1}/{expected_count} from LRANGE.")
+                            except json.JSONDecodeError as e:
+                                logger.error(
+                                    f"Failed to decode JSON fragment during poll for "
+                                    f"'{channel_name}': {e}. Data: {item_bytes[:200]}"
+                                )
+                                raise ValueError(f"Failed to decode message fragment: {e}")
+                        if processed_count_this_pass > 0:
+                            logger.debug(f"Found {processed_count_this_pass} new fragments this pass.")
+                if len(fragments_map) == expected_count:
+                    logger.debug(f"Collected all {expected_count} expected fragments for '{channel_name}'.")
+                    break
+                if expected_count is None or len(fragments_map) < expected_count:
+                    time.sleep(polling_delay)
+            except (ValueError, json.JSONDecodeError) as e:
+                logger.error(f"Validation or decoding error during non-destructive fetch for '{channel_name}': {e}")
+                raise e
+            except (redis.RedisError, ConnectionError) as e:
+                logger.warning(
+                    f"Redis/Connection error during non-destructive poll for '{channel_name}': {e}. Propagating up."
+                )
+                raise e
+            except Exception as e:
+                logger.exception(f"Unexpected error during non-destructive poll for '{channel_name}': {e}")
+                raise RuntimeError(f"Unexpected polling error: {e}") from e

-            except RedisError as err:
-                retries += 1
-                logger.error(f"Redis error during fetch: {err}")
-                backoff_delay = min(2**retries, self._max_backoff)
+        if expected_count is None or len(fragments_map) != expected_count:
+            logger.error(
+                f"Exited non-destructive fetch loop for '{channel_name}' but collection is incomplete. "
+                f"Have {len(fragments_map)}/{expected_count}. This should not happen."
+            )
+            raise RuntimeError(f"Internal logic error: Incomplete fragment collection for {channel_name}")
+
+        fragment_list: List[Dict[str, Any]] = list(fragments_map.values())
+        logger.debug(f"Successfully collected {len(fragment_list)} fragments for '{channel_name}' non-destructively.")
+        return fragment_list
+
+    def _fetch_fragments_cached(self, channel_name: str, timeout: float) -> List[Dict[str, Any]]:
+        """
+        Attempts to retrieve cached message fragments; if unsuccessful, fetches destructively from Redis
+        and writes the result to cache.
+
+        Parameters
+        ----------
+        channel_name : str
+            The Redis channel key to fetch the message from.
+        timeout : float
+            The timeout in seconds for fetching from Redis.
+
+        Returns
+        -------
+        List[Dict[str, Any]]
+            A list of message fragments retrieved either from cache or Redis.
+
+        Raises
+        ------
+        RuntimeError
+            If caching is not configured.
+        NotImplementedError
+            If caching of fragments is not implemented.
+        """
+        if not self._cache:
+            logger.error("Cache is not configured or failed to initialize. Cannot use CACHE_BEFORE_DELETE mode.")
+            raise RuntimeError("Cache not available for cached fetch mode.")
+
+        cache_key: str = f"fetch_cache:{channel_name}"
+        try:
+            cached_result = self._cache.get(cache_key)
+            if cached_result is not None:
+                logger.debug(f"Cache hit for '{channel_name}'. Returning cached data.")
+                self._cache.delete(cache_key)
+                # TODO: Decide on final caching design.
+                raise NotImplementedError("Caching fragments is complex; cache final result instead.")
+        except Exception as e:
+            logger.exception(f"Error accessing cache for '{channel_name}': {e}. Proceeding to Redis fetch.")
+
+        logger.debug(f"Cache miss for '{channel_name}'. Fetching destructively from Redis.")
+        fragments = self._fetch_first_or_all_fragments_destructive(channel_name, timeout)
+        try:
+            self._cache.set(cache_key, fragments, expire=self._cache_ttl)
+            logger.debug(f"Stored fetched fragments for '{channel_name}' in cache.")
+        except Exception as e:
+            logger.exception(f"Failed to write fragments for '{channel_name}' to cache: {e}")
+        return fragments
+
+    def fetch_message(
+        self, channel_name: str, timeout: float = 10, override_fetch_mode: Optional["FetchMode"] = None
+    ) -> Optional[Dict[str, Any]]:
+        """
+        Fetches a complete message from Redis. It handles fragmentation according to the specified
+        or configured fetch mode and retries on connection errors.
+
+        Parameters
+        ----------
+        channel_name : str
+            The Redis channel key from which to fetch the message.
+        timeout : float, optional
+            The timeout in seconds for fetching the message. Default is 10 seconds.
+        override_fetch_mode : FetchMode, optional
+            If provided, overrides the configured fetch mode for this operation.

+        Returns
+        -------
+        dict or None
+            The final reconstructed message dictionary if successful, or None if not found.
+
+        Raises
+        ------
+        TimeoutError
+            If fetching times out.
+        ValueError
+            If non-retryable errors occur or max retries are exceeded.
+        RuntimeError
+            For other runtime errors.
+        """
+        retries: int = 0
+        effective_fetch_mode: "FetchMode" = override_fetch_mode if override_fetch_mode is not None else self._fetch_mode
+        log_prefix: str = f"fetch_message(mode={effective_fetch_mode.name}, channel='{channel_name}')"
+        if override_fetch_mode:
+            logger.debug(f"{log_prefix}: Using overridden mode.")
+        else:
+            logger.debug(f"{log_prefix}: Using configured mode.")
+
+        if effective_fetch_mode == FetchMode.CACHE_BEFORE_DELETE and DISKCACHE_AVAILABLE:
+            if not self._cache:
+                raise RuntimeError(f"{log_prefix}: Cache not available.")
+
+            cache_key: str = f"fetch_cache:{channel_name}"
+            try:
+                cached_final_result = self._cache.get(cache_key)
+                if cached_final_result is not None:
+                    logger.info(f"{log_prefix}: Cache hit.")
+                    self._cache.delete(cache_key)
+                    return cached_final_result
+            except Exception as e:
+                logger.exception(f"{log_prefix}: Cache read error: {e}. Trying Redis.")
+
+        while True:
+            try:
+                fetch_result: Union[Dict[str, Any], List[Dict[str, Any]]]
+                if effective_fetch_mode == FetchMode.DESTRUCTIVE:
+                    fetch_result = self._fetch_first_or_all_fragments_destructive(channel_name, timeout)
+                elif effective_fetch_mode == FetchMode.NON_DESTRUCTIVE:
+                    fetch_result = self._fetch_fragments_non_destructive(channel_name, timeout)
+                elif effective_fetch_mode == FetchMode.CACHE_BEFORE_DELETE:
+                    fetch_result = self._fetch_first_or_all_fragments_destructive(channel_name, timeout)
+                else:
+                    raise ValueError(f"{log_prefix}: Unsupported fetch mode: {effective_fetch_mode}")
+
+                if isinstance(fetch_result, dict):
+                    logger.debug(f"{log_prefix}: Received single message directly.")
+                    final_message: Dict[str, Any] = fetch_result
+                elif isinstance(fetch_result, list):
+                    logger.debug(f"{log_prefix}: Received {len(fetch_result)} fragments, combining.")
+                    final_message = self._combine_fragments(fetch_result)
+                else:
+                    logger.error(f"{log_prefix}: Fetch helper returned unexpected type: {type(fetch_result)}")
+                    raise TypeError("Internal error: Unexpected fetch result type.")
+
+                if effective_fetch_mode == FetchMode.CACHE_BEFORE_DELETE and self._cache:
+                    cache_key = f"fetch_cache:{channel_name}"
+                    try:
+                        self._cache.set(cache_key, final_message, expire=self._cache_ttl)
+                        logger.info(f"{log_prefix}: Stored reconstructed message in cache.")
+                    except Exception as e:
+                        logger.exception(f"{log_prefix}: Cache write error: {e}")
+                return final_message
+
+            except TimeoutError as e:
+                logger.debug(f"{log_prefix}: Timeout during fetch operation: {e}")
+                raise e
+
+            except (redis.RedisError, ConnectionError) as e:
+                retries += 1
+                logger.warning(
+                    f"{log_prefix}: Redis/Connection error ({type(e).__name__}): {e}. "
+                    f"Attempt {retries}/{self.max_retries}"
+                )
+                self._client = None
                 if self.max_retries > 0 and retries <= self.max_retries:
-                    logger.error(f"Fetch attempt failed, retrying in {backoff_delay}s...")
-                    time.sleep(backoff_delay)
+                    backoff_delay: float = min(2 ** (retries - 1), self._max_backoff)
+                    jitter: float = random.uniform(0, backoff_delay * 0.2)
+                    sleep_time: float = backoff_delay + jitter
+                    logger.info(f"{log_prefix}: Retrying in {sleep_time:.2f}s...")
+                    time.sleep(sleep_time)
+                    continue
                 else:
-                    logger.error(f"Failed to fetch message from {channel_name} after {retries} attempts.")
-                    raise ValueError(f"Failed to fetch message from Redis queue after {retries} attempts: {err}")
+                    logger.error(f"{log_prefix}: Max retries ({self.max_retries}) exceeded. Last error: {e}")
+                    raise ValueError(f"Failed to fetch from Redis after {retries} attempts: {e}") from e

-                # Invalidate client to force reconnection on the next try
-                self._client = None
+            except (ValueError, RuntimeError, TypeError, NotImplementedError) as e:
+                logger.error(f"{log_prefix}: Non-retryable error during fetch: ({type(e).__name__}) {e}")
+                raise e

             except Exception as e:
-                # Handle non-Redis specific exceptions
-                logger.error(f"Unexpected error during fetch from {channel_name}: {e}")
-                raise ValueError(f"Unexpected error during fetch: {e}")
+                logger.exception(f"{log_prefix}: Unexpected error during fetch: {e}")
+                raise ValueError(f"Unexpected error during fetch: {e}") from e

     @staticmethod
-    def _combine_fragments(fragments: List[Dict[str, Any]]) -> Dict:
+    def _combine_fragments(fragments: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
-        Combines multiple message fragments into a single message by extending the 'data' elements,
-        retaining the 'status' and 'description' of the first fragment, and removing 'fragment' and 'fragment_counts'.
+        Combines a list of message fragments into a single message by merging shared metadata
+        and concatenating the fragment data lists.

         Parameters
         ----------
         fragments : List[Dict[str, Any]]
-            A list of fragments to be combined.
+            A list of fragment dictionaries containing at least a 'data' key and optional metadata.

         Returns
         -------
-        str
-            The combined message as a JSON string, containing 'status', 'description', and combined 'data'.
+        dict
+            A combined message dictionary.
+
+        Raises
+        ------
+        ValueError
+            If the fragments list is empty.
         """
         if not fragments:
-            raise ValueError("Fragments list is empty")
-
-        # Use 'status' and 'description' from the first fragment
-        combined_message = {
-            "status": fragments[0]["status"],
-            "description": fragments[0]["description"],
-            "data": [],
-            "trace": fragments[0].get("trace", {}),
-        }
+            raise ValueError("Cannot combine empty list of fragments")
+
+        fragments.sort(key=lambda x: x.get("fragment", 0))
+        combined_message: Dict[str, Any] = {"data": []}
+        first_frag: Dict[str, Any] = fragments[0]
+
+        for key in ["status", "description", "trace", "annotations"]:
+            if key in first_frag:
+                combined_message[key] = first_frag[key]

-        # Combine the 'data' elements from all fragments
         for fragment in fragments:
-            combined_message["data"].extend(fragment["data"])
+            fragment_data = fragment.get("data")
+            if isinstance(fragment_data, list):
+                combined_message["data"].extend(fragment_data)
+            else:
+                fragment_idx = fragment.get("fragment", "unknown")
+                logger.warning(f"Fragment {fragment_idx} missing 'data' list or has wrong type. Skipping its data.")

         return combined_message

-    def submit_message(self, channel_name: str, message: str) -> None:
+    def submit_message(
+        self,
+        channel_name: str,
+        message: str,
+        ttl_seconds: Optional[int] = None,
+    ) -> None:
         """
-        Submits a message to a specified Redis queue with retries on failure.
+        Submits a message to Redis using RPUSH and optionally sets a TTL on the channel key.

         Parameters
         ----------
         channel_name : str
-            The name of the queue to submit the message to.
+            The Redis list key (queue name) to which the message will be appended.
         message : str
-            The message to submit.
+            The message payload as a JSON string.
+        ttl_seconds : int, optional
+            Time-To-Live for the Redis key in seconds. If not provided, uses message_ttl_seconds.
+
+        Returns
+        -------
+        None

         Raises
         ------
-        RedisError
-            If submitting the message fails after the specified number of retries.
+        ValueError
+            If maximum retry attempts are exceeded.
+        ConnectionError
+            If there is a connection error with Redis.
+        redis.RedisError
+            For other non-recoverable Redis errors.
         """
-        retries = 0
+        retries: int = 0
+
         while True:
             try:
-                self.get_client().rpush(channel_name, message)
-                logger.debug(f"Message submitted to {channel_name}")
-                break
-            except RedisError as e:
-                logger.error(f"Failed to submit message, retrying... Error: {e}")
-                self._client = None  # Invalidate client to force reconnection
+                client: redis.Redis = self.get_client()
+                pipe = client.pipeline()
+                pipe.rpush(channel_name, message)
+                effective_ttl: Optional[int] = ttl_seconds if ttl_seconds is not None else self._message_ttl_seconds
+                if effective_ttl is not None and effective_ttl > 0:
+                    pipe.expire(channel_name, effective_ttl)
+                pipe.execute()
+                logger.debug(
+                    f"Message submitted to '{channel_name}'"
+                    + (f" with TTL {effective_ttl}s." if effective_ttl else ".")
+                )
+                return
+            except (redis.RedisError, ConnectionError) as e:
                 retries += 1
-                backoff_delay = min(2**retries, self._max_backoff)
-
-                if self.max_retries == 0 or retries < self.max_retries:
-                    logger.error(f"Submit attempt failed, retrying in {backoff_delay}s...")
-                    time.sleep(backoff_delay)
+                logger.warning(
+                    f"Redis/Connection error submitting to '{channel_name}': {e}. Attempt {retries}/{self.max_retries}"
+                )
+                self._client = None
+                if self.max_retries > 0 and retries <= self.max_retries:
+                    backoff_delay: float = min(2 ** (retries - 1), self._max_backoff)
+                    jitter: float = random.uniform(0, backoff_delay * 0.2)
+                    sleep_time: float = backoff_delay + jitter
+                    logger.debug(f"Retrying submit for '{channel_name}' in {sleep_time:.2f}s...")
+                    time.sleep(sleep_time)
+                    continue
                 else:
-                    logger.error(f"Failed to submit message to {channel_name} after {retries} attempts.")
-                    raise
+                    logger.error(
+                        f"Max retries ({self.max_retries}) exceeded submitting to '{channel_name}'. Last error: {e}"
+                    )
+                    raise ValueError(f"Failed to submit to Redis after {retries} attempts: {e}") from e
+            except Exception as e:
+                logger.exception(f"Unexpected error during submit to '{channel_name}': {e}")
+                raise ValueError(f"Unexpected error during submit: {e}") from e
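
Taken together, the new surface pairs an RPUSH-plus-EXPIRE pipeline on submit with mode-aware fetching. A rough end-to-end sketch (the channel name and payload are made up, and FetchMode members are assumed as sketched earlier):

    import json

    payload = json.dumps({"status": "success", "description": "demo", "data": ["example"]})
    client.submit_message("ingest:jobs:1234", payload, ttl_seconds=300)

    # DESTRUCTIVE pops the message as it reads; NON_DESTRUCTIVE polls the list
    # in place and relies on the key's TTL for eventual cleanup.
    result = client.fetch_message(
        "ingest:jobs:1234",
        timeout=10,
        override_fetch_mode=FetchMode.NON_DESTRUCTIVE,
    )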