nv-ingest-api 2025.10.22.dev20251022__py3-none-any.whl → 2025.11.2.dev20251102__py3-none-any.whl

This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the packages as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic.

Files changed (25)
  1. nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +4 -0
  2. nv_ingest_api/internal/primitives/nim/nim_client.py +124 -14
  3. nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +4 -2
  4. nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +10 -1
  5. nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +4 -2
  6. nv_ingest_api/internal/schemas/extract/extract_image_schema.py +4 -2
  7. nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +10 -1
  8. nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +6 -4
  9. nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +4 -2
  10. nv_ingest_api/internal/schemas/extract/extract_table_schema.py +9 -1
  11. nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +39 -0
  12. nv_ingest_api/internal/schemas/meta/metadata_schema.py +9 -0
  13. nv_ingest_api/internal/schemas/mixins.py +39 -0
  14. nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
  15. nv_ingest_api/internal/transform/embed_text.py +82 -0
  16. nv_ingest_api/util/dataloader/dataloader.py +20 -9
  17. nv_ingest_api/util/message_brokers/qos_scheduler.py +283 -0
  18. nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +1 -0
  19. nv_ingest_api/util/multi_processing/mp_pool_singleton.py +8 -2
  20. nv_ingest_api/util/service_clients/redis/redis_client.py +160 -0
  21. {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/METADATA +2 -1
  22. {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/RECORD +25 -23
  23. {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/WHEEL +0 -0
  24. {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/licenses/LICENSE +0 -0
  25. {nv_ingest_api-2025.10.22.dev20251022.dist-info → nv_ingest_api-2025.11.2.dev20251102.dist-info}/top_level.txt +0 -0
@@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor
 from functools import partial
 from typing import Any, Dict, Tuple, Optional, Iterable, List
 
+import glom
 import pandas as pd
 from openai import OpenAI
 
@@ -282,6 +283,33 @@ def _add_embeddings(row, embeddings, info_msgs):
     return row
 
 
+def _add_custom_embeddings(row, embeddings, result_target_field):
+    """
+    Updates a DataFrame row with embedding data and associated error info
+    based on a user supplied custom content field.
+
+    Parameters
+    ----------
+    row : pandas.Series
+        A row of the DataFrame.
+    embeddings : dict
+        Dictionary mapping row indices to embeddings.
+    result_target_field : str
+        The field in custom_content to output the embeddings to.
+
+    Returns
+    -------
+    pandas.Series
+        The updated row.
+    """
+    embedding = embeddings.get(row.name, None)
+
+    if embedding is not None:
+        row["metadata"] = glom.assign(row["metadata"], "custom_content." + result_target_field, embedding, missing=dict)
+
+    return row
+
+
 def _format_image_input_string(image_b64: Optional[str]) -> str:
     if not image_b64:
         return
@@ -381,6 +409,20 @@ def _get_pandas_audio_content(row, modality="text"):
     return row.get("audio_metadata", {}).get("audio_transcript")
 
 
+def _get_pandas_custom_content(row, custom_content_field):
+    custom_content = row.get("custom_content", {})
+    content = glom.glom(custom_content, custom_content_field, default=None)
+    if content is None:
+        logger.warning(f"Custom content field: {custom_content_field} not found")
+        return None
+
+    try:
+        return str(content)
+    except (TypeError, ValueError):
+        logger.warning(f"Cannot convert custom content field: {custom_content_field} to string")
+        return None
+
+
 # ------------------------------------------------------------------------------
 # Batch Processing Utilities
 # ------------------------------------------------------------------------------
@@ -519,6 +561,7 @@ def transform_create_text_embeddings_internal(
     api_key = task_config.get("api_key") or transform_config.api_key
     endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
     model_name = task_config.get("model_name") or transform_config.embedding_model
+    custom_content_field = task_config.get("custom_content_field") or transform_config.custom_content_field
 
     if execution_trace_log is None:
         execution_trace_log = {}
@@ -612,4 +655,43 @@ def transform_create_text_embeddings_internal(
         content_masks.append(content_mask)
 
     combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
+
+    # Embed custom content
+    if custom_content_field is not None:
+        result_target_field = task_config.get("result_target_field") or custom_content_field + "_embedding"
+
+        extracted_custom_content = (
+            combined_df["metadata"]
+            .apply(partial(_get_pandas_custom_content, custom_content_field=custom_content_field))
+            .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
+        )
+
+        valid_custom_content_mask = extracted_custom_content.notna()
+        if valid_custom_content_mask.any():
+            custom_content_list = extracted_custom_content[valid_custom_content_mask].to_list()
+            custom_content_batches = _generate_batches(custom_content_list, batch_size=transform_config.batch_size)
+
+            custom_content_embeddings = _async_runner(
+                custom_content_batches,
+                api_key,
+                endpoint_url,
+                model_name,
+                transform_config.encoding_format,
+                transform_config.input_type,
+                transform_config.truncate,
+                False,
+            )
+            custom_embeddings_dict = dict(
+                zip(
+                    extracted_custom_content.loc[valid_custom_content_mask].index,
+                    custom_content_embeddings.get("embeddings", []),
+                )
+            )
+        else:
+            custom_embeddings_dict = {}
+
+        combined_df = combined_df.apply(
+            _add_custom_embeddings, embeddings=custom_embeddings_dict, result_target_field=result_target_field, axis=1
+        )
+
     return combined_df, {"trace_info": execution_trace_log}
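For orientation, here is a minimal sketch of how the new custom-content path is exercised from the task side. The payload shape around it is an assumption for illustration, but the key names (custom_content_field, result_target_field), the "<field>_embedding" default, and the glom.assign write into metadata["custom_content"] follow the hunks above.

# Hypothetical "embed" task options; only the two new keys matter here.
task_config = {
    "custom_content_field": "title",           # read via glom from metadata["custom_content"]
    "result_target_field": "title_embedding",  # optional; defaults to custom_content_field + "_embedding"
}

# Where the result lands, mirroring _add_custom_embeddings above.
import glom

metadata = {"custom_content": {"title": "Quarterly report"}}
fake_embedding = [0.1, 0.2, 0.3]  # stand-in for the embedding service response
glom.assign(metadata, "custom_content." + task_config["result_target_field"], fake_embedding, missing=dict)
print(metadata["custom_content"]["title_embedding"])  # [0.1, 0.2, 0.3]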
@@ -254,22 +254,29 @@ else:
     file = None
     try:
         for file in paths:
+            if thread_stop.is_set():
+                return
             if isinstance(file, tuple):
                 video_file, audio_file = file
+                if thread_stop.is_set():
+                    return
                 with open(video_file, "rb") as f:
                     video = f.read()
+                if thread_stop.is_set():
+                    return
                 with open(audio_file, "rb") as f:
                     audio = f.read()
                 queue.put((video, audio))
             else:
-                if thread_stop:
+                if thread_stop.is_set():
                     return
                 with open(file, "rb") as f:
                     queue.put(f.read())
     except Exception as e:
         logging.error(f"Error processing file {file}: {e}")
         queue.put(RuntimeError(f"Error processing file {file}: {e}"))
-    queue.put(StopIteration)
+    finally:
+        queue.put(StopIteration)
 
 class DataLoader:
     """
@@ -290,7 +297,7 @@ else:
     ):
         interface = interface if interface else MediaInterface()
         self.thread = None
-        self.thread_stop = False
+        self.thread_stop = threading.Event()
         self.queue = queue.Queue(size)
         self.path = Path(path)
         self.output_dir = output_dir
@@ -323,16 +330,20 @@ else:
         Reset iterator by stopping the thread and clearing the queue.
         """
         if self.thread:
-            self.thread_stop = True
+            self.thread_stop.set()
             self.thread.join()
-            self.thread_stop = False
-            while self.queue.qsize() != 0:
-                with self.queue.mutex:
-                    self.queue.queue.clear()
+            self.thread = None
+            try:
+                while True:
+                    self.queue.get_nowait()
+            except Exception:
+                pass
+            finally:
+                self.thread_stop.clear()
 
     def __iter__(self):
         self.stop()
-        self.thread_stop = False
+        self.thread_stop.clear()
         self.thread = threading.Thread(
             target=load_data,
             args=(
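The switch from a plain boolean to threading.Event matters because the loader thread receives thread_stop when it starts: a boolean handed to the worker cannot be flipped later by rebinding an attribute on the owning object, whereas an Event is a shared object both sides observe. A generic sketch of that cooperative-stop pattern (the names here are illustrative, not the DataLoader API):

import threading
import time

def worker(stop_event: threading.Event) -> None:
    # Cooperative loop: exits promptly once the owner sets the shared event.
    while not stop_event.is_set():
        time.sleep(0.05)

stop = threading.Event()
t = threading.Thread(target=worker, args=(stop,))
t.start()
stop.set()    # visible inside the thread, unlike rebinding a bool attribute
t.join()
stop.clear()  # reusable for the next run, much as DataLoader.stop() clears it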
@@ -0,0 +1,283 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from typing import Dict, Optional
+import logging
+import time
+import random
+
+
+class _SchedulingStrategy:
+    """
+    Base scheduling strategy interface. Implementations must provide a non-blocking
+    single-sweep attempt over non-immediate queues and return a job or None.
+    """
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        raise NotImplementedError
+
+
+class _LotteryStrategy(_SchedulingStrategy):
+    """
+    Lottery scheduling with fixed weights.
+    Weights: micro=4, small=2, large=1, medium=1, default=1
+    """
+
+    def __init__(self, prioritize_immediate: bool = True) -> None:
+        self._weights: Dict[str, int] = {
+            "micro": 4,
+            "small": 2,
+            "large": 1,
+            "medium": 1,
+            "default": 1,
+        }
+        self._prioritize_immediate: bool = bool(prioritize_immediate)
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        # Immediate-first if enabled (non-blocking)
+        if self._prioritize_immediate:
+            try:
+                job = client.fetch_message(queues["immediate"], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                pass
+        candidates = list(order)
+        weights = [self._weights[q] for q in candidates]
+        while candidates:
+            try:
+                chosen = random.choices(candidates, weights=weights, k=1)[0]
+                job = client.fetch_message(queues[chosen], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                pass
+            finally:
+                idx = candidates.index(chosen)
+                del candidates[idx]
+                del weights[idx]
+        return None
+
+
+class _SimpleStrategy(_SchedulingStrategy):
+    """
+    Simple strategy placeholder. Actual simple-mode handling is done in QosScheduler.fetch_next
+    to directly fetch from the base 'default' queue using the provided timeout.
+    """
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        # Block up to 30s on the base/default queue and return first available job
+        try:
+            return client.fetch_message(queues["default"], 30.0)
+        except TimeoutError:
+            return None
+
+
+class _RoundRobinStrategy(_SchedulingStrategy):
+    """
+    Simple round-robin over non-immediate queues. Maintains rotation across calls.
+    """
+
+    def __init__(self, order: list[str], prioritize_immediate: bool = True) -> None:
+        self._order = list(order)
+        self._len = len(self._order)
+        self._idx = 0
+        self._prioritize_immediate: bool = bool(prioritize_immediate)
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        # Immediate-first if enabled (non-blocking)
+        if self._prioritize_immediate:
+            try:
+                job = client.fetch_message(queues["immediate"], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                pass
+        start_idx = self._idx
+        for step in range(self._len):
+            i = (start_idx + step) % self._len
+            qname = self._order[i]
+            try:
+                job = client.fetch_message(queues[qname], 0)
+                if job is not None:
+                    # advance rotation to the position after the chosen one
+                    self._idx = (i + 1) % self._len
+                    return job
+            except TimeoutError:
+                continue
+        return None
+
+
+class _WeightedRoundRobinStrategy(_SchedulingStrategy):
+    """
+    Smooth Weighted Round Robin (SWRR) using weights micro=4, small=2, large=1, medium=1, default=1.
+    Maintains current weights across calls.
+    """
+
+    def __init__(self, prioritize_immediate: bool = True) -> None:
+        self._weights: Dict[str, int] = {
+            "micro": 4,
+            "small": 2,
+            "large": 1,
+            "medium": 1,
+            "default": 1,
+        }
+        self._current: Dict[str, int] = {k: 0 for k in self._weights.keys()}
+        self._total: int = sum(self._weights.values())
+        self._prioritize_immediate: bool = bool(prioritize_immediate)
+
+    def try_once(self, client, queues: Dict[str, str], order: list[str]) -> Optional[dict]:
+        # Immediate-first if enabled (non-blocking)
+        if self._prioritize_immediate:
+            try:
+                job = client.fetch_message(queues["immediate"], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                pass
+        # Attempt up to len(order) selections per sweep, excluding queues that prove empty
+        active = list(order)
+        for _ in range(len(order)):
+            if not active:
+                break
+            for q in active:
+                self._current[q] += self._weights[q]
+            chosen = max(active, key=lambda q: self._current[q])
+            self._current[chosen] -= self._total
+            try:
+                job = client.fetch_message(queues[chosen], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                job = None
+            # If no job available from chosen, exclude it for the remainder of this sweep
+            if job is None and chosen in active:
+                active.remove(chosen)
+        # Fallback: single non-blocking attempt for each queue in order
+        for q in order:
+            try:
+                job = client.fetch_message(queues[q], 0)
+                if job is not None:
+                    return job
+            except TimeoutError:
+                continue
+        return None
+
+
+class QosScheduler:
+    """
+    Simplified scheduler that fetches jobs from the default queue only.
+    Uses the provided timeout value when polling the broker.
+    """
+
+    def __init__(
+        self,
+        base_queue: str,
+        total_buffer_capacity: int = 1,
+        num_prefetch_threads: int = 0,
+        prefetch_poll_interval: float = 0.0,
+        prefetch_non_immediate: bool = False,
+        strategy: str = "lottery",
+        prioritize_immediate: bool = True,
+    ) -> None:
+        self.base_queue = base_queue
+
+        # Define all derived queues; default behavior still uses only "default"
+        self.queues: Dict[str, str] = {
+            "default": f"{base_queue}",
+            "immediate": f"{base_queue}_immediate",
+            "micro": f"{base_queue}_micro",
+            "small": f"{base_queue}_small",
+            "medium": f"{base_queue}_medium",
+            "large": f"{base_queue}_large",
+        }
+
+        # Priority order for multi-queue fetching; "immediate" always first
+        self._priority_order = [
+            "immediate",
+            "micro",
+            "small",
+            "medium",
+            "large",
+            "default",
+        ]
+
+        # Non-immediate queue order reference
+        self._non_immediate_order = ["micro", "small", "large", "medium", "default"]
+
+        # Logger
+        self._logger = logging.getLogger(__name__)
+
+        # No prefetching - just direct calls
+        self._total_buffer_capacity: int = int(total_buffer_capacity)
+        self._num_prefetch_threads: int = int(num_prefetch_threads)
+        self._prefetch_poll_interval: float = float(prefetch_poll_interval)
+        self._prefetch_non_immediate: bool = bool(prefetch_non_immediate)
+
+        # Strategy selection
+        self._simple_mode: bool = False
+        if strategy == "simple":
+            self._strategy_impl: _SchedulingStrategy = _SimpleStrategy()
+            self._simple_mode = True
+        elif strategy == "round_robin":
+            self._strategy_impl = _RoundRobinStrategy(self._non_immediate_order, prioritize_immediate)
+        elif strategy == "weighted_round_robin":
+            self._strategy_impl = _WeightedRoundRobinStrategy(prioritize_immediate)
+        else:
+            self._strategy_impl = _LotteryStrategy(prioritize_immediate)
+
+    # Context manager helpers for clean shutdown
+    def __enter__(self) -> "QosScheduler":
+        return self
+
+    def __exit__(self, exc_type, exc, tb) -> None:
+        self.close()
+
+    # ---------------------------- Public API ----------------------------
+    def close(self) -> None:
+        """
+        Cleanly close the scheduler. No-op for the current implementation
+        since we do not spin background threads.
+        """
+        return None
+
+    def fetch_next(self, client, timeout: float = 0.0) -> Optional[dict]:
+        """
+        Immediate-first, then strategy-based scheduling among non-immediate queues.
+
+        Behavior:
+        - Always check 'immediate' first (non-blocking). If present, return immediately.
+        - If not, select using the configured strategy (lottery, round_robin, weighted_round_robin).
+        - If no job is found in a full pass:
+          - If timeout <= 0: return None.
+          - Else: sleep in 0.5s increments and retry until accumulated elapsed time >= timeout.
+        """
+        # Simple mode: delegate to the strategy (blocks up to 30s on base queue)
+        if getattr(self, "_simple_mode", False):
+            return self._strategy_impl.try_once(client, self.queues, self._non_immediate_order)
+
+        start = time.monotonic()
+        while True:
+            # Strategy-based attempt (strategy may include immediate priority internally)
+            job = self._strategy_impl.try_once(client, self.queues, self._non_immediate_order)
+            if job is not None:
+                return job
+
+            # No job found in this sweep
+            if timeout <= 0:
+                return None
+
+            elapsed = time.monotonic() - start
+            if elapsed >= timeout:
+                return None
+
+            # Sleep up to 0.5s, but not beyond remaining timeout
+            remaining = timeout - elapsed
+            sleep_time = 0.5 if remaining > 0.5 else remaining
+            if sleep_time > 0:
+                time.sleep(sleep_time)
+            else:
+                return None
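A sketch of driving the scheduler against a broker client. The only surface the strategies rely on is client.fetch_message(queue_name, timeout) returning a job dict or None (or raising TimeoutError when empty); the stub below stands in for the real RedisClient/SimpleClient and is purely illustrative.

from collections import defaultdict, deque

from nv_ingest_api.util.message_brokers.qos_scheduler import QosScheduler


class StubClient:
    """In-memory stand-in exposing the fetch_message(queue, timeout) surface used above."""

    def __init__(self) -> None:
        self.lists = defaultdict(deque)

    def fetch_message(self, queue_name, timeout=0):
        return self.lists[queue_name].popleft() if self.lists[queue_name] else None


client = StubClient()
client.lists["ingest_micro"].append({"job_id": "a"})
client.lists["ingest"].append({"job_id": "b"})

scheduler = QosScheduler(base_queue="ingest", strategy="weighted_round_robin")
print(scheduler.fetch_next(client, timeout=0))  # the micro queue wins the first SWRR pick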
@@ -35,6 +35,7 @@ class SimpleClient(MessageBrokerClientBase):
         connection_timeout: int = 300,
         max_pool_size: int = 128,
         use_ssl: bool = False,
+        api_version: str = "v1",
     ):
         """
         Initialize the SimpleClient with configuration parameters.
@@ -5,8 +5,9 @@
 
 import logging
 import math
-import multiprocessing as mp
 import os
+import sys
+import multiprocessing as mp
 from threading import Lock
 from typing import Any, Callable, Optional
 
@@ -103,7 +104,12 @@ class ProcessWorkerPoolSingleton:
             The total number of worker processes to start.
         """
         self._total_workers = total_max_workers
-        self._context: mp.context.ForkContext = mp.get_context("fork")
+
+        start_method = "fork"
+        if sys.platform.lower() == "darwin":
+            start_method = "spawn"
+        self._context: mp.context.ForkContext = mp.get_context(start_method)
+
         # Bounded task queue: maximum tasks queued = 2 * total_max_workers.
         self._task_queue: mp.Queue = self._context.Queue(maxsize=2 * total_max_workers)
         self._next_task_id: int = 0
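For context, "fork" is unreliable on macOS (forked children can crash inside system libraries that are not fork-safe), which is what the platform check above guards against. A standalone sketch of the same selection logic:

import multiprocessing as mp
import sys

def _child(q) -> None:
    # Runs in the worker process; the queue is inherited via Process args.
    q.put("hello from worker")

# Mirrors the selection in the hunk above: fork elsewhere, spawn on macOS
# ("fork" is also unavailable on Windows, so this reflects Linux/macOS usage).
start_method = "spawn" if sys.platform.lower() == "darwin" else "fork"

if __name__ == "__main__":  # guard required under spawn, where children re-import this module
    ctx = mp.get_context(start_method)
    q = ctx.Queue()
    p = ctx.Process(target=_child, args=(q,))
    p.start()
    print(q.get())
    p.join()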
@@ -650,6 +650,22 @@ class RedisClient(MessageBrokerClientBase):
             except Exception as e:
                 logger.exception(f"{log_prefix}: Cache read error: {e}. Trying Redis.")
 
+        # If caller requests non-blocking behavior (timeout <= 0), attempt immediate pop.
+        if timeout is not None and timeout <= 0:
+            try:
+                client = self.get_client()
+                popped = client.lpop(channel_name)
+                if popped is None:
+                    return None
+                try:
+                    return json.loads(popped)
+                except json.JSONDecodeError as e:
+                    logger.error(f"Failed to decode JSON from non-blocking LPOP on '{channel_name}': {e}")
+                    return None
+            except Exception as e:
+                logger.warning(f"Non-blocking LPOP failed for '{channel_name}': {e}")
+                return None
+
         while True:
             try:
                 fetch_result: Union[Dict[str, Any], List[Dict[str, Any]]]
@@ -711,6 +727,150 @@ class RedisClient(MessageBrokerClientBase):
                 logger.exception(f"{log_prefix}: Unexpected error during fetch: {e}")
                 raise ValueError(f"Unexpected error during fetch: {e}") from e
 
+    def fetch_message_from_any(self, channel_names: List[str], timeout: float = 0) -> Optional[Dict[str, Any]]:
+        """
+        Attempt to fetch a message from the first non-empty list among the provided channel names
+        using Redis BLPOP. If the popped item represents a fragmented message, this method will
+        continue popping from the same channel to reconstruct the full message.
+
+        Parameters
+        ----------
+        channel_names : List[str]
+            Ordered list of Redis list keys to attempt in priority order.
+        timeout : float, optional
+            Timeout in seconds to wait for any item across the provided lists. Redis supports
+            integer-second timeouts; sub-second values will be truncated.
+
+        Returns
+        -------
+        dict or None
+            The reconstructed message dictionary if an item was fetched; otherwise None on timeout.
+        """
+        if not channel_names:
+            return None
+
+        client = self.get_client()
+        blpop_timeout = int(max(0, timeout))
+        try:
+            res = client.blpop(channel_names, timeout=blpop_timeout)
+        except (redis.RedisError, ConnectionError) as e:
+            logger.debug(f"BLPOP error on {channel_names}: {e}")
+            return None
+
+        if res is None:
+            return None
+
+        list_key, first_bytes = res
+        if isinstance(list_key, bytes):
+            try:
+                list_key = list_key.decode("utf-8")
+            except Exception:
+                list_key = str(list_key)
+        # Decode first element
+        try:
+            first_msg = json.loads(first_bytes)
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to decode JSON popped from '{list_key}': {e}")
+            return None
+
+        expected_count: int = int(first_msg.get("fragment_count", 1))
+        if expected_count <= 1:
+            return first_msg
+
+        # Collect remaining fragments from the same list key
+        fragments: List[Dict[str, Any]] = [first_msg]
+        accumulated = 0.0
+        start_time = time.monotonic()
+        for i in range(1, expected_count):
+            remaining = max(0, timeout - accumulated)
+            per_frag_timeout = int(max(1, remaining)) if timeout else 1
+            try:
+                frag_res = client.blpop([list_key], timeout=per_frag_timeout)
+            except (redis.RedisError, ConnectionError) as e:
+                logger.error(f"BLPOP error while collecting fragments from '{list_key}': {e}")
+                return None
+            if frag_res is None:
+                logger.error(f"Timeout while collecting fragment {i}/{expected_count-1} from '{list_key}'")
+                return None
+            _, frag_key_bytes_or_val = frag_res
+            # Redis returns (key, value); we don't need the key here
+            frag_bytes = frag_key_bytes_or_val
+            try:
+                frag_msg = json.loads(frag_bytes)
+                fragments.append(frag_msg)
+            except json.JSONDecodeError as e:
+                logger.error(f"Failed to decode fragment JSON from '{list_key}': {e}")
+                return None
+            accumulated = time.monotonic() - start_time
+
+        # Combine and return
+        try:
+            return self._combine_fragments(fragments)
+        except Exception as e:
+            logger.error(f"Error combining fragments from '{list_key}': {e}")
+            return None
+
+    def fetch_message_from_any_with_key(
+        self, channel_names: List[str], timeout: float = 0
+    ) -> Optional[Tuple[str, Dict[str, Any]]]:
+        """
+        Like fetch_message_from_any(), but returns the Redis list key together with the message.
+        This is useful for higher-level schedulers that need to apply per-category quotas.
+        """
+        if not channel_names:
+            return None
+
+        client = self.get_client()
+        blpop_timeout = int(max(0, timeout))
+        try:
+            res = client.blpop(channel_names, timeout=blpop_timeout)
+        except (redis.RedisError, ConnectionError) as e:
+            logger.debug(f"BLPOP error on {channel_names}: {e}")
+            return None
+
+        if res is None:
+            return None
+
+        list_key, first_bytes = res
+        try:
+            first_msg = json.loads(first_bytes)
+        except json.JSONDecodeError as e:
+            logger.error(f"Failed to decode JSON popped from '{list_key}': {e}")
+            return None
+
+        expected_count: int = int(first_msg.get("fragment_count", 1))
+        if expected_count <= 1:
+            return list_key, first_msg
+
+        fragments: List[Dict[str, Any]] = [first_msg]
+        accumulated = 0.0
+        start_time = time.monotonic()
+        for i in range(1, expected_count):
+            remaining = max(0, timeout - accumulated)
+            per_frag_timeout = int(max(1, remaining)) if timeout else 1
+            try:
+                frag_res = client.blpop([list_key], timeout=per_frag_timeout)
+            except (redis.RedisError, ConnectionError) as e:
+                logger.error(f"BLPOP error while collecting fragments from '{list_key}': {e}")
+                return None
+            if frag_res is None:
+                logger.error(f"Timeout while collecting fragment {i}/{expected_count-1} from '{list_key}'")
+                return None
+            _, frag_bytes = frag_res
+            try:
+                frag_msg = json.loads(frag_bytes)
+                fragments.append(frag_msg)
+            except json.JSONDecodeError as e:
+                logger.error(f"Failed to decode fragment JSON from '{list_key}': {e}")
+                return None
+            accumulated = time.monotonic() - start_time
+
+        try:
+            return list_key, self._combine_fragments(fragments)
+        except Exception as e:
+            logger.error(f"Error combining fragments from '{list_key}': {e}")
+            return None
+
     @staticmethod
     def _combine_fragments(fragments: List[Dict[str, Any]]) -> Dict[str, Any]:
         """
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest-api
-Version: 2025.10.22.dev20251022
+Version: 2025.11.2.dev20251102
 Summary: Python module with core document ingestion functions.
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -222,6 +222,7 @@ Requires-Dist: fsspec>=2025.5.1
 Requires-Dist: universal_pathlib>=0.2.6
 Requires-Dist: ffmpeg-python==0.2.0
 Requires-Dist: tritonclient
+Requires-Dist: glom
 Dynamic: license-file
 
 # nv-ingest-api