nv-ingest 2025.8.16.dev20250816__py3-none-any.whl → 2025.11.21.dev20251121__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. nv_ingest/api/__init__.py +6 -0
  2. nv_ingest/api/main.py +2 -0
  3. nv_ingest/api/tracing.py +82 -0
  4. nv_ingest/api/v2/README.md +203 -0
  5. nv_ingest/api/v2/__init__.py +3 -0
  6. nv_ingest/api/v2/ingest.py +1300 -0
  7. nv_ingest/framework/orchestration/process/dependent_services.py +43 -14
  8. nv_ingest/framework/orchestration/process/execution.py +92 -94
  9. nv_ingest/framework/orchestration/process/lifecycle.py +98 -6
  10. nv_ingest/framework/orchestration/process/strategies.py +41 -5
  11. nv_ingest/framework/orchestration/process/termination.py +147 -0
  12. nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
  13. nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +9 -15
  14. nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +2 -3
  15. nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +5 -2
  16. nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +2 -1
  17. nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +2 -1
  18. nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +2 -1
  19. nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  20. nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +5 -2
  21. nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +2 -1
  22. nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +2 -1
  23. nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +2 -1
  24. nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +2 -1
  25. nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +46 -9
  26. nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +2 -1
  27. nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +5 -1
  28. nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +5 -1
  29. nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +4 -3
  30. nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  31. nv_ingest/pipeline/config/loaders.py +33 -2
  32. nv_ingest/pipeline/default_libmode_pipeline_impl.py +514 -0
  33. nv_ingest/pipeline/default_pipeline_impl.py +111 -88
  34. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/METADATA +4 -3
  35. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/RECORD +38 -31
  36. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/WHEEL +0 -0
  37. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/licenses/LICENSE +0 -0
  38. {nv_ingest-2025.8.16.dev20250816.dist-info → nv_ingest-2025.11.21.dev20251121.dist-info}/top_level.txt +0 -0
nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py

@@ -16,6 +16,7 @@ from nv_ingest_api.internal.transform.split_text import transform_text_split_and_tokenize_internal
 from nv_ingest_api.util.exception_handlers.decorators import (
     nv_ingest_node_failure_try_except,
 )
+from nv_ingest_api.util.logging.sanitize import sanitize_for_logging

 from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook

@@ -36,7 +37,7 @@ class TextSplitterStage(RayActorStage):
         super().__init__(config, stage_name=stage_name)
         # Store the validated configuration (assumed to be an instance of TextSplitterSchema)
         self.validated_config: TextSplitterSchema = config
-        logger.debug("TextSplitterStage initialized with config: %s", config)
+        logger.info("TextSplitterStage initialized with config: %s", sanitize_for_logging(config))

     @nv_ingest_node_failure_try_except()
     @traceable()
@@ -63,7 +64,7 @@ class TextSplitterStage(RayActorStage):

         # Remove the "split" task to obtain task-specific configuration.
         task_config = remove_task_by_type(message, "split")
-        logger.debug("Extracted task config: %s", task_config)
+        logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))

         # Transform the DataFrame (split text and tokenize).
         df_updated = transform_text_split_and_tokenize_internal(
@@ -107,7 +108,7 @@ def text_splitter_fn(control_message: IngestControlMessage, stage_config: TextSplitterSchema

     # Remove the "split" task to obtain task-specific configuration.
     task_config = remove_task_by_type(control_message, "split")
-    logger.debug("Extracted task config: %s", task_config)
+    logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))

     # Transform the DataFrame (split text and tokenize).
     df_updated = transform_text_split_and_tokenize_internal(
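
The hunks above route every config and task payload through sanitize_for_logging before it reaches the log stream. The helper ships in nv_ingest_api.util.logging.sanitize; the sketch below is only a minimal illustration of the idea, with hypothetical key names and redaction rules, not the library's actual implementation.

# Hypothetical sketch of a sanitize-before-logging helper. The real
# implementation lives in nv_ingest_api.util.logging.sanitize and its
# redaction rules may differ; the key names below are assumptions.
from typing import Any

SENSITIVE_KEYS = {"api_key", "token", "password", "secret"}

def sanitize_for_logging(obj: Any) -> Any:
    """Recursively mask values whose keys look credential-like."""
    if hasattr(obj, "model_dump"):  # e.g. a pydantic schema such as TextSplitterSchema
        obj = obj.model_dump()
    if isinstance(obj, dict):
        return {
            key: "***" if key.lower() in SENSITIVE_KEYS else sanitize_for_logging(value)
            for key, value in obj.items()
        }
    if isinstance(obj, list):
        return [sanitize_for_logging(item) for item in obj]
    return obj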
nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py

@@ -7,9 +7,7 @@ import json
 import logging
 import os
 from json import JSONDecodeError
-from typing import Optional, Dict, Any
-
-from typing import List
+from typing import Optional, Dict, Any, List

 import redis

@@ -133,6 +131,8 @@ class RedisIngestService(IngestServiceMeta):
         self._bulk_vdb_cache_prefix: str = "vdb_bulk_upload_cache:"
         self._cache_prefix: str = "processing_cache:"
         self._state_prefix: str = "job_state:"
+        # Bound async-to-thread concurrency slightly below Redis connection pool
+        self._async_operation_semaphore: Optional[asyncio.Semaphore] = None

         self._ingest_client = RedisClient(
             host=self._redis_hostname,
@@ -151,6 +151,16 @@
             f"FetchMode: {fetch_mode.name}, ResultTTL: {result_data_ttl_seconds}, StateTTL: {state_ttl_seconds}"
         )

+    def _get_async_semaphore(self) -> asyncio.Semaphore:
+        if self._async_operation_semaphore is None:
+            semaphore_limit = max(1, self._concurrency_level - 2)
+            self._async_operation_semaphore = asyncio.Semaphore(semaphore_limit)
+        return self._async_operation_semaphore
+
+    async def _run_bounded_to_thread(self, func, *args, **kwargs):
+        async with self._get_async_semaphore():
+            return await asyncio.to_thread(func, *args, **kwargs)
+
     async def submit_job(self, job_spec_wrapper: "MessageWrapper", trace_id: str) -> str:
         """
         Validates, prepares, and submits a job specification to the Redis task queue.
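
The two new helpers lazily create an asyncio.Semaphore sized two below the service's `_concurrency_level`, so every asyncio.to_thread offload is bounded and blocking Redis calls cannot exhaust the connection pool. A standalone sketch of the same pattern (the pool size and sleeping workload are illustrative only):

# Standalone sketch of the bounded to_thread pattern used above;
# POOL_SIZE and the workload are illustrative, not values from nv-ingest.
import asyncio
import time
from typing import Optional

POOL_SIZE = 10  # stand-in for the Redis connection pool / concurrency level
_semaphore: Optional[asyncio.Semaphore] = None

def _get_semaphore() -> asyncio.Semaphore:
    # Created lazily so the semaphore binds to the running event loop.
    global _semaphore
    if _semaphore is None:
        _semaphore = asyncio.Semaphore(max(1, POOL_SIZE - 2))
    return _semaphore

async def run_bounded(func, *args, **kwargs):
    # At most POOL_SIZE - 2 blocking calls run in worker threads at once.
    async with _get_semaphore():
        return await asyncio.to_thread(func, *args, **kwargs)

async def main() -> None:
    # 20 blocking calls, but concurrency never exceeds the semaphore limit.
    await asyncio.gather(*(run_bounded(time.sleep, 0.1) for _ in range(20)))

asyncio.run(main())

Sizing the limit slightly below the pool presumably leaves headroom for other callers sharing the same RedisClient, per the comment in the diff.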
@@ -208,12 +218,33 @@
            ttl_for_result: Optional[int] = (
                self._result_data_ttl_seconds if self._fetch_mode == FetchMode.NON_DESTRUCTIVE else None
            )
+           # Determine target queue based on optional QoS hint
+           queue_hint = None
+           try:
+               routing_opts = job_spec.get("routing_options") or {}
+               tracing_opts = job_spec.get("tracing_options") or {}
+               queue_hint = routing_opts.get("queue_hint") or tracing_opts.get("queue_hint")
+           except Exception:
+               queue_hint = None
+           allowed = {"default", "immediate", "micro", "small", "medium", "large"}
+           if isinstance(queue_hint, str) and queue_hint in allowed:
+               if queue_hint == "default":
+                   channel_name = self._redis_task_queue
+               else:
+                   channel_name = f"{self._redis_task_queue}_{queue_hint}"
+           else:
+               channel_name = self._redis_task_queue
+           logger.debug(
+               f"Submitting job {trace_id} to queue '{channel_name}' (hint={queue_hint}) "
+               f"with result TTL: {ttl_for_result}"
+           )
+
            logger.debug(
                f"Submitting job {trace_id} to queue '{self._redis_task_queue}' with result TTL: {ttl_for_result}"
            )
-           await asyncio.to_thread(
+           await self._run_bounded_to_thread(
                self._ingest_client.submit_message,
-               channel_name=self._redis_task_queue,
+               channel_name=channel_name,
                message=job_spec_json,
                ttl_seconds=ttl_for_result,
            )
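
With this hunk, a job spec can carry an optional QoS queue_hint (in routing_options, with tracing_options also consulted); a recognized hint routes the job to a suffixed queue, while "default", missing, or unrecognized hints fall back to the base queue. A hypothetical job spec fragment showing the routing inputs:

# Hypothetical job spec fragment; only the queue_hint fields matter to the
# routing logic above. With a base task queue named "ingest_task_queue"
# (name assumed), hint "small" routes the job to "ingest_task_queue_small";
# "default" or an unknown hint keeps the base queue.
job_spec = {
    "job_payload": {"source_id": ["report.pdf"]},  # payload shape assumed
    "routing_options": {"queue_hint": "small"},    # default | immediate | micro | small | medium | large
    "tracing_options": {"queue_hint": "small"},    # alternate location, also consulted
}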
@@ -252,7 +283,7 @@
        try:
            result_channel: str = f"{job_id}"
            logger.debug(f"Attempting to fetch job result for {job_id} using mode {self._fetch_mode.name}")
-           message = await asyncio.to_thread(
+           message = await self._run_bounded_to_thread(
                self._ingest_client.fetch_message,
                channel_name=result_channel,
                timeout=10,
@@ -264,7 +295,7 @@
                logger.warning(f"fetch_message for {job_id} returned None unexpectedly.")
                raise TimeoutError("No data found (unexpected None response).")
        except (TimeoutError, redis.RedisError, ConnectionError, ValueError, RuntimeError) as e:
-           logger.info(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
+           logger.debug(f"Fetch operation for job {job_id} did not complete: ({type(e).__name__}) {e}")
            raise e
        except Exception as e:
            logger.exception(f"Unexpected error during async fetch_job for {job_id}: {e}")
@@ -289,7 +320,7 @@
        ttl_to_set: Optional[int] = self._state_ttl_seconds
        try:
            logger.debug(f"Setting state for {job_id} to {state} with TTL {ttl_to_set}")
-           await asyncio.to_thread(
+           await self._run_bounded_to_thread(
                self._ingest_client.get_client().set,
                state_key,
                state,
@@ -317,7 +348,10 @@
        """
        state_key: str = f"{self._state_prefix}{job_id}"
        try:
-           data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, state_key)
+           data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+               self._ingest_client.get_client().get,
+               state_key,
+           )
            if data_bytes:
                state: str = data_bytes.decode("utf-8")
                logger.debug(f"Retrieved state for {job_id}: {state}")
@@ -350,7 +384,7 @@
        cache_key: str = f"{self._cache_prefix}{job_id}"
        try:
            data_to_store: str = json.dumps([job.model_dump(mode="json") for job in jobs_data])
-           await asyncio.to_thread(
+           await self._run_bounded_to_thread(
                self._ingest_client.get_client().set,
                cache_key,
                data_to_store,
@@ -375,7 +409,10 @@
        """
        cache_key: str = f"{self._cache_prefix}{job_id}"
        try:
-           data_bytes: Optional[bytes] = await asyncio.to_thread(self._ingest_client.get_client().get, cache_key)
+           data_bytes: Optional[bytes] = await self._run_bounded_to_thread(
+               self._ingest_client.get_client().get,
+               cache_key,
+           )
            if data_bytes is None:
                return []
            return [ProcessingJob(**job) for job in json.loads(data_bytes)]
@@ -393,3 +430,170 @@
            The current fetch mode.
        """
        return self._fetch_mode
+
+   async def set_parent_job_mapping(
+       self,
+       parent_job_id: str,
+       subjob_ids: List[str],
+       metadata: Dict[str, Any],
+       *,
+       subjob_descriptors: Optional[List[Dict[str, Any]]] = None,
+   ) -> None:
+       """
+       Store parent-subjob mapping in Redis for V2 PDF splitting.
+
+       Parameters
+       ----------
+       parent_job_id : str
+           The parent job identifier
+       subjob_ids : List[str]
+           List of subjob identifiers
+       metadata : Dict[str, Any]
+           Metadata about the parent job (total_pages, original_source_id, etc.)
+       subjob_descriptors : List[Dict[str, Any]], optional
+           Detailed descriptors (job_id, chunk_index, start/end pages) for subjobs
+       """
+       parent_key = f"parent:{parent_job_id}:subjobs"
+       metadata_key = f"parent:{parent_job_id}:metadata"
+
+       try:
+           # Store subjob IDs as a set (only if there are subjobs)
+           if subjob_ids:
+               await self._run_bounded_to_thread(
+                   self._ingest_client.get_client().sadd,
+                   parent_key,
+                   *subjob_ids,
+               )
+
+           # Store metadata as hash (including original subjob ordering for deterministic fetches)
+           metadata_to_store = dict(metadata)
+           try:
+               metadata_to_store["subjob_order"] = json.dumps(subjob_ids)
+           except (TypeError, ValueError):
+               logger.warning(
+                   "Unable to serialize subjob ordering for parent %s; falling back to Redis set ordering",
+                   parent_job_id,
+               )
+               metadata_to_store.pop("subjob_order", None)
+
+           if subjob_descriptors:
+               metadata_to_store["subjob_descriptors"] = json.dumps(subjob_descriptors)
+
+           await self._run_bounded_to_thread(
+               self._ingest_client.get_client().hset,
+               metadata_key,
+               mapping=metadata_to_store,
+           )
+
+           # Set TTL on both keys to match state TTL
+           if self._state_ttl_seconds:
+               await self._run_bounded_to_thread(
+                   self._ingest_client.get_client().expire,
+                   parent_key,
+                   self._state_ttl_seconds,
+               )
+               await self._run_bounded_to_thread(
+                   self._ingest_client.get_client().expire,
+                   metadata_key,
+                   self._state_ttl_seconds,
+               )
+
+           logger.debug(f"Stored parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+
+       except Exception as err:
+           logger.exception(f"Error storing parent job mapping for {parent_job_id}: {err}")
+           raise
+
+   async def get_parent_job_info(self, parent_job_id: str) -> Optional[Dict[str, Any]]:
+       """
+       Retrieve parent job information including subjob IDs and metadata.
+
+       Parameters
+       ----------
+       parent_job_id : str
+           The parent job identifier
+
+       Returns
+       -------
+       Dict[str, Any] or None
+           Dictionary with 'subjob_ids' and 'metadata' keys, or None if not a parent job
+       """
+       parent_key = f"parent:{parent_job_id}:subjobs"
+       metadata_key = f"parent:{parent_job_id}:metadata"
+
+       try:
+           # Check if this is a parent job (check metadata_key since non-split PDFs may not have parent_key)
+           exists = await self._run_bounded_to_thread(
+               self._ingest_client.get_client().exists,
+               metadata_key,  # Check metadata instead of parent_key for non-split PDF support
+           )
+
+           if not exists:
+               return None
+
+           # Get subjob IDs (may be empty for non-split PDFs)
+           subjob_ids_bytes = await self._run_bounded_to_thread(
+               self._ingest_client.get_client().smembers,
+               parent_key,
+           )
+           subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes} if subjob_ids_bytes else set()
+
+           # Get metadata
+           metadata_dict = await self._run_bounded_to_thread(
+               self._ingest_client.get_client().hgetall,
+               metadata_key,
+           )
+           metadata = {k.decode("utf-8"): v.decode("utf-8") for k, v in metadata_dict.items()}
+
+           # Convert numeric strings back to numbers
+           if "total_pages" in metadata:
+               metadata["total_pages"] = int(metadata["total_pages"])
+           if "pages_per_chunk" in metadata:
+               try:
+                   metadata["pages_per_chunk"] = int(metadata["pages_per_chunk"])
+               except ValueError:
+                   metadata.pop("pages_per_chunk", None)
+
+           ordered_ids: Optional[List[str]] = None
+           stored_order = metadata.pop("subjob_order", None)
+           if stored_order:
+               try:
+                   candidate_order = json.loads(stored_order)
+                   if isinstance(candidate_order, list):
+                       ordered_ids = [sid for sid in candidate_order if sid in subjob_id_set]
+               except (ValueError, TypeError) as exc:
+                   logger.warning(
+                       "Failed to parse stored subjob order for parent %s: %s",
+                       parent_job_id,
+                       exc,
+                   )
+
+           if ordered_ids is None:
+               ordered_ids = sorted(subjob_id_set)
+           else:
+               remaining_ids = sorted(subjob_id_set - set(ordered_ids))
+               ordered_ids.extend(remaining_ids)
+
+           subjob_descriptors: Optional[List[Dict[str, Any]]] = None
+           stored_descriptors = metadata.pop("subjob_descriptors", None)
+           if stored_descriptors:
+               try:
+                   decoded = json.loads(stored_descriptors)
+                   if isinstance(decoded, list):
+                       subjob_descriptors = decoded
+               except (ValueError, TypeError) as exc:
+                   logger.warning(
+                       "Failed to parse stored subjob descriptors for parent %s: %s",
+                       parent_job_id,
+                       exc,
+                   )
+
+           return {
+               "subjob_ids": ordered_ids,
+               "metadata": metadata,
+               "subjob_descriptors": subjob_descriptors or [],
+           }
+
+       except Exception as err:
+           logger.error(f"Error retrieving parent job info for {parent_job_id}: {err}")
+           return None
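
Together, set_parent_job_mapping and get_parent_job_info give the V2 ingest API a Redis-backed index from a parent PDF job to its page-range subjobs, preserving submission order via the serialized subjob_order field even though the IDs live in an unordered Redis set. An illustrative round trip (service construction elided; job IDs, page counts, and descriptor fields are examples, not fixtures from the package):

# Illustrative round trip; assumes `service` is an initialized RedisIngestService.
async def index_split_pdf(service) -> None:
    await service.set_parent_job_mapping(
        parent_job_id="job-123",
        subjob_ids=["job-123-p0", "job-123-p1"],
        metadata={"total_pages": 40, "pages_per_chunk": 20, "original_source_id": "report.pdf"},
        subjob_descriptors=[
            {"job_id": "job-123-p0", "chunk_index": 0, "start_page": 1, "end_page": 20},
            {"job_id": "job-123-p1", "chunk_index": 1, "start_page": 21, "end_page": 40},
        ],
    )
    info = await service.get_parent_job_info("job-123")
    # info["subjob_ids"]  -> ["job-123-p0", "job-123-p1"] (submission order, not set order)
    # info["metadata"]["total_pages"] -> 40 (decoded from the Redis hash back to int)
    # info["subjob_descriptors"] -> the two descriptors, round-tripped through JSON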
nv_ingest/pipeline/config/loaders.py

@@ -14,7 +14,8 @@ import yaml
 from typing import Optional

 from nv_ingest.pipeline.pipeline_schema import PipelineConfigSchema
-from nv_ingest.pipeline.default_pipeline_impl import DEFAULT_LIBMODE_PIPELINE_YAML
+from nv_ingest.pipeline.default_libmode_pipeline_impl import DEFAULT_LIBMODE_PIPELINE_YAML
+from nv_ingest.pipeline.default_pipeline_impl import DEFAULT_PIPELINE_YAML
 from nv_ingest.framework.orchestration.execution.options import PipelineRuntimeOverrides
 from nv_ingest_api.util.string_processing.yaml import substitute_env_vars_in_yaml_content

@@ -64,6 +65,35 @@ def load_pipeline_config(config_path: str) -> PipelineConfigSchema:
     return PipelineConfigSchema(**processed_config)


+def load_default_pipeline_config() -> PipelineConfigSchema:
+    """
+    Load and validate the embedded default (non-libmode) pipeline configuration.
+
+    Returns
+    -------
+    PipelineConfigSchema
+        Validated default pipeline configuration.
+
+    Raises
+    ------
+    ValueError
+        If the default YAML cannot be parsed or validated.
+    """
+    logger.info("Loading embedded default pipeline configuration")
+
+    substituted_content = substitute_env_vars_in_yaml_content(DEFAULT_PIPELINE_YAML)
+
+    try:
+        processed_config = yaml.safe_load(substituted_content)
+    except yaml.YAMLError as e:
+        error_message = (
+            f"Failed to parse embedded default pipeline YAML after environment variable substitution. Error: {e}"
+        )
+        raise ValueError(error_message) from e
+
+    return PipelineConfigSchema(**processed_config)
+
+
 def load_default_libmode_config() -> PipelineConfigSchema:
     """
     Load and validate the default libmode pipeline configuration.
@@ -195,4 +225,5 @@ def resolve_pipeline_config(provided_config: Optional[PipelineConfigSchema], libmode
     if libmode:
         return load_default_libmode_config()
     else:
-        raise ValueError("pipeline_config must be provided when libmode is False")
+        # For non-libmode, fall back to embedded default pipeline implementation
+        return load_default_pipeline_config()
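
This changes resolve_pipeline_config from raising ValueError to falling back to the embedded default pipeline whenever no explicit config is provided outside libmode. A sketch of the resulting behavior (the second parameter name `libmode` is inferred from the truncated hunk header):

# Behavior sketch; parameter name `libmode` is an inference, not confirmed.
from nv_ingest.pipeline.config.loaders import resolve_pipeline_config

cfg = resolve_pipeline_config(None, libmode=True)   # embedded libmode pipeline (unchanged)
cfg = resolve_pipeline_config(None, libmode=False)  # embedded default pipeline (previously raised ValueError)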