nv-ingest 2025.10.22.dev20251022__py3-none-any.whl → 2025.10.24.dev20251024__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -672,11 +672,15 @@ async def submit_job_v2(
     original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
     original_source_name = source_names[0] if source_names else "unknown_source.pdf"

+    # Track page count for all PDFs (used for both splitting logic and metadata)
+    pdf_page_count_cache = None
+
     # Check if this is a PDF that needs splitting
     if document_types and payloads and document_types[0].lower() == "pdf":
         # Decode the payload to check page count
         pdf_content = base64.b64decode(payloads[0])
         page_count = get_pdf_page_count(pdf_content)
+        pdf_page_count_cache = page_count  # Cache for later use
         pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)

         # Split if the document has more pages than our chunk size
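For reference, the cached page count comes from decoding the base64 payload and inspecting the PDF. A minimal sketch of that check, assuming a pypdf-based counter (get_pdf_page_count is nv-ingest's internal helper and may be implemented differently):

import base64
import io

from pypdf import PdfReader


def count_pdf_pages(payload_b64: str) -> int:
    # Decode the base64 job payload back into raw PDF bytes,
    # then let pypdf report the number of pages.
    pdf_bytes = base64.b64decode(payload_b64)
    return len(PdfReader(io.BytesIO(pdf_bytes)).pages)


# Hypothetical usage mirroring the submit path:
# pdf_page_count_cache = count_pdf_pages(payloads[0])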
@@ -762,6 +766,34 @@ async def submit_job_v2(
     await ingest_service.submit_job(updated_job_spec, parent_job_id)
     await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)

+    # If this was a PDF (even if not split), store page count metadata for tracking
+    if pdf_page_count_cache is not None:
+        try:
+            # Use cached page count from earlier check to avoid re-decoding
+            # Store minimal metadata for non-split PDFs (consistent with split PDFs)
+            single_pdf_metadata: Dict[str, Any] = {
+                "total_pages": pdf_page_count_cache,
+                "pages_per_chunk": pdf_page_count_cache,  # Single chunk = entire document
+                "original_source_id": original_source_id,
+                "original_source_name": original_source_name,
+                "document_type": document_types[0],
+                "subjob_order": [],  # No subjobs for non-split PDFs
+            }
+
+            # Store as parent job metadata with empty subjob list for consistency
+            await ingest_service.set_parent_job_mapping(
+                parent_job_id,
+                [],  # Empty subjob list
+                single_pdf_metadata,
+                subjob_descriptors=[],
+            )
+            logger.debug(
+                f"Stored page count metadata for non-split PDF {original_source_name}: {pdf_page_count_cache} pages"
+            )
+        except Exception as metadata_err:
+            # Don't fail the job if metadata storage fails
+            logger.warning(f"Failed to store page count metadata for {parent_job_id}: {metadata_err}")
+
     response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
     return parent_job_id
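Concretely, for a hypothetical three-page report.pdf that is not split, the stored parent-job metadata would take this shape (values are illustrative):

# Illustrative single_pdf_metadata for a non-split, three-page PDF.
single_pdf_metadata = {
    "total_pages": 3,
    "pages_per_chunk": 3,          # one "chunk" covers the whole document
    "original_source_id": "report.pdf",
    "original_source_name": "report.pdf",
    "document_type": "pdf",
    "subjob_order": [],            # no subjobs were created
}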
@@ -898,6 +930,32 @@ async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):

     logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")

+    # Special case: Non-split PDFs have metadata but no subjobs
+    # Fetch the result directly and augment with page count metadata
+    if len(subjob_ids) == 0:
+        logger.debug(f"Job {job_id} is a non-split PDF, fetching result directly")
+        try:
+            job_response = await ingest_service.fetch_job(job_id)
+
+            # Augment response with page count metadata
+            if isinstance(job_response, dict):
+                if "metadata" not in job_response:
+                    job_response["metadata"] = {}
+                job_response["metadata"]["total_pages"] = metadata.get("total_pages")
+                job_response["metadata"]["original_source_id"] = metadata.get("original_source_id")
+                job_response["metadata"]["original_source_name"] = metadata.get("original_source_name")
+
+            # Update job state after successful fetch
+            await _update_job_state_after_fetch(job_id, ingest_service)
+
+            return _stream_json_response(job_response)
+        except (TimeoutError, RedisError, ConnectionError):
+            logger.debug(f"Job {job_id} (non-split PDF) not ready yet")
+            raise HTTPException(status_code=202, detail="Job is processing. Retry later.")
+        except Exception as e:
+            logger.exception(f"Error fetching non-split PDF job {job_id}: {e}")
+            raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
+
     # Build ordered descriptors for subjobs
     stored_descriptors = subjob_info.get("subjob_descriptors") or []
     descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
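From a client's point of view, the only visible change is that the extra keys ride along under metadata in the fetch response, and a 202 still means the job is not ready. A rough polling sketch, assuming the v2 fetch route is exposed at /v2/fetch_job/{job_id} on the service's REST port (base URL and job id are hypothetical; adjust to your deployment):

import time

import requests

BASE_URL = "http://localhost:7670"  # assumed nv-ingest REST endpoint
job_id = "example-parent-job-id"    # hypothetical

while True:
    resp = requests.get(f"{BASE_URL}/v2/fetch_job/{job_id}", timeout=30)
    if resp.status_code == 202:
        time.sleep(2)  # still processing; retry later
        continue
    resp.raise_for_status()
    result = resp.json()
    # For non-split PDFs, the page count now rides along with the result.
    print(result.get("metadata", {}).get("total_pages"))
    break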
@@ -436,12 +436,13 @@ class RedisIngestService(IngestServiceMeta):
         metadata_key = f"parent:{parent_job_id}:metadata"

         try:
-            # Store subjob IDs as a set
-            await self._run_bounded_to_thread(
-                self._ingest_client.get_client().sadd,
-                parent_key,
-                *subjob_ids,
-            )
+            # Store subjob IDs as a set (only if there are subjobs)
+            if subjob_ids:
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().sadd,
+                    parent_key,
+                    *subjob_ids,
+                )

             # Store metadata as hash (including original subjob ordering for deterministic fetches)
             metadata_to_store = dict(metadata)
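The guard matters because SADD needs at least one member: calling redis-py's sadd with an empty argument list produces an error rather than a no-op, which would break the new empty-subjob-list case. A standalone sketch of the same pattern (hypothetical key name and local Redis connection):

import redis

client = redis.Redis(host="localhost", port=6379)  # assumed local Redis

parent_key = "parent:job-123:subjobs"  # hypothetical key
subjob_ids: list[str] = []             # non-split PDF: nothing to record

if subjob_ids:
    # Only issue SADD when there is at least one member; an empty SADD
    # is rejected by Redis instead of silently doing nothing.
    client.sadd(parent_key, *subjob_ids)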
@@ -318,6 +318,7 @@ stages:
     config:
       api_key: $NGC_API_KEY|$NVIDIA_API_KEY
       model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
+      endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
       prompt: "Caption the content of this image:"
       replicas:
         min_replicas: 0
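The $ENV_VAR|"literal" values in this pipeline YAML are the environment-variable-with-fallback convention used throughout these config files: candidates are tried left to right and the first one that resolves wins, with a quoted literal as the final default. A rough illustration of that resolution logic (this resolver is illustrative, not nv-ingest's actual loader):

import os


def resolve_config_value(raw: str) -> str | None:
    """Return the first candidate that resolves: $VARs are looked up in the
    environment, quoted literals are used as-is. Illustrative only."""
    for candidate in raw.split("|"):
        candidate = candidate.strip()
        if candidate.startswith("$"):
            value = os.environ.get(candidate[1:])
            if value:
                return value
        else:
            return candidate.strip('"')
    return None


# With VLM_CAPTION_ENDPOINT unset, the quoted default is used:
print(resolve_config_value('$VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"'))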
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.10.22.dev20251022
+Version: 2025.10.24.dev20251024
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -9,7 +9,7 @@ nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19
 nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
 nv_ingest/api/v2/README.md,sha256=tbQOcD_67YWedboAcDRlZJgjvVZZTW1-ZodcqP0iynk,7133
 nv_ingest/api/v2/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/api/v2/ingest.py,sha256=v5l1c1BdmgyPqMzRj8CezI3dR6HpKOuevfomT1v4RGc,37313
+nv_ingest/api/v2/ingest.py,sha256=ja0sNV0muQxnYdcXO1VLUFoT3jb3Cg0XLdB3YhGc1ZI,40634
 nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/execution/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -103,7 +103,7 @@ nv_ingest/framework/util/flow_control/udf_intercept.py,sha256=zQ9uuCcHLEd0P52Eiw
 nv_ingest/framework/util/service/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=OuGC3FFhkLQLR3x4s-tyxGguYYn8ORKr2xkzMy2br0g,22552
+nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=cBR9G2YCcOtuzi9_6igleQK03CSpK1X6v5ibeSUijmo,22627
 nv_ingest/framework/util/service/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
@@ -111,14 +111,14 @@ nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusX
 nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
 nv_ingest/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=MiyKe8RS18PNYwEVvrASiHFpynR_BavOe0hhVnUdbEc,15618
-nv_ingest/pipeline/default_pipeline_impl.py,sha256=irVm_wmJW5a7a3xTJd18AFZfwLheERkhCty-0XZrIMY,15288
+nv_ingest/pipeline/default_pipeline_impl.py,sha256=m1m5iK9zs91zF-I_kixJvbHsNzFena7QxnNAkS8P0OM,15368
 nv_ingest/pipeline/ingest_pipeline.py,sha256=wHAJhqAM2s8nbY-8itVogmSU-yVN4PZONGWcKnhzgfg,17794
 nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3ZEMkkoBcg,17940
 nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
 nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
-nv_ingest-2025.10.22.dev20251022.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest-2025.10.22.dev20251022.dist-info/METADATA,sha256=fBAiUkJijOoKO-QsdNYEpDF9X1ovQ2BBSBBhLP-Yykw,15122
-nv_ingest-2025.10.22.dev20251022.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest-2025.10.22.dev20251022.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
-nv_ingest-2025.10.22.dev20251022.dist-info/RECORD,,
+nv_ingest-2025.10.24.dev20251024.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.10.24.dev20251024.dist-info/METADATA,sha256=nrFpry5brNE51Hx3aydd_LiJt_4xAGUHDoL2QcndNOw,15122
+nv_ingest-2025.10.24.dev20251024.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest-2025.10.24.dev20251024.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.10.24.dev20251024.dist-info/RECORD,,