nv-ingest 2025.10.22.dev20251022__py3-none-any.whl → 2025.10.24.dev20251024__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

@@ -672,11 +672,15 @@ async def submit_job_v2(
     original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
     original_source_name = source_names[0] if source_names else "unknown_source.pdf"

+    # Track page count for all PDFs (used for both splitting logic and metadata)
+    pdf_page_count_cache = None
+
     # Check if this is a PDF that needs splitting
     if document_types and payloads and document_types[0].lower() == "pdf":
         # Decode the payload to check page count
         pdf_content = base64.b64decode(payloads[0])
         page_count = get_pdf_page_count(pdf_content)
+        pdf_page_count_cache = page_count  # Cache for later use
         pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)

         # Split if the document has more pages than our chunk size
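For reference, the cached page count comes from decoding the base64 payload and inspecting the PDF. A minimal sketch of that check, assuming a pypdf-based counter (get_pdf_page_count is nv-ingest's internal helper and may be implemented differently):

import base64
import io

from pypdf import PdfReader


def count_pdf_pages(payload_b64: str) -> int:
    # Decode the base64 job payload back into raw PDF bytes,
    # then let pypdf report the number of pages.
    pdf_bytes = base64.b64decode(payload_b64)
    return len(PdfReader(io.BytesIO(pdf_bytes)).pages)


# Hypothetical usage mirroring the submit path:
# pdf_page_count_cache = count_pdf_pages(payloads[0])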
@@ -762,6 +766,34 @@ async def submit_job_v2(
     await ingest_service.submit_job(updated_job_spec, parent_job_id)
     await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)

+    # If this was a PDF (even if not split), store page count metadata for tracking
+    if pdf_page_count_cache is not None:
+        try:
+            # Use cached page count from earlier check to avoid re-decoding
+            # Store minimal metadata for non-split PDFs (consistent with split PDFs)
+            single_pdf_metadata: Dict[str, Any] = {
+                "total_pages": pdf_page_count_cache,
+                "pages_per_chunk": pdf_page_count_cache,  # Single chunk = entire document
+                "original_source_id": original_source_id,
+                "original_source_name": original_source_name,
+                "document_type": document_types[0],
+                "subjob_order": [],  # No subjobs for non-split PDFs
+            }
+
+            # Store as parent job metadata with empty subjob list for consistency
+            await ingest_service.set_parent_job_mapping(
+                parent_job_id,
+                [],  # Empty subjob list
+                single_pdf_metadata,
+                subjob_descriptors=[],
+            )
+            logger.debug(
+                f"Stored page count metadata for non-split PDF {original_source_name}: {pdf_page_count_cache} pages"
+            )
+        except Exception as metadata_err:
+            # Don't fail the job if metadata storage fails
+            logger.warning(f"Failed to store page count metadata for {parent_job_id}: {metadata_err}")
+
     response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
     return parent_job_id
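Concretely, for a hypothetical three-page report.pdf that is not split, the stored parent-job metadata would take this shape (values are illustrative):

# Illustrative single_pdf_metadata for a non-split, three-page PDF.
single_pdf_metadata = {
    "total_pages": 3,
    "pages_per_chunk": 3,          # one "chunk" covers the whole document
    "original_source_id": "report.pdf",
    "original_source_name": "report.pdf",
    "document_type": "pdf",
    "subjob_order": [],            # no subjobs were created
}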
@@ -898,6 +930,32 @@ async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):

     logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")

+    # Special case: Non-split PDFs have metadata but no subjobs
+    # Fetch the result directly and augment with page count metadata
+    if len(subjob_ids) == 0:
+        logger.debug(f"Job {job_id} is a non-split PDF, fetching result directly")
+        try:
+            job_response = await ingest_service.fetch_job(job_id)
+
+            # Augment response with page count metadata
+            if isinstance(job_response, dict):
+                if "metadata" not in job_response:
+                    job_response["metadata"] = {}
+                job_response["metadata"]["total_pages"] = metadata.get("total_pages")
+                job_response["metadata"]["original_source_id"] = metadata.get("original_source_id")
+                job_response["metadata"]["original_source_name"] = metadata.get("original_source_name")
+
+            # Update job state after successful fetch
+            await _update_job_state_after_fetch(job_id, ingest_service)
+
+            return _stream_json_response(job_response)
+        except (TimeoutError, RedisError, ConnectionError):
+            logger.debug(f"Job {job_id} (non-split PDF) not ready yet")
+            raise HTTPException(status_code=202, detail="Job is processing. Retry later.")
+        except Exception as e:
+            logger.exception(f"Error fetching non-split PDF job {job_id}: {e}")
+            raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
+
     # Build ordered descriptors for subjobs
     stored_descriptors = subjob_info.get("subjob_descriptors") or []
     descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
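From a client's point of view, the only visible change is that the extra keys ride along under metadata in the fetch response, and a 202 still means the job is not ready. A rough polling sketch, assuming the v2 fetch route is exposed at /v2/fetch_job/{job_id} on the service's REST port (base URL and job id are hypothetical; adjust to your deployment):

import time

import requests

BASE_URL = "http://localhost:7670"  # assumed nv-ingest REST endpoint
job_id = "example-parent-job-id"    # hypothetical

while True:
    resp = requests.get(f"{BASE_URL}/v2/fetch_job/{job_id}", timeout=30)
    if resp.status_code == 202:
        time.sleep(2)  # still processing; retry later
        continue
    resp.raise_for_status()
    result = resp.json()
    # For non-split PDFs, the page count now rides along with the result.
    print(result.get("metadata", {}).get("total_pages"))
    break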
@@ -436,12 +436,13 @@ class RedisIngestService(IngestServiceMeta):
         metadata_key = f"parent:{parent_job_id}:metadata"

         try:
-            # Store subjob IDs as a set
-            await self._run_bounded_to_thread(
-                self._ingest_client.get_client().sadd,
-                parent_key,
-                *subjob_ids,
-            )
+            # Store subjob IDs as a set (only if there are subjobs)
+            if subjob_ids:
+                await self._run_bounded_to_thread(
+                    self._ingest_client.get_client().sadd,
+                    parent_key,
+                    *subjob_ids,
+                )

             # Store metadata as hash (including original subjob ordering for deterministic fetches)
             metadata_to_store = dict(metadata)
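The guard matters because SADD needs at least one member: calling redis-py's sadd with an empty argument list produces an error rather than a no-op, which would break the new empty-subjob-list case. A standalone sketch of the same pattern (hypothetical key name and local Redis connection):

import redis

client = redis.Redis(host="localhost", port=6379)  # assumed local Redis

parent_key = "parent:job-123:subjobs"  # hypothetical key
subjob_ids: list[str] = []             # non-split PDF: nothing to record

if subjob_ids:
    # Only issue SADD when there is at least one member; an empty SADD
    # is rejected by Redis instead of silently doing nothing.
    client.sadd(parent_key, *subjob_ids)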
@@ -318,6 +318,7 @@ stages:
     config:
       api_key: $NGC_API_KEY|$NVIDIA_API_KEY
       model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
+      endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
       prompt: "Caption the content of this image:"
       replicas:
         min_replicas: 0
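The $ENV_VAR|"literal" values in this pipeline YAML are the environment-variable-with-fallback convention used throughout these config files: candidates are tried left to right and the first one that resolves wins, with a quoted literal as the final default. A rough illustration of that resolution logic (this resolver is illustrative, not nv-ingest's actual loader):

import os


def resolve_config_value(raw: str) -> str | None:
    """Return the first candidate that resolves: $VARs are looked up in the
    environment, quoted literals are used as-is. Illustrative only."""
    for candidate in raw.split("|"):
        candidate = candidate.strip()
        if candidate.startswith("$"):
            value = os.environ.get(candidate[1:])
            if value:
                return value
        else:
            return candidate.strip('"')
    return None


# With VLM_CAPTION_ENDPOINT unset, the quoted default is used:
print(resolve_config_value('$VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"'))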
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.10.22.dev20251022
+Version: 2025.10.24.dev20251024
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -9,7 +9,7 @@ nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19
 nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
 nv_ingest/api/v2/README.md,sha256=tbQOcD_67YWedboAcDRlZJgjvVZZTW1-ZodcqP0iynk,7133
 nv_ingest/api/v2/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/api/v2/ingest.py,sha256=v5l1c1BdmgyPqMzRj8CezI3dR6HpKOuevfomT1v4RGc,37313
+nv_ingest/api/v2/ingest.py,sha256=ja0sNV0muQxnYdcXO1VLUFoT3jb3Cg0XLdB3YhGc1ZI,40634
 nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/execution/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -103,7 +103,7 @@ nv_ingest/framework/util/flow_control/udf_intercept.py,sha256=zQ9uuCcHLEd0P52Eiw
 nv_ingest/framework/util/service/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/impl/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=OuGC3FFhkLQLR3x4s-tyxGguYYn8ORKr2xkzMy2br0g,22552
+nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py,sha256=cBR9G2YCcOtuzi9_6igleQK03CSpK1X6v5ibeSUijmo,22627
 nv_ingest/framework/util/service/meta/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
@@ -111,14 +111,14 @@ nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusX
 nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
 nv_ingest/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/default_libmode_pipeline_impl.py,sha256=MiyKe8RS18PNYwEVvrASiHFpynR_BavOe0hhVnUdbEc,15618
-nv_ingest/pipeline/default_pipeline_impl.py,sha256=irVm_wmJW5a7a3xTJd18AFZfwLheERkhCty-0XZrIMY,15288
+nv_ingest/pipeline/default_pipeline_impl.py,sha256=m1m5iK9zs91zF-I_kixJvbHsNzFena7QxnNAkS8P0OM,15368
 nv_ingest/pipeline/ingest_pipeline.py,sha256=wHAJhqAM2s8nbY-8itVogmSU-yVN4PZONGWcKnhzgfg,17794
 nv_ingest/pipeline/pipeline_schema.py,sha256=rLZZz2It2o2hVNWrZUJU8CarrqRei1fho3ZEMkkoBcg,17940
 nv_ingest/pipeline/config/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/pipeline/config/loaders.py,sha256=75Yr9WYO7j7ghvKTnYLfZXQZEH3J3VEZo5J4TunC_Us,7590
 nv_ingest/pipeline/config/replica_resolver.py,sha256=3zjh8gmepEYORFZRM4inq7GoBW0YL3gzUDiixUugjzQ,8899
-nv_ingest-2025.10.22.dev20251022.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest-2025.10.22.dev20251022.dist-info/METADATA,sha256=fBAiUkJijOoKO-QsdNYEpDF9X1ovQ2BBSBBhLP-Yykw,15122
-nv_ingest-2025.10.22.dev20251022.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-nv_ingest-2025.10.22.dev20251022.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
-nv_ingest-2025.10.22.dev20251022.dist-info/RECORD,,
+nv_ingest-2025.10.24.dev20251024.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.10.24.dev20251024.dist-info/METADATA,sha256=nrFpry5brNE51Hx3aydd_LiJt_4xAGUHDoL2QcndNOw,15122
+nv_ingest-2025.10.24.dev20251024.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest-2025.10.24.dev20251024.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.10.24.dev20251024.dist-info/RECORD,,