nv-ingest 2025.10.22.dev20251022__py3-none-any.whl → 2025.11.19.dev20251119__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- nv_ingest/api/v2/README.md +44 -18
- nv_ingest/api/v2/ingest.py +409 -57
- nv_ingest/framework/orchestration/process/dependent_services.py +17 -10
- nv_ingest/framework/orchestration/process/strategies.py +6 -2
- nv_ingest/framework/orchestration/process/termination.py +49 -9
- nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -2
- nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
- nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +41 -8
- nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +33 -11
- nv_ingest/pipeline/default_libmode_pipeline_impl.py +2 -2
- nv_ingest/pipeline/default_pipeline_impl.py +46 -21
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/METADATA +1 -2
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/RECORD +17 -16
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/WHEEL +0 -0
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/licenses/LICENSE +0 -0
- {nv_ingest-2025.10.22.dev20251022.dist-info → nv_ingest-2025.11.19.dev20251119.dist-info}/top_level.txt +0 -0
nv_ingest/api/v2/ingest.py
CHANGED
@@ -12,6 +12,9 @@ import logging
 import os
 import time
 import uuid
+import random
+from pathlib import Path
+import fsspec
 
 from fastapi import APIRouter, Request, Response
 from fastapi import HTTPException
@@ -20,6 +23,8 @@ from redis import RedisError
 
 from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
 from nv_ingest_api.util.service_clients.client_base import FetchMode
+from nv_ingest_api.util.dataloader.dataloader import DataLoader
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
 
 # For PDF splitting
 import pypdfium2 as pdfium
@@ -44,6 +49,42 @@ router = APIRouter()
 
 DEFAULT_PDF_SPLIT_PAGE_COUNT = 32
 
+# Default QoS thresholds (pages). Tunable via environment variables:
+# QOS_MAX_PAGES_MICRO, QOS_MAX_PAGES_SMALL, QOS_MAX_PAGES_MEDIUM
+_QOS_DEFAULTS = {
+    "micro": 8,
+    "small": 64,
+    "medium": 256,
+}
+
+
+def get_qos_tier_for_page_count(page_count: int) -> str:
+    """
+    Select QoS tier for a document based on its total page count.
+    Tiers: 'micro', 'small', 'medium', 'large', 'default'
+    Thresholds can be tuned via environment variables:
+    - QOS_MAX_PAGES_MICRO (default: 4)
+    - QOS_MAX_PAGES_SMALL (default: 16)
+    - QOS_MAX_PAGES_MEDIUM (default: 64)
+    Anything above MEDIUM is 'large'. Non-positive page_count returns 'default'.
+    """
+    try:
+        micro_max = int(os.getenv("QOS_MAX_PAGES_MICRO", str(_QOS_DEFAULTS["micro"])))
+        small_max = int(os.getenv("QOS_MAX_PAGES_SMALL", str(_QOS_DEFAULTS["small"])))
+        medium_max = int(os.getenv("QOS_MAX_PAGES_MEDIUM", str(_QOS_DEFAULTS["medium"])))
+    except ValueError:
+        micro_max, small_max, medium_max = _QOS_DEFAULTS["micro"], _QOS_DEFAULTS["small"], _QOS_DEFAULTS["medium"]
+
+    if page_count <= 0:
+        return "default"
+    if page_count <= micro_max:
+        return "micro"
+    if page_count <= small_max:
+        return "small"
+    if page_count <= medium_max:
+        return "medium"
+    return "large"
+
 
 def get_pdf_split_page_count(client_override: Optional[int] = None) -> int:
     """
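The QoS tier is computed at request time from environment variables, so the cut-offs can be retuned per deployment without code changes. A minimal usage sketch, assuming the wheel and the API module's dependencies are importable in your environment (the module path follows the file path `nv_ingest/api/v2/ingest.py`; the override value below is made up):

```python
import os

# Override one threshold before calling; the function re-reads the env vars on every call.
os.environ["QOS_MAX_PAGES_MEDIUM"] = "128"

from nv_ingest.api.v2.ingest import get_qos_tier_for_page_count

print(get_qos_tier_for_page_count(3))    # 'micro'   (<= 8, the shipped default)
print(get_qos_tier_for_page_count(100))  # 'medium'  (> 64 but <= 128 after the override)
print(get_qos_tier_for_page_count(500))  # 'large'
print(get_qos_tier_for_page_count(0))    # 'default' (non-positive page counts)
```

Note that the docstring in the diff still quotes older defaults (4/16/64), while the shipped `_QOS_DEFAULTS` values are 8/64/256; the comments above follow the shipped values.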
@@ -151,28 +192,42 @@ def get_pdf_page_count(pdf_content: bytes) -> int:
         return 1  # Assume single page on error
 
 
-def
+def _create_subjob_dict(
+    job_id: str,
+    job_payload: Dict[str, Any],
     job_spec_template: Dict[str, Any],
-    chunk: Dict[str, Any],
-    *,
-    parent_uuid: uuid.UUID,
-    parent_job_id: str,
     current_trace_id: int,
-
-
-) ->
-
-
-    chunk_number = chunk["chunk_index"] + 1
-    start_page = chunk["start_page"]
-    end_page = chunk["end_page"]
-
-    subjob_spec = {
+    parent_job_id: str,
+    start_key: Dict[str, Any],
+) -> Dict[str, Any]:
+    job_spec = {
         key: value
         for key, value in job_spec_template.items()
         if key not in {"job_payload", "job_id", "tracing_options"}
     }
+    job_spec["job_payload"] = job_payload
+    job_spec["job_id"] = job_id
 
+    base_tracing_options = job_spec_template.get("tracing_options") or {}
+    tracing_options = dict(base_tracing_options)
+    tracing_options.setdefault("trace", True)
+    tracing_options["trace_id"] = str(current_trace_id)
+    tracing_options["ts_send"] = int(time.time() * 1000)
+    tracing_options["parent_job_id"] = parent_job_id
+    for key, value in start_key.items():
+        tracing_options[key] = value
+
+    job_spec["tracing_options"] = tracing_options
+    return job_spec
+
+
+def _create_payload_dict(
+    job_spec_template: Dict[str, Any],
+    content: str,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Dict[str, Any]:
     subjob_payload_template = job_spec_template.get("job_payload", {})
     subjob_payload = {
         key: value
@@ -180,27 +235,40 @@ _prepare_chunk_submission(
         if key not in {"content", "source_id", "source_name"}
     }
 
-
-    subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
+    subjob_payload["content"] = [content]
 
-
-    subjob_payload["
-    subjob_payload["
+    subjob_payload["source_id"] = [source_id]
+    subjob_payload["source_name"] = [source_name]
+    subjob_payload["document_type"] = [document_type]
+    return subjob_payload
+
+
+def _prepare_chunk_submission(
+    job_spec_template: Dict[str, Any],
+    chunk: Dict[str, Any],
+    *,
+    parent_uuid: uuid.UUID,
+    parent_job_id: str,
+    current_trace_id: int,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Tuple[str, MessageWrapper]:
+    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
+
+    chunk_number = chunk["chunk_index"] + 1
 
     subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
     subjob_id = str(subjob_uuid)
-    subjob_spec["job_payload"] = subjob_payload
-    subjob_spec["job_id"] = subjob_id
 
-
-
-
-
-    tracing_options["ts_send"] = int(time.time() * 1000)
-    tracing_options["parent_job_id"] = parent_job_id
-    tracing_options["page_num"] = start_page
+    subjob_payload_template = job_spec_template.get("job_payload", {})
+    chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
+    subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
+    start = chunk["start_page"] if "start_page" in chunk else chunk["start"]
 
-    subjob_spec
+    subjob_spec = _create_subjob_dict(
+        subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
+    )
 
     return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
 
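Taken together, chunk submission is now three small pieces: `_create_payload_dict` swaps in the chunk's content and source fields, `_create_subjob_dict` copies the remaining template keys and attaches per-chunk tracing options, and `_prepare_chunk_submission` wires them together. A rough sketch of the shape of a resulting subjob spec for the second chunk of a hypothetical `report.pdf` (the field names follow the helpers above; every concrete value is illustrative, not taken from the package):

```python
# Shape of a subjob spec produced by the helpers above; all values are made up.
subjob_spec = {
    # Every top-level key of the parent spec except job_payload/job_id/tracing_options
    # is copied through unchanged (e.g. "tasks").
    "tasks": ["<tasks copied verbatim from the parent job spec>"],
    "job_id": "<uuid5(parent_uuid, 'chunk-2')>",
    "job_payload": {
        # Payload keys other than content/source_id/source_name are kept,
        # then the chunk-specific values are substituted in.
        "content": ["<base64 of pages 33-64>"],
        "source_id": ["report.pdf#pages_33-64"],
        "source_name": ["report.pdf#pages_33-64"],
        "document_type": ["pdf"],
    },
    "tracing_options": {
        "trace": True,
        "trace_id": "<parent trace id>",
        "ts_send": 1700000000000,          # int(time.time() * 1000) at submission
        "parent_job_id": "<parent job id>",
        "page_num": 33,                    # chunk's starting page, carried via start_key
    },
}
```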
@@ -432,6 +500,76 @@ def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, A
     return trace_dict, annotations_dict
 
 
+def _normalize_chunk_records(
+    records: Optional[List[Any]],
+    descriptor: Dict[str, Any],
+    parent_metadata: Dict[str, Any],
+) -> List[Any]:
+    """Re-map chunk-local metadata to document-level context for aggregation."""
+
+    if not isinstance(records, list):
+        return []
+
+    total_pages = parent_metadata.get("total_pages")
+    original_source_id = parent_metadata.get("original_source_id")
+    original_source_name = parent_metadata.get("original_source_name")
+
+    start_page = descriptor.get("start_page")
+    page_offset = start_page - 1 if isinstance(start_page, int) and start_page > 0 else 0
+
+    normalized_entries: List[Any] = []
+
+    for entry in records:
+        if not isinstance(entry, dict):
+            normalized_entries.append(entry)
+            continue
+
+        normalized_entry = entry.copy()
+        original_metadata = entry.get("metadata")
+
+        if isinstance(original_metadata, dict):
+            normalized_metadata = original_metadata.copy()
+            normalized_entry["metadata"] = normalized_metadata
+
+            original_source_meta = original_metadata.get("source_metadata")
+            if isinstance(original_source_meta, dict):
+                normalized_source_meta = original_source_meta.copy()
+                normalized_metadata["source_metadata"] = normalized_source_meta
+
+                if original_source_id:
+                    normalized_source_meta["source_id"] = original_source_id
+                if original_source_name:
+                    normalized_source_meta["source_name"] = original_source_name
+
+            original_content_meta = original_metadata.get("content_metadata")
+            if isinstance(original_content_meta, dict):
+                normalized_content_meta = original_content_meta.copy()
+                normalized_metadata["content_metadata"] = normalized_content_meta
+
+                page_number = normalized_content_meta.get("page_number")
+                if isinstance(page_number, int) and page_number >= 0:
+                    normalized_content_meta["page_number"] = page_number + page_offset
+
+                if isinstance(total_pages, int) and isinstance(normalized_content_meta.get("page_count"), int):
+                    # Ensure optional per-record page count reflects the full document
+                    normalized_content_meta["page_count"] = total_pages
+
+                original_hierarchy = original_content_meta.get("hierarchy")
+                if isinstance(original_hierarchy, dict):
+                    normalized_hierarchy = original_hierarchy.copy()
+                    normalized_content_meta["hierarchy"] = normalized_hierarchy
+
+                    hierarchy_page = normalized_hierarchy.get("page")
+                    if isinstance(hierarchy_page, int) and hierarchy_page >= 0:
+                        normalized_hierarchy["page"] = hierarchy_page + page_offset
+                    if isinstance(total_pages, int):
+                        normalized_hierarchy["page_count"] = total_pages
+
+        normalized_entries.append(normalized_entry)
+
+    return normalized_entries
+
+
 def _aggregate_parent_traces(chunk_traces: Dict[str, Any]) -> Dict[str, Any]:
     """
     Aggregate chunk-level traces into parent-level metrics.
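The normalization is mostly offset arithmetic: each chunk reports chunk-local page numbers, and the parent descriptor says where the chunk starts in the full document, so every page-level field is shifted by `start_page - 1` and the synthetic per-chunk source id/name are rewritten back to the originals. A tiny worked illustration (all values made up):

```python
# Illustration of the offset math in _normalize_chunk_records above.
descriptor = {"start_page": 33, "end_page": 64}            # chunk 2 of a 100-page PDF
parent_metadata = {
    "total_pages": 100,
    "original_source_id": "report.pdf",
    "original_source_name": "report.pdf",
}

page_offset = descriptor["start_page"] - 1                 # 32

# A record extracted from chunk-local page 2 is re-attributed to document page 34,
# its page_count/hierarchy page_count become 100, and its source fields are rewritten
# from "report.pdf#pages_33-64" back to "report.pdf".
chunk_local_page = 2
document_page = chunk_local_page + page_offset
print(document_page)                                       # 34
```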
@@ -574,7 +712,8 @@ def _build_aggregated_response(
         if result is not None:
             # Add page data to aggregated result
             if "data" in result:
-
+                normalized_records = _normalize_chunk_records(result.get("data"), descriptor, metadata)
+                aggregated_result["data"].extend(normalized_records)
             chunk_entry = dict(descriptor)
             aggregated_result["metadata"]["chunks"].append(chunk_entry)
 
@@ -631,6 +770,51 @@ def _build_aggregated_response(
     return aggregated_result
 
 
+# ---------------------------------------------------------------------------
+# Bursty submission helpers (fairness without long-lived in-flight tasks)
+# ---------------------------------------------------------------------------
+
+
+def _get_submit_burst_params() -> Tuple[int, int, int]:
+    """
+    Returns (burst_size, pause_ms, jitter_ms) from environment with sane defaults.
+    - V2_SUBMIT_BURST_SIZE (default: 16)
+    - V2_SUBMIT_BURST_PAUSE_MS (default: 25)
+    - V2_SUBMIT_BURST_JITTER_MS (default: 10)
+    """
+    burst_size = int(os.getenv("V2_SUBMIT_BURST_SIZE", "16"))
+    pause_ms = int(os.getenv("V2_SUBMIT_BURST_PAUSE_MS", "50"))
+    jitter_ms = int(os.getenv("V2_SUBMIT_BURST_JITTER_MS", "15"))
+
+    return max(1, burst_size), max(0, pause_ms), max(0, jitter_ms)
+
+
+async def _submit_subjobs_in_bursts(
+    items: List[Tuple[str, MessageWrapper]],
+    ingest_service: "INGEST_SERVICE_T",
+    *,
+    burst_size: int,
+    pause_ms: int,
+    jitter_ms: int,
+) -> None:
+    """
+    Submit subjobs in sequential bursts and await each burst to completion.
+    This avoids keeping a large number of pending tasks in the REST handler
+    and allows other concurrent requests to interleave enqueue work between bursts.
+    """
+    for offset in range(0, len(items), burst_size):
+        burst = items[offset : offset + burst_size]
+        tasks = [ingest_service.submit_job(wrapper, subjob_id) for (subjob_id, wrapper) in burst]
+        # Propagate any errors from this burst
+        await asyncio.gather(*tasks)
+
+        # Pause with jitter to yield to other request handlers before next burst
+        if offset + burst_size < len(items):
+            delay_ms = pause_ms + (random.randint(0, jitter_ms) if jitter_ms > 0 else 0)
+            if delay_ms > 0:
+                await asyncio.sleep(delay_ms / 1000.0)
+
+
 # POST /v2/submit_job
 @router.post(
     "/submit_job",
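The burst helpers bound how long a single large document can monopolize the enqueue path: subjobs go out in groups of `V2_SUBMIT_BURST_SIZE`, and between groups the handler sleeps for a pause plus jitter so other requests can interleave. Note the docstring quotes 25 ms / 10 ms while the code falls back to 50 ms / 15 ms; the back-of-envelope sketch below uses the code's fallbacks (the helper name and numbers here are illustrative only):

```python
import math


def burst_schedule(num_subjobs: int, burst_size: int = 16, pause_ms: int = 50, jitter_ms: int = 15):
    """Rough worst-case view of the bursty submission pattern under the code's fallbacks."""
    bursts = math.ceil(num_subjobs / burst_size)
    pauses = max(0, bursts - 1)                      # no pause after the final burst
    worst_case_delay_ms = pauses * (pause_ms + jitter_ms)
    return bursts, worst_case_delay_ms


print(burst_schedule(100))   # (7, 390): 7 bursts, at most ~0.39 s of deliberate pauses
```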
@@ -648,6 +832,8 @@ async def submit_job_v2(
     request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
 ):
     span = trace.get_current_span()
+    source_id = None
+    document_type = None
     try:
         span.add_event("Submitting file for processing (V2)")
 
@@ -672,28 +858,45 @@ async def submit_job_v2(
         original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
         original_source_name = source_names[0] if source_names else "unknown_source.pdf"
 
+        # Track page count for all PDFs (used for both splitting logic and metadata)
+        pdf_page_count_cache = None
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        subjob_ids: List[str] = []
+        subjob_descriptors: List[Dict[str, Any]] = []
+        parent_metadata: Dict[str, Any] = {}
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        try:
+            parent_uuid = uuid.UUID(parent_job_id)
+        except ValueError:
+            logger.warning(
+                "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
+                parent_job_id,
+            )
+            parent_uuid = uuid.uuid4()
         # Check if this is a PDF that needs splitting
         if document_types and payloads and document_types[0].lower() == "pdf":
             # Decode the payload to check page count
             pdf_content = base64.b64decode(payloads[0])
             page_count = get_pdf_page_count(pdf_content)
+            pdf_page_count_cache = page_count  # Cache for later use
+            qos_tier = get_qos_tier_for_page_count(page_count)
             pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
+            document_type = DocumentTypeEnum.PDF
 
             # Split if the document has more pages than our chunk size
             if page_count > pages_per_chunk:
                 logger.warning(
-                    "Splitting PDF %s into %s-page chunks (total pages: %s)",
+                    "Splitting PDF %s into %s-page chunks (total pages: %s) -> (qos_tier: %s)",
                     original_source_name,
                     pages_per_chunk,
                     page_count,
+                    qos_tier,
                 )
-
                 chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)
 
                 subjob_ids: List[str] = []
                 subjob_descriptors: List[Dict[str, Any]] = []
-
-
+                submission_items: List[Tuple[str, MessageWrapper]] = []
                 try:
                     parent_uuid = uuid.UUID(parent_job_id)
                 except ValueError:
@@ -704,16 +907,34 @@ async def submit_job_v2(
                     parent_uuid = uuid.uuid4()
 
                 for chunk in chunks:
+                    start = chunk["start_page"]
+                    end = chunk["end_page"]
+                    page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
+                    source_id = f"{original_source_id}#{page_suffix}"
+                    source_name = f"{original_source_name}#{page_suffix}"
                     subjob_id, subjob_wrapper = _prepare_chunk_submission(
                         job_spec_dict,
                         chunk,
+                        document_type=DocumentTypeEnum.PDF,
                         parent_uuid=parent_uuid,
                         parent_job_id=parent_job_id,
                         current_trace_id=current_trace_id,
-
-
+                        source_id=source_id,
+                        source_name=source_name,
                     )
-
+
+                    # Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
+                    try:
+                        sub_spec = json.loads(subjob_wrapper.payload)
+                        routing_opts = sub_spec.get("routing_options") or {}
+                        routing_opts["queue_hint"] = qos_tier
+                        sub_spec["routing_options"] = routing_opts
+                        subjob_wrapper = MessageWrapper(payload=json.dumps(sub_spec))
+                    except Exception:
+                        # Best-effort; if we cannot inject, fall back to default routing
+                        pass
+
+                    submission_items.append((subjob_id, subjob_wrapper))
                     subjob_ids.append(subjob_id)
                     subjob_descriptors.append(
                         {
@@ -724,36 +945,113 @@ async def submit_job_v2(
                             "page_count": chunk.get("page_count"),
                         }
                     )
+                parent_metadata.update(
+                    {
+                        "total_pages": page_count,
+                        "pages_per_chunk": pages_per_chunk,
+                        "original_source_id": original_source_id,
+                        "original_source_name": original_source_name,
+                        "document_type": document_types[0] if document_types else "pdf",
+                        "subjob_order": subjob_ids,
+                    }
+                )
+        elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
+            document_type = document_types[0]
+            upload_path = f"./{Path(original_source_id).name}"
+            # dump the payload to a file, just came from client
+            with fsspec.open(upload_path, "wb") as f:
+                f.write(base64.b64decode(payloads[0]))
+            dataloader = DataLoader(
+                path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
+            )
+            document_type = DocumentTypeEnum.MP3
+
+            parent_uuid = uuid.UUID(parent_job_id)
+            for task in job_spec_dict["tasks"]:
+                if "task_properties" in task and "document_type" in task["task_properties"]:
+                    task["task_properties"]["document_type"] = document_type
+            end = 0
+            for idx, (file_path, duration) in enumerate(dataloader.files_completed):
+                start = end
+                end = int(start + duration)
+                chunk = {
+                    "bytes": file_path.encode("utf-8"),
+                    "chunk_index": idx,
+                    "start": start,
+                    "end": end,
+                }
+
+                subjob_id, subjob_wrapper = _prepare_chunk_submission(
+                    job_spec_dict,
+                    chunk,
+                    parent_uuid=parent_uuid,
+                    parent_job_id=parent_job_id,
+                    current_trace_id=current_trace_id,
+                    source_id=file_path,
+                    source_name=upload_path,
+                    document_type=document_type,
+                )
 
-
-
+                submission_items.append((subjob_id, subjob_wrapper))
+                subjob_ids.append(subjob_id)
+                subjob_descriptors.append(
+                    {
+                        "job_id": subjob_id,
+                        "chunk_index": idx + 1,
+                        "start_page": chunk.get("start"),
+                        "end_page": chunk.get("end"),
+                        "page_count": chunk.get("page_count", 0),
+                    }
+                )
+            logger.error(f"Removing uploaded file {upload_path}")
+            os.remove(upload_path)
+
+        if submission_items:
+            burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
+            await _submit_subjobs_in_bursts(
+                submission_items,
+                ingest_service,
+                burst_size=burst_size,
+                pause_ms=pause_ms,
+                jitter_ms=jitter_ms,
+            )
 
-
-
-                    "pages_per_chunk": pages_per_chunk,
+            parent_metadata.update(
+                {
                     "original_source_id": original_source_id,
                     "original_source_name": original_source_name,
-                    "document_type":
+                    "document_type": document_type,
                     "subjob_order": subjob_ids,
                 }
+            )
+            # raise ValueError(f"Setting parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+            await ingest_service.set_parent_job_mapping(
+                parent_job_id,
+                subjob_ids,
+                parent_metadata,
+                subjob_descriptors=subjob_descriptors,
+            )
 
-
-                    parent_job_id,
-                    subjob_ids,
-                    parent_metadata,
-                    subjob_descriptors=subjob_descriptors,
-                )
-
-                await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+            await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
 
-
-
-
+            span.add_event(f"Split into {len(subjob_ids)} subjobs")
+            response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+            return parent_job_id
 
         # For non-PDFs or cases where splitting is not required, submit as normal
         if "tracing_options" not in job_spec_dict:
            job_spec_dict["tracing_options"] = {"trace": True}
         job_spec_dict["tracing_options"]["trace_id"] = str(current_trace_id)
+        # If this was a PDF and we computed page_count, route the single job using the same QoS tier
+        try:
+            if (
+                document_types
+                and document_types[0].lower() == "pdf"
+                and "queue_hint" not in (job_spec_dict.get("routing_options") or {})
+            ):
+                job_spec_dict.setdefault("routing_options", {})["queue_hint"] = qos_tier
+        except Exception:
+            pass
         updated_job_spec = MessageWrapper(payload=json.dumps(job_spec_dict))
 
         span.add_event("Submitting as single job (no split needed)")
@@ -762,12 +1060,40 @@ async def submit_job_v2(
         await ingest_service.submit_job(updated_job_spec, parent_job_id)
         await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
 
+        # If this was a PDF (even if not split), store page count metadata for tracking
+        if pdf_page_count_cache is not None:
+            try:
+                # Use cached page count from earlier check to avoid re-decoding
+                # Store minimal metadata for non-split PDFs (consistent with split PDFs)
+                single_pdf_metadata: Dict[str, Any] = {
+                    "total_pages": pdf_page_count_cache,
+                    "pages_per_chunk": pdf_page_count_cache,  # Single chunk = entire document
+                    "original_source_id": original_source_id,
+                    "original_source_name": original_source_name,
+                    "document_type": document_types[0],
+                    "subjob_order": [],  # No subjobs for non-split PDFs
+                }
+
+                # Store as parent job metadata with empty subjob list for consistency
+                await ingest_service.set_parent_job_mapping(
+                    parent_job_id,
+                    [],  # Empty subjob list
+                    single_pdf_metadata,
+                    subjob_descriptors=[],
+                )
+                logger.debug(
+                    f"Stored page count metadata for non-split PDF {original_source_name}: {pdf_page_count_cache} pages"
+                )
+            except Exception as metadata_err:
+                # Don't fail the job if metadata storage fails
+                logger.warning(f"Failed to store page count metadata for {parent_job_id}: {metadata_err}")
+
         response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
         return parent_job_id
 
     except Exception as ex:
-        logger.exception(f"Error submitting job: {str(ex)}")
-        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+        logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
+        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")
 
 
 # GET /v2/fetch_job
@@ -898,6 +1224,32 @@ async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):
 
         logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")
 
+        # Special case: Non-split PDFs have metadata but no subjobs
+        # Fetch the result directly and augment with page count metadata
+        if len(subjob_ids) == 0:
+            logger.debug(f"Job {job_id} is a non-split PDF, fetching result directly")
+            try:
+                job_response = await ingest_service.fetch_job(job_id)
+
+                # Augment response with page count metadata
+                if isinstance(job_response, dict):
+                    if "metadata" not in job_response:
+                        job_response["metadata"] = {}
+                    job_response["metadata"]["total_pages"] = metadata.get("total_pages")
+                    job_response["metadata"]["original_source_id"] = metadata.get("original_source_id")
+                    job_response["metadata"]["original_source_name"] = metadata.get("original_source_name")
+
+                # Update job state after successful fetch
+                await _update_job_state_after_fetch(job_id, ingest_service)
+
+                return _stream_json_response(job_response)
+            except (TimeoutError, RedisError, ConnectionError):
+                logger.debug(f"Job {job_id} (non-split PDF) not ready yet")
+                raise HTTPException(status_code=202, detail="Job is processing. Retry later.")
+            except Exception as e:
+                logger.exception(f"Error fetching non-split PDF job {job_id}: {e}")
+                raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
+
         # Build ordered descriptors for subjobs
         stored_descriptors = subjob_info.get("subjob_descriptors") or []
         descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
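With this special case, a parent job whose subjob list is empty (a PDF that fit in one chunk) is fetched directly and its response is annotated with `total_pages` and the original source fields, while a 202 still means "retry later". A rough client-side polling sketch; the base URL, port, and exact route shape are assumptions based on the `# GET /v2/fetch_job` comment above, not taken from the package:

```python
import time

import requests  # any HTTP client works; requests is assumed here for brevity


def fetch_v2_result(job_id: str, base_url: str = "http://localhost:7670", timeout_s: float = 300.0) -> dict:
    """Poll the v2 fetch endpoint until the aggregated result is ready (HTTP 200)."""
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        resp = requests.get(f"{base_url}/v2/fetch_job/{job_id}", timeout=30)
        if resp.status_code == 200:
            body = resp.json()
            # Non-split PDFs now carry document-level metadata injected by the handler.
            print(body.get("metadata", {}).get("total_pages"))
            return body
        if resp.status_code == 202:
            time.sleep(2)  # job still processing; retry later
            continue
        resp.raise_for_status()
    raise TimeoutError(f"Job {job_id} not ready within {timeout_s}s")
```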
nv_ingest/framework/orchestration/process/dependent_services.py
CHANGED

@@ -18,6 +18,18 @@ from nv_ingest_api.util.message_brokers.simple_message_broker.broker import SimpleMessageBroker
 logger = logging.getLogger(__name__)
 
 
+def _broker_server_target(host, port, max_queue_size):
+    """
+    Target function to be run in a separate process for the SimpleMessageBroker.
+    """
+    server = SimpleMessageBroker(host, port, max_queue_size)
+    try:
+        server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
+    except Exception:
+        pass
+    server.serve_forever()
+
+
 def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
     """
     Starts a SimpleMessageBroker server in a separate process.
@@ -58,16 +70,11 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
             f"continuing to spawn a broker process (tests expect a Process to be returned)"
         )
 
-
-
-
-
-
-        except Exception:
-            pass
-        server.serve_forever()
-
-    p = multiprocessing.Process(target=broker_server)
+    p = multiprocessing.Process(
+        target=_broker_server_target,
+        args=(server_host, server_port, max_queue_size),
+        daemon=True,
+    )
     # If we're launching from inside the pipeline subprocess, mark daemon so the
     # broker dies automatically when the subprocess exits.
     p.daemon = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1"
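The broker refactor replaces a closure defined inside `start_simple_message_broker` with the module-level `_broker_server_target`, passing host, port, and queue size as plain arguments. That matters because only picklable targets survive the `spawn` start method; a nested function works only under `fork`. A minimal sketch of the same pattern (names and port are illustrative, not from the package):

```python
import multiprocessing


def _worker(host: str, port: int) -> None:
    # Module-level target with plain arguments: picklable, so it works under
    # both the "fork" and "spawn" start methods.
    print(f"serving on {host}:{port}")


if __name__ == "__main__":
    ctx = multiprocessing.get_context("spawn")
    p = ctx.Process(target=_worker, args=("127.0.0.1", 7671), daemon=True)
    p.start()
    p.join()
```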
nv_ingest/framework/orchestration/process/strategies.py
CHANGED

@@ -11,9 +11,10 @@ Strategy pattern for clean separation of execution concerns.
 """
 
 import atexit
-import os
 import logging
 import multiprocessing
+import os
+import sys
 import time
 from abc import ABC, abstractmethod
 
@@ -132,7 +133,10 @@ class SubprocessStrategy(ProcessExecutionStrategy):
         logger.info("Launching pipeline in Python subprocess using multiprocessing.")
 
         # Create subprocess using fork context
-
+        start_method = "fork"
+        if sys.platform.lower() == "darwin":
+            start_method = "spawn"
+        ctx = multiprocessing.get_context(start_method)
         process = ctx.Process(
             target=run_pipeline_process,
             args=(