PyPI - nv-ingest - Versions diffs - 2025.5.22.dev20250522__py3-none-any.whl → 2025.5.29.dev20250529__py3-none-any.whl - Mend

nv-ingest 2025.5.22.dev20250522__py3-none-any.whl → 2025.5.29.dev20250529__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest might be problematic. Click here for more details.

Files changed (15) hide show

nv_ingest/api/main.py CHANGED Viewed

@@ -14,6 +14,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
 from .v1.health import router as HealthApiRouter
 from .v1.ingest import router as IngestApiRouter
+from .v1.metrics import router as MetricsApiRouter
 logger = logging.getLogger(__name__)
@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
 app = FastAPI(
     title="NV-Ingest Microservice",
     description="Service for ingesting heterogenous datatypes",
-    version="25.3.0",
+    version="25.4.2",
     contact={
         "name": "NVIDIA Corporation",
         "url": "https://nvidia.com",
@@ -31,6 +32,7 @@ app = FastAPI(
 app.include_router(IngestApiRouter, prefix="/v1")
 app.include_router(HealthApiRouter, prefix="/v1/health")
+app.include_router(MetricsApiRouter, prefix="/v1")
 # Set up the tracer provider and add a processor for exporting traces
 resource = Resource(attributes={"service.name": "nv-ingest"})

nv_ingest/api/v1/metrics.py ADDED Viewed

@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+from fastapi import APIRouter, Response, status
+from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
+# Router carrying the Prometheus scrape endpoint defined below.
+router = APIRouter()
+# logger = logging.getLogger("uvicorn")
+logger = logging.getLogger(__name__)
+# Prometheus metrics, both labelled by HTTP method and endpoint.
+# NOTE(review): nothing in this module increments REQUEST_COUNT or observes REQUEST_LATENCY;
+# confirm they are updated elsewhere (e.g. by request middleware), otherwise they never
+# record any requests.
+REQUEST_COUNT = Counter("http_requests_total", "Total HTTP Requests", ["method", "endpoint"])
+REQUEST_LATENCY = Histogram("http_request_duration_seconds", "Request latency", ["method", "endpoint"])
+@router.get(
+    "/metrics",
+    tags=["Health"],
+    summary="Provide prometheus formatted metrics for consumption",
+    description="""
+        Provide prometheus formatted metrics for consumption by a prometheus scraping server.
+    """,
+    status_code=status.HTTP_200_OK,
+)
+def metrics():
+    """
+    Return the current Prometheus metrics as an HTTP response.
+    The body is whatever prometheus_client.generate_latest() produces, served with
+    CONTENT_TYPE_LATEST as the media type so a scraper can parse it.
+    """
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)

nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py CHANGED Viewed

@@ -74,9 +74,26 @@ class PipelineTopology:
         self._start_cleanup_thread()  # Start background cleanup on init
     def __del__(self):
-        """Ensure cleanup thread is stopped when topology object is destroyed."""
-        logger.debug("PipelineTopology destructor called, ensuring cleanup thread is stopped.")
-        self._stop_cleanup_thread()
+        """Ensure cleanup thread is stopped and internal actor references are released."""
+        # Best-effort finalizer: each step below has its own try/except, so an error is logged
+        # as a warning instead of propagating, and a failure to stop the thread does not skip
+        # clearing the containers.
+        logger.debug("PipelineTopology destructor called. Cleaning up thread and actor references.")
+        # Stop the background cleanup thread
+        try:
+            self._stop_cleanup_thread()
+        except Exception as e:
+            logger.warning(f"Error stopping cleanup thread during __del__: {e}")
+        # Clear references to actor handles and shutdown futures
+        # (a single try covers all clears: the first one that raises skips the remaining ones)
+        try:
+            self._stage_actors.clear()
+            self._edge_queues.clear()
+            self._scaling_state.clear()
+            self._stage_memory_overhead.clear()
+            self._pending_removal_actors.clear()
+            self._stages.clear()
+            self._connections.clear()
+        except Exception as e:
+            logger.warning(f"Error clearing internal state during __del__: {e}")
     # --- Lock Context Manager ---
     @contextlib.contextmanager

nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py CHANGED Viewed

@@ -2,7 +2,11 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+import multiprocessing
+import os
+import signal
 import threading
+from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass
@@ -24,6 +28,35 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import P
 logger = logging.getLogger(__name__)
+class PipelineInterface(ABC):
+    """
+    Abstract base class for pipeline implementations.
+    Any concrete pipeline must implement start and stop methods; both are declared with
+    @abstractmethod, so a subclass that omits either one cannot be instantiated.
+    """
+    @abstractmethod
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Start the pipeline.
+        Parameters
+        ----------
+        monitor_poll_interval : float
+            Interval in seconds for monitoring poll (default: 5.0).
+        scaling_poll_interval : float
+            Interval in seconds for scaling decisions (default: 30.0).
+        """
+        # Abstract placeholder; concrete subclasses provide the behavior.
+        pass
+    @abstractmethod
+    def stop(self) -> None:
+        """
+        Stop the pipeline and perform any necessary cleanup.
+        """
+        # Abstract placeholder; concrete subclasses provide the behavior.
+        pass
 # --- Configuration Objects ---
@@ -62,7 +95,90 @@ class StatsConfig:
     queue_timeout_seconds: float = 2.0
-class RayPipeline:
+class RayPipelineSubprocessInterface(PipelineInterface):
+    """
+    Pipeline interface implementation for a subprocess-based Ray pipeline.
+    Wraps an existing multiprocessing.Process handle; only stop() does any work.
+    """
+    def __init__(self, process: multiprocessing.Process):
+        """
+        Parameters
+        ----------
+        process : multiprocessing.Process
+            A handle to the running subprocess.
+        """
+        self._process: multiprocessing.Process = process
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        No-op. Start is not supported because the subprocess is assumed to already be running.
+        Both arguments are accepted only to satisfy PipelineInterface and are ignored.
+        """
+        pass
+    def stop(self) -> None:
+        """
+        Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
+        Returns immediately if the process is not alive. Otherwise calls terminate() and waits up
+        to 5 seconds; if the process is still alive after that, sends SIGKILL to its process group
+        and waits up to 3 more seconds. Exceptions from the terminate/join step and from the kill
+        step are caught and logged rather than raised.
+        """
+        if not self._process.is_alive():
+            return
+        try:
+            self._process.terminate()
+            self._process.join(timeout=5.0)
+        except Exception as e:
+            logger.warning(f"Failed to terminate process cleanly: {e}")
+        if self._process.is_alive():
+            try:
+                # NOTE(review): killpg signals every process in the group. This presumably relies
+                # on the subprocess running in its own process group (e.g. os.setsid in the child);
+                # if it still shares the caller's group, the caller would be killed too. TODO confirm.
+                pgid = os.getpgid(self._process.pid)
+                os.killpg(pgid, signal.SIGKILL)
+            except Exception as e:
+                logger.error(f"Failed to force-kill process group: {e}")
+            # Wait briefly for the kill to take effect; the result is not checked.
+            self._process.join(timeout=3.0)
+class RayPipelineInterface(PipelineInterface):
+    """
+    Pipeline interface for an in-process RayPipeline instance.
+    """
+    def __init__(self, pipeline: "RayPipeline"):
+        """
+        Parameters
+        ----------
+        pipeline : RayPipeline
+            The instantiated pipeline to control.
+        """
+        self._pipeline = pipeline
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Starts the RayPipeline.
+        Parameters
+        ----------
+        monitor_poll_interval : float
+            Forwarded unchanged to the wrapped pipeline's start() (default: 5.0).
+        scaling_poll_interval : float
+            Forwarded unchanged to the wrapped pipeline's start() (default: 30.0).
+        """
+        self._pipeline.start(monitor_poll_interval, scaling_poll_interval)
+    def stop(self) -> None:
+        """
+        Stops the RayPipeline and shuts down Ray.
+        An exception from the wrapped pipeline's stop() propagates to the caller; in that case
+        ray.shutdown() is not reached.
+        """
+        self._pipeline.stop()
+        try:
+            # Imported lazily inside the try so an import failure is ignored as well.
+            import ray
+            ray.shutdown()
+        except Exception:
+            # Deliberately best-effort: any error while shutting Ray down is silently dropped.
+            # NOTE(review): nothing is logged here, so a failed shutdown leaves no trace.
+            pass
+class RayPipeline(PipelineInterface):
     """
     A structured pipeline supporting dynamic scaling and queue flushing.
     Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
@@ -151,10 +267,17 @@ class RayPipeline:
             actor_timeout=self.stats_config.actor_timeout_seconds,
             queue_timeout=self.stats_config.queue_timeout_seconds,
         )
         logger.info("RayStatsCollector initialized using StatsConfig.")
     # --- Accessor Methods for Stats Collector (and internal use) ---
+    def __del__(self):
+        """Best-effort finalizer: call stop() and log, rather than raise, any exception."""
+        try:
+            self.stop()
+        except Exception as e:
+            logger.error(f"Exception during RayPipeline cleanup: {e}")
     def get_stages_info(self) -> List[StageInfo]:
         """Returns a snapshot of the current stage information."""
         return self.topology.get_stages_info()
@@ -516,7 +639,9 @@ class RayPipeline:
         """
         current_count = len(current_replicas)
         num_to_remove = current_count - target_count
-        logger.info(f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove}).")
+        logger.debug(
+            f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
+        )
         # Basic validation
         if num_to_remove <= 0:
@@ -564,7 +689,7 @@ class RayPipeline:
             logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")
         total_attempted = len(actors_to_remove)
-        logger.info(
+        logger.debug(
             f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
             f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
         )
@@ -647,9 +772,6 @@ class RayPipeline:
         # Activity check
         is_quiet = global_in_flight <= self.quiet_period_threshold
-        if is_quiet:
-            logger.info(f"Pipeline IS quiet. In-Flight: {global_in_flight} <= Threshold: {self.quiet_period_threshold}")
         return is_quiet
     def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
@@ -670,7 +792,6 @@ class RayPipeline:
                 return False
             # --- Trigger immediate stats collection via the collector instance ---
-            drain_stats = {}
             drain_success = False
             collection_error = None
@@ -689,19 +810,18 @@ class RayPipeline:
                     if not collection_error
                     else f"Collection Error: {type(collection_error).__name__}"
                 )
-                logger.info(
-                    f"[DrainWait] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
+                logger.debug(
+                    f"[Drain] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
                 )
                 last_in_flight = global_in_flight
             # --- Check for successful drain ---
             # Requires BOTH in-flight=0 AND the collection reporting it was successful
             if global_in_flight == 0 and drain_success and not collection_error:
-                logger.info(f"Pipeline confirmed drained (In-Flight=0) in {elapsed_time:.1f}s.")
                 return True
             elif global_in_flight == 0:  # Saw zero, but collection wasn't fully successful
                 logger.warning(
-                    "[DrainWait] In-Flight reached 0, but stats collection had errors/timeouts."
+                    "[Drain] In-Flight reached 0, but stats collection had errors/timeouts."
                     " Cannot confirm drain yet."
                 )
@@ -719,7 +839,6 @@ class RayPipeline:
         # Set flushing state in topology
         self.topology.set_flushing(True)
-        logger.info("--- Starting Queue Flush ---")
         overall_success = False
         source_actors_paused = []
         pause_refs = []
@@ -734,7 +853,7 @@ class RayPipeline:
             current_connections = self.topology.get_connections()
             # --- 1. Pause Source Stages (using snapshots) ---
-            logger.info("Pausing source stages...")
+            logger.debug("Pausing source stages...")
             pause_timeout = 60.0
             for stage in current_stages:
                 if stage.is_source:
@@ -747,22 +866,22 @@ class RayPipeline:
                             except Exception as e:
                                 logger.error(f"Failed sending pause to {actor}: {e}")
             if pause_refs:
-                logger.info(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
+                logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
                 try:
                     ray.get(pause_refs, timeout=pause_timeout)
-                    logger.info(f"{len(pause_refs)} sources acknowledged pause.")
+                    logger.debug(f"{len(pause_refs)} sources acknowledged pause.")
                 except GetTimeoutError:
                     logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
                 except Exception as e:
                     logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")
             # --- 2. Wait for Drain ---
-            logger.info("Waiting for pipeline to drain...")
+            logger.debug("Waiting for pipeline to drain...")
             if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
                 raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")
             # --- 3. Create New Queues (using snapshot) ---
-            logger.info("Creating new replacement queues...")
+            logger.debug("Creating new replacement queues...")
             new_edge_queues_map = {}
             for queue_name, (_, queue_size) in current_edge_queues.items():
                 try:
@@ -775,7 +894,7 @@ class RayPipeline:
                     raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e
             # --- 4. Re-wire Actors to New Queues (using snapshots) ---
-            logger.info("Re-wiring actors to new queues...")
+            logger.debug("Re-wiring actors to new queues...")
             wiring_refs = []
             wiring_timeout = 120.0
             for from_stage_name, conns in current_connections.items():
@@ -811,7 +930,7 @@ class RayPipeline:
                     raise RuntimeError("Actor re-wiring failed.") from e
             # --- 5. Update Topology State (Commit Point) ---
-            logger.info("Committing new queues to pipeline topology.")
+            logger.debug("Committing new queues to pipeline topology.")
             self.topology.set_edge_queues(new_edge_queues_map)  # Commit the change
             overall_success = True
@@ -822,7 +941,7 @@ class RayPipeline:
         finally:
             # --- 6. Resume Source Stages (Always attempt) ---
             if source_actors_paused:
-                logger.info(f"Attempting to resume {len(source_actors_paused)} source actors...")
+                logger.debug(f"Attempting to resume {len(source_actors_paused)} source actors...")
                 resume_timeout = 30.0
                 resume_refs = []
                 for actor in source_actors_paused:
@@ -831,10 +950,10 @@ class RayPipeline:
                     except Exception as e:
                         logger.error(f"Failed sending resume to {actor}: {e}")
                 if resume_refs:
-                    logger.info(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
+                    logger.debug(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
                     try:
                         ray.get(resume_refs, timeout=resume_timeout)
-                        logger.info(f"{len(resume_refs)} sources resumed.")
+                        logger.debug(f"{len(resume_refs)} sources resumed.")
                     except GetTimeoutError:
                         logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
                     except Exception as e:
@@ -843,9 +962,6 @@ class RayPipeline:
             # Update flush timestamp only on success
             if overall_success:
                 self._last_queue_flush_time = time.time()
-                logger.info("--- Queue Flush Completed Successfully ---")
-            else:
-                logger.error("--- Queue Flush Failed ---")
             # Reset flushing state in topology
             self.topology.set_flushing(False)
@@ -977,7 +1093,7 @@ class RayPipeline:
             if target_replica_count != current_count:
                 stages_needing_action.append((stage_name, target_replica_count))
-                logger.info(
+                logger.debug(
                     f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
                     f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
                 )
@@ -1019,7 +1135,7 @@ class RayPipeline:
         completed = sum(1 for r in action_results.values() if r["status"] == "completed")
         errors = sum(1 for r in action_results.values() if r["status"] == "error")
         timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
-        logger.info(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
+        logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
     def _perform_scaling_and_maintenance(self) -> None:
         """Orchestrates scaling/maintenance using topology and stats collector."""
@@ -1050,9 +1166,9 @@ class RayPipeline:
             logger.debug("--- Performing Scaling & Maintenance Cycle ---")
             if self._is_pipeline_quiet():
-                logger.info("Pipeline quiet, initiating queue flush.")
+                logger.info("[Drain] Pipeline quiet, initiating queue flush.")
                 flush_success = self._execute_queue_flush()
-                logger.info(f"Automatic queue flush completed. Success: {flush_success}")
+                logger.info(f"[Drain] Automatic queue flush completed. Success: {flush_success}")
                 return
             # Fast return check if stopping occurred while flushing or checking flush status
@@ -1201,5 +1317,6 @@ class RayPipeline:
             # Clear runtime state in topology
             self.topology.clear_runtime_state()
+            del self.topology
             logger.info("Pipeline stopped.")

nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py ADDED Viewed

@@ -0,0 +1,82 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+import ray
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest_api.internal.extract.html.html_extractor import extract_markdown_from_html_internal
+from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable
+from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
+from nv_ingest_api.util.exception_handlers.decorators import (
+    nv_ingest_node_failure_try_except,
+)
+logger = logging.getLogger(__name__)
+@ray.remote
+class HtmlExtractorStage(RayActorStage):
+    """
+    A Ray actor stage that extracts text in markdown format from html content.
+    It expects an IngestControlMessage containing a DataFrame with html content. It then:
+      1. Removes the "extract" task from the message to obtain the task configuration.
+      2. Calls the html extraction logic (via extract_markdown_from_html_internal) using the stage configuration.
+      3. Updates the message payload with the extracted text DataFrame.
+      4. Stores the returned extraction info in the message metadata under "html_extraction_info".
+    """
+    def __init__(self, config: HtmlExtractorSchema) -> None:
+        super().__init__(config, log_to_stdout=False)
+        # NOTE(review): no validation happens in this block (the config is only stored), so the
+        # except branch looks unreachable in practice; presumably validation is done by the
+        # caller or by the schema type itself. TODO confirm.
+        try:
+            self.validated_config = config
+            self._logger.info("HtmlExtractorStage configuration validated successfully.")
+        except Exception as e:
+            self._logger.exception(f"Error validating Html Extractor config: {e}")
+            raise
+    # NOTE(review): presumably filter_by_task only lets through messages that carry an "extract"
+    # task with document_type "html"; verify against the decorator's implementation.
+    @traceable("html_extractor")
+    @filter_by_task(required_tasks=[("extract", {"document_type": "html"})])
+    @nv_ingest_node_failure_try_except(annotation_id="html_extractor", raise_on_failure=False)
+    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+        """
+        Process the control message by extracting content from html.
+        Parameters
+        ----------
+        control_message : IngestControlMessage
+            The message containing a DataFrame payload with html content.
+        Returns
+        -------
+        IngestControlMessage
+            The same message object, with its payload replaced by the extraction result and
+            the "html_extraction_info" metadata key set.
+        """
+        self._logger.debug("HtmlExtractorStage.on_data: Starting html extraction process.")
+        # Extract the DataFrame payload.
+        df_ledger = control_message.payload()
+        self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
+        # Remove the "extract" task from the message to obtain task-specific configuration.
+        task_config = remove_task_by_type(control_message, "extract")
+        self._logger.debug("Extracted task config: %s", task_config)
+        # Perform html content extraction.
+        new_df, extraction_info = extract_markdown_from_html_internal(
+            df_extraction_ledger=df_ledger,
+            task_config=task_config,
+            extraction_config=self.validated_config,
+            execution_trace_log=None,
+        )
+        # Update the message payload with the extracted text DataFrame.
+        control_message.payload(new_df)
+        control_message.set_metadata("html_extraction_info", extraction_info)
+        return control_message

nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py CHANGED Viewed

@@ -495,7 +495,7 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
         server.serve_forever()
     p = multiprocessing.Process(target=broker_server)
-    p.daemon = True
+    p.daemon = False
     p.start()
     logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")

nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py CHANGED Viewed

@@ -490,7 +490,7 @@ class ResourceConstraintManager:
         final_proposals_this_step = {}
         if not room_to_scale_up_to_global_caps:
-            logger.info(
+            logger.debug(
                 "[ConstraintMgr-Proportional] Global scaling beyond effective minimums is RESTRICTED "
                 "as SumOfEffectiveMins likely meets/exceeds a global Core/MaxReplica cap. "
                 "Proposed increases from initial current values will be nullified."
@@ -502,7 +502,7 @@ class ResourceConstraintManager:
                 if val_from_prior_phases > original_current_replicas:
                     final_proposals_this_step[name] = original_current_replicas
                     if val_from_prior_phases != original_current_replicas:
-                        logger.info(
+                        logger.debug(
                             f"[ConstraintMgr-{name}] Proportional: Scaling restricted. "
                             f"Nullified proposed increase from {original_current_replicas} to {val_from_prior_phases}. "
                             f"Setting to {original_current_replicas}."
@@ -618,7 +618,7 @@ class ResourceConstraintManager:
         # Apply reduction to the deltas
         if reduction_factor <= 0.001:  # Epsilon for float
-            logger.info(
+            logger.debug(
                 f"[ConstraintMgr-Proportional] Scale-up beyond effective minimums fully constrained by global limits. "
                 f"Reasons: {'; '.join(limiting_reasons) if limiting_reasons else 'None'}. "
                 f"Final ReductionFactor={reduction_factor:.3f}."
@@ -637,7 +637,7 @@ class ResourceConstraintManager:
                 )
         elif reduction_factor < 1.0:
-            logger.info(
+            logger.debug(
                 f"[ConstraintMgr-Proportional] Reducing requested scale-up (beyond effective_mins) by "
                 f"factor {reduction_factor:.3f}. "
                 f"Limiting Factors: {'; '.join(limiting_reasons)}."
@@ -654,7 +654,7 @@ class ResourceConstraintManager:
                         f"-> FinalVal={final_value_for_stage}"
                     )
         else:  # reduction_factor is ~1.0, meaning full requested increase (above effective_mins) is allowed
-            logger.info(
+            logger.debug(
                 "[ConstraintMgr-Proportional] Full requested scale-up (beyond effective_mins) "
                 "is permissible by global limits."
             )
@@ -713,7 +713,7 @@ class ResourceConstraintManager:
                     target = max(1, min_r)
                     final_target = min(target, max_r)
                     if final_target > 0:
-                        logger.info(
+                        logger.debug(
                             f"[ConstraintMgr-{name}] Forcing minimum {final_target} replica due to global wake-up."
                         )
                         final_adjustments[name] = final_target
@@ -740,19 +740,19 @@ class ResourceConstraintManager:
         num_queue_actors = num_edges
         total_ray_components_for_info = final_stage_replicas_total + num_queue_actors
-        logger.info("[ConstraintMgr] --- Final Decision & Constraint Summary ---")
+        logger.debug("[ConstraintMgr] --- Final Decision & Constraint Summary ---")
         # --- I. Overall Pipeline State ---
-        logger.info(f"[ConstraintMgr]   Pipeline Activity: {global_in_flight} tasks in-flight.")
-        logger.info(f"[ConstraintMgr]   Effective Min Replicas (Sum): {sum_of_effective_mins}")
-        logger.info(
+        logger.debug(f"[ConstraintMgr]   Pipeline Activity: {global_in_flight} tasks in-flight.")
+        logger.debug(f"[ConstraintMgr]   Effective Min Replicas (Sum): {sum_of_effective_mins}")
+        logger.debug(
             f"[ConstraintMgr]     └─ Global Scaling Beyond Mins Permitted? {can_globally_scale_beyond_effective_mins}"
         )
         # --- II. Final Component Counts ---
-        logger.info(f"[ConstraintMgr]   Final Stage Replicas: {final_stage_replicas_total} (Target for caps)")
-        logger.info(f"[ConstraintMgr]   Queue/Edge Actors   : {num_queue_actors} (Informational)")
-        logger.info(f"[ConstraintMgr]   Total Ray Components: {total_ray_components_for_info} (Informational)")
+        logger.debug(f"[ConstraintMgr]   Final Stage Replicas: {final_stage_replicas_total} (Target for caps)")
+        logger.debug(f"[ConstraintMgr]   Queue/Edge Actors   : {num_queue_actors} (Informational)")
+        logger.debug(f"[ConstraintMgr]   Total Ray Components: {total_ray_components_for_info} (Informational)")
         # --- III. Resource Limits & Projected Usage (for Stages) ---
         # Configured Limits
@@ -762,18 +762,18 @@ class ResourceConstraintManager:
         )
         eff_mem_limit_str = f"{self.effective_memory_limit_mb:.1f}MB"
-        logger.info("[ConstraintMgr]   Global Limits (Stages):")
-        logger.info(f"[ConstraintMgr]     ├─ MaxTotalReplicas  : {max_r_cfg_str}")
-        logger.info(
+        logger.debug("[ConstraintMgr]   Global Limits (Stages):")
+        logger.debug(f"[ConstraintMgr]     ├─ MaxTotalReplicas  : {max_r_cfg_str}")
+        logger.debug(
             f"[ConstraintMgr]     ├─ CoreBasedRepLimit : {core_based_limit_str} "
             f"(System EffCores: {self.available_cores if self.available_cores is not None else 'N/A'})"
         )
-        logger.info(f"[ConstraintMgr]     └─ EffectiveMemLimit : {eff_mem_limit_str} ")
+        logger.debug(f"[ConstraintMgr]     └─ EffectiveMemLimit : {eff_mem_limit_str} ")
         # Usage vs Limits
-        logger.info("[ConstraintMgr]   Projected Usage (Stages):")
-        logger.info(f"[ConstraintMgr]     ├─ Replicas          : {final_stage_replicas_total}")
-        logger.info(
+        logger.debug("[ConstraintMgr]   Projected Usage (Stages):")
+        logger.debug(f"[ConstraintMgr]     ├─ Replicas          : {final_stage_replicas_total}")
+        logger.debug(
             f"[ConstraintMgr]     └─ Memory            : {projected_final_memory_mb:.1f}MB "
             f"(Current: {current_global_memory_usage_mb:.1f}MB)"
         )
@@ -815,20 +815,20 @@ class ResourceConstraintManager:
             )
             unexpected_breaches_details.append(f"MemoryLimit: {status_mem}")
-        logger.info("[ConstraintMgr]   Limit Adherence (Stages):")
-        logger.info(f"[ConstraintMgr]     ├─ MaxTotalReplicas  : {status_max_r}")
-        logger.info(f"[ConstraintMgr]     ├─ CoreBasedRepLimit : {status_core_r}")
-        logger.info(f"[ConstraintMgr]     └─ EffectiveMemLimit : {status_mem}")
+        logger.debug("[ConstraintMgr]   Limit Adherence (Stages):")
+        logger.debug(f"[ConstraintMgr]     ├─ MaxTotalReplicas  : {status_max_r}")
+        logger.debug(f"[ConstraintMgr]     ├─ CoreBasedRepLimit : {status_core_r}")
+        logger.debug(f"[ConstraintMgr]     └─ EffectiveMemLimit : {status_mem}")
         if unexpected_breaches_details:
-            logger.warning(f"[ConstraintMgr]   └─ UNEXPECTED BREACHES: {'; '.join(unexpected_breaches_details)}")
+            logger.debug(f"[ConstraintMgr]   └─ UNEXPECTED BREACHES: {'; '.join(unexpected_breaches_details)}")
         else:
-            logger.info("[ConstraintMgr]   └─ All hard caps (beyond tolerated minimums/wake-up) appear respected.")
+            logger.debug("[ConstraintMgr]   └─ All hard caps (beyond tolerated minimums/wake-up) appear respected.")
         # --- V. Final Decisions Per Stage ---
-        logger.info("[ConstraintMgr]   Final Decisions (Per Stage):")
+        logger.debug("[ConstraintMgr]   Final Decisions (Per Stage):")
         if not final_adjustments:
-            logger.info("[ConstraintMgr]     └─ No stages to adjust.")
+            logger.debug("[ConstraintMgr]     └─ No stages to adjust.")
         else:
             # Determine max stage name length for alignment
             max_name_len = 0
@@ -843,12 +843,12 @@ class ResourceConstraintManager:
                 eff_min_str = f"(EffMin: {min_replicas if orig_prop else 'N/A'})"
                 # Basic alignment, can be improved with more sophisticated padding
-                logger.info(
+                logger.debug(
                     f"[ConstraintMgr]     └─ {stage_name:<{max_name_len}} : "
                     f"{count:<3} {pid_proposed_str} {current_str} {eff_min_str}"
                 )
-        logger.info("[ConstraintMgr] --- Constraint Summary END ---")
+        logger.debug("[ConstraintMgr] --- Constraint Summary END ---")
     # --- Public Method ---
@@ -863,7 +863,7 @@ class ResourceConstraintManager:
         Applies all configured constraints to initial replica proposals.
         (Docstring from previous version is fine)
         """
-        logger.info(
+        logger.debug(
             f"[ConstraintMgr] --- Applying Constraints START --- "
             f"GlobalInFlight={global_in_flight}, "
             f"CurrentGlobalMemMB={current_global_memory_usage_mb}, "
@@ -904,7 +904,7 @@ class ResourceConstraintManager:
             current_effective_mins[name] = eff_min
             sum_of_effective_mins += eff_min
-        logger.info(
+        logger.debug(
             f"[ConstraintMgr] Calculated Effective Minimums: TotalSum={sum_of_effective_mins}. "
             # f"IndividualMins: {current_effective_mins}" # Can be verbose
         )
@@ -985,5 +985,5 @@ class ResourceConstraintManager:
             can_globally_scale_up_stages,  # Pass this for context in logging
         )
-        logger.info("[ConstraintMgr] --- Applying Constraints END ---")
+        logger.debug("[ConstraintMgr] --- Applying Constraints END ---")
         return final_adjustments

nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py CHANGED Viewed

@@ -19,6 +19,7 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.stage_builders import (
     add_image_extractor_stage,
     add_docx_extractor_stage,
     add_audio_extractor_stage,
+    add_html_extractor_stage,
     add_image_dedup_stage,
     add_image_filter_stage,
     add_table_extractor_stage,
@@ -53,7 +54,7 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
         export_config_to_env(ingest_config)
     current_level = logging.getLogger().getEffectiveLevel()
-    ray.init(
+    ray_context = ray.init(
         namespace="nv_ingest_ray",
         logging_level=current_level,
         ignore_reinit_error=True,
@@ -103,6 +104,7 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
     docx_extractor_stage_id = add_docx_extractor_stage(pipeline, default_cpu_count)
     pptx_extractor_stage_id = add_pptx_extractor_stage(pipeline, default_cpu_count)
     audio_extractor_stage_id = add_audio_extractor_stage(pipeline, default_cpu_count)
+    html_extractor_stage_id = add_html_extractor_stage(pipeline, default_cpu_count)
     ########################################################################################################
     ########################################################################################################
@@ -159,7 +161,8 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
     pipeline.make_edge(audio_extractor_stage_id, docx_extractor_stage_id, queue_size=ingest_edge_buffer_size)
     pipeline.make_edge(docx_extractor_stage_id, pptx_extractor_stage_id, queue_size=ingest_edge_buffer_size)
     pipeline.make_edge(pptx_extractor_stage_id, image_extractor_stage_id, queue_size=ingest_edge_buffer_size)
-    pipeline.make_edge(image_extractor_stage_id, infographic_extraction_stage_id, queue_size=ingest_edge_buffer_size)
+    pipeline.make_edge(image_extractor_stage_id, html_extractor_stage_id, queue_size=ingest_edge_buffer_size)
+    pipeline.make_edge(html_extractor_stage_id, infographic_extraction_stage_id, queue_size=ingest_edge_buffer_size)
     ###### Primitive Extractors ########
     pipeline.make_edge(infographic_extraction_stage_id, table_extraction_stage_id, queue_size=ingest_edge_buffer_size)
@@ -193,3 +196,5 @@ def setup_ingestion_pipeline(pipeline: RayPipeline, ingest_config: Dict[str, Any
     #    pipe.add_edge(sink_stage, otel_tracer_stage)
     # pipe.add_edge(otel_tracer_stage, completed_job_counter_stage)
+    return ray_context

nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py CHANGED Viewed

@@ -2,16 +2,26 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
+import atexit
 import logging
+import multiprocessing
 import os
+import signal
+import sys
 import time
+from ctypes import CDLL, c_int
 from datetime import datetime
-from typing import Union, Tuple
+from typing import Union, Tuple, Optional, TextIO
 import ray
 from pydantic import BaseModel, ConfigDict
-from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import RayPipeline, ScalingConfig
+from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
+    RayPipeline,
+    ScalingConfig,
+    RayPipelineSubprocessInterface,
+    RayPipelineInterface,
+)
 from nv_ingest.framework.orchestration.ray.util.pipeline.pipeline_builders import setup_ingestion_pipeline
 logger = logging.getLogger(__name__)
@@ -33,6 +43,8 @@ class PipelineCreationSchema(BaseModel):
     including endpoints, API keys, and processing options.
     """
+    arrow_default_memory_pool: str = os.getenv("ARROW_DEFAULT_MEMORY_POOL", "system")
     # Audio processing settings
     audio_grpc_endpoint: str = os.getenv("AUDIO_GRPC_ENDPOINT", "grpc.nvcf.nvidia.com:443")
     audio_function_id: str = os.getenv("AUDIO_FUNCTION_ID", "1598d209-5e27-4d3c-8079-4751568b1081")
@@ -100,6 +112,112 @@ class PipelineCreationSchema(BaseModel):
     model_config = ConfigDict(extra="forbid")
+def redirect_os_fds(stdout: Optional[TextIO] = None, stderr: Optional[TextIO] = None):
+    """
+    Redirect OS-level stdout (fd=1) and stderr (fd=2) to the given file-like objects,
+    or to /dev/null if not provided.
+    Parameters
+    ----------
+    stdout : Optional[TextIO]
+        Stream to receive OS-level stdout. If None, redirected to /dev/null.
+    stderr : Optional[TextIO]
+        Stream to receive OS-level stderr. If None, redirected to /dev/null.
+    """
+    devnull_fd = os.open(os.devnull, os.O_WRONLY)
+    if stdout is not None:
+        os.dup2(stdout.fileno(), 1)
+    else:
+        os.dup2(devnull_fd, 1)
+    if stderr is not None:
+        os.dup2(stderr.fileno(), 2)
+    else:
+        os.dup2(devnull_fd, 2)
+def set_pdeathsig(sig=signal.SIGKILL):
+    libc = CDLL("libc.so.6")
+    PR_SET_PDEATHSIG = 1
+    libc.prctl(PR_SET_PDEATHSIG, c_int(sig))
+def kill_pipeline_process_group(pid: int):
+    """
+    Kill the process group associated with the given PID, if it exists and is alive.
+    Parameters
+    ----------
+    pid : int
+        The PID of the process whose group should be killed.
+    """
+    try:
+        # Get the process group ID
+        pgid = os.getpgid(pid)
+        # Check if the group is still alive by sending signal 0
+        os.killpg(pgid, 0)  # Does not kill, just checks if it's alive
+        # If no exception, the group is alive — kill it
+        os.killpg(pgid, signal.SIGKILL)
+        print(f"Killed subprocess group {pgid}")
+    except ProcessLookupError:
+        print(f"Process group for PID {pid} no longer exists.")
+    except PermissionError:
+        print(f"Permission denied to kill process group for PID {pid}.")
+    except Exception as e:
+        print(f"Failed to kill subprocess group: {e}")
+def _run_pipeline_process(
+    ingest_config: PipelineCreationSchema,
+    disable_dynamic_scaling: Optional[bool],
+    dynamic_memory_threshold: Optional[float],
+    raw_stdout: Optional[TextIO] = None,
+    raw_stderr: Optional[TextIO] = None,
+):
+    """
+    Subprocess entrypoint to launch the pipeline. Redirects all output to the provided
+    file-like streams or /dev/null if not specified.
+    Parameters
+    ----------
+    ingest_config : PipelineCreationSchema
+        Validated pipeline configuration.
+    disable_dynamic_scaling : Optional[bool]
+        Whether to disable dynamic scaling.
+    dynamic_memory_threshold : Optional[float]
+        Threshold for triggering scaling.
+    raw_stdout : Optional[TextIO]
+        Destination for stdout. Defaults to /dev/null.
+    raw_stderr : Optional[TextIO]
+        Destination for stderr. Defaults to /dev/null.
+    """
+    # Set the death signal for the subprocess
+    set_pdeathsig()
+    os.setsid()  # Creates new process group so it can be SIGKILLed as a group
+    # Redirect OS-level file descriptors
+    redirect_os_fds(stdout=raw_stdout, stderr=raw_stderr)
+    # Redirect Python-level sys.stdout/sys.stderr
+    sys.stdout = raw_stdout or open(os.devnull, "w")
+    sys.stderr = raw_stderr or open(os.devnull, "w")
+    try:
+        _launch_pipeline(
+            ingest_config,
+            block=True,
+            disable_dynamic_scaling=disable_dynamic_scaling,
+            dynamic_memory_threshold=dynamic_memory_threshold,
+        )
+    except Exception as e:
+        sys.__stderr__.write(f"Subprocess pipeline run failed: {e}\n")
+        raise
 def _launch_pipeline(
     ingest_config: PipelineCreationSchema,
     block: bool,
@@ -122,7 +240,7 @@ def _launch_pipeline(
     start_abs = datetime.now()
     # Set up the ingestion pipeline
-    setup_ingestion_pipeline(pipeline, ingest_config.model_dump())
+    _ = setup_ingestion_pipeline(pipeline, ingest_config.model_dump())
     # Record setup time
     end_setup = start_run = datetime.now()
@@ -159,12 +277,100 @@ def _launch_pipeline(
 def run_pipeline(
     ingest_config: PipelineCreationSchema,
     block: bool = True,
-    disable_dynamic_scaling: bool = None,
-    dynamic_memory_threshold: float = None,
-) -> Union[RayPipeline, float]:
-    pipeline, total_elapsed = _launch_pipeline(ingest_config, block, disable_dynamic_scaling, dynamic_memory_threshold)
+    disable_dynamic_scaling: Optional[bool] = None,
+    dynamic_memory_threshold: Optional[float] = None,
+    run_in_subprocess: bool = False,
+    stdout: Optional[TextIO] = None,
+    stderr: Optional[TextIO] = None,
+) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
+    """
+    Launch and manage a pipeline, optionally in a subprocess.
+    This function is the primary entry point for executing a Ray pipeline,
+    either within the current process or in a separate Python subprocess.
+    It supports synchronous blocking execution or non-blocking lifecycle management,
+    and allows redirection of output to specified file-like objects.
+    Parameters
+    ----------
+    ingest_config : PipelineCreationSchema
+        The validated configuration object used to construct and launch the pipeline.
+    block : bool, default=True
+        If True, blocks until the pipeline completes.
+        If False, returns an interface to control the pipeline externally.
+    disable_dynamic_scaling : Optional[bool], default=None
+        If True, disables dynamic memory scaling. Overrides global configuration if set.
+        If None, uses the default or globally defined behavior.
+    dynamic_memory_threshold : Optional[float], default=None
+        The memory usage threshold (as a float between 0 and 1) that triggers autoscaling,
+        if dynamic scaling is enabled. Defaults to the globally configured value if None.
+    run_in_subprocess : bool, default=False
+        If True, launches the pipeline in a separate Python subprocess using `multiprocessing.Process`.
+        If False, runs the pipeline in the current process.
+    stdout : Optional[TextIO], default=None
+        Optional file-like stream to which subprocess stdout should be redirected.
+        If None, stdout is redirected to /dev/null.
+    stderr : Optional[TextIO], default=None
+        Optional file-like stream to which subprocess stderr should be redirected.
+        If None, stderr is redirected to /dev/null.
+    Returns
+    -------
+    Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]
+        - If run in-process with `block=True`: returns elapsed time in seconds (float).
+        - If run in-process with `block=False`: returns a `RayPipelineInterface`.
+        - If run in subprocess with `block=False`: returns a `RayPipelineSubprocessInterface`.
+        - If run in subprocess with `block=True`: returns 0.0.
+    Raises
+    ------
+    RuntimeError
+        If the subprocess fails to start or exits with an error.
+    Exception
+        Any other exceptions raised during pipeline launch or configuration.
+    """
+    if run_in_subprocess:
+        logger.info("Launching pipeline in Python subprocess using multiprocessing.")
+        ctx = multiprocessing.get_context("fork")
+        process = ctx.Process(
+            target=_run_pipeline_process,
+            args=(
+                ingest_config,
+                disable_dynamic_scaling,
+                dynamic_memory_threshold,
+                stdout,  # raw_stdout
+                stderr,  # raw_stderr
+            ),
+            daemon=False,
+        )
+        process.start()
+        interface = RayPipelineSubprocessInterface(process)
+        if block:
+            start_time = time.time()
+            logger.info("Waiting for subprocess pipeline to complete...")
+            process.join()
+            logger.info("Pipeline subprocess completed.")
+            return time.time() - start_time
+        else:
+            logger.info(f"Pipeline subprocess started (PID={process.pid})")
+            atexit.register(lambda: kill_pipeline_process_group(process.pid))
+            return interface
+    # Run inline
+    pipeline, total_elapsed = _launch_pipeline(
+        ingest_config,
+        block=block,
+        disable_dynamic_scaling=disable_dynamic_scaling,
+        dynamic_memory_threshold=dynamic_memory_threshold,
+    )
     if block:
         logger.debug(f"Pipeline execution completed successfully in {total_elapsed:.2f} seconds.")
-    return pipeline
+        return total_elapsed
+    else:
+        return RayPipelineInterface(pipeline)

nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py CHANGED Viewed

@@ -23,6 +23,7 @@ from nv_ingest.framework.orchestration.ray.stages.extractors.infographic_extract
 from nv_ingest.framework.orchestration.ray.stages.extractors.pdf_extractor import PDFExtractorStage
 from nv_ingest.framework.orchestration.ray.stages.extractors.pptx_extractor import PPTXExtractorStage
 from nv_ingest.framework.orchestration.ray.stages.extractors.table_extractor import TableExtractorStage
+from nv_ingest.framework.orchestration.ray.stages.extractors.html_extractor import HtmlExtractorStage
 from nv_ingest.framework.orchestration.ray.stages.injectors.metadata_injector import MetadataInjectionStage
 from nv_ingest.framework.orchestration.ray.stages.mutate.image_dedup import ImageDedupStage
@@ -49,6 +50,7 @@ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageCon
 from nv_ingest_api.internal.schemas.extract.extract_pdf_schema import PDFExtractorSchema
 from nv_ingest_api.internal.schemas.extract.extract_pptx_schema import PPTXExtractorSchema
 from nv_ingest_api.internal.schemas.extract.extract_table_schema import TableExtractorSchema
+from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
 from nv_ingest_api.internal.schemas.mutate.mutate_image_dedup_schema import ImageDedupSchema
 from nv_ingest_api.internal.schemas.store.store_embedding_schema import EmbeddingStorageSchema
 from nv_ingest_api.internal.schemas.store.store_image_schema import ImageStorageModuleSchema
@@ -383,6 +385,19 @@ def add_audio_extractor_stage(pipeline, default_cpu_count, stage_name="audio_ext
     return stage_name
+def add_html_extractor_stage(pipeline, default_cpu_count, stage_name="html_extractor"):
+    pipeline.add_stage(
+        name=stage_name,
+        stage_actor=HtmlExtractorStage,
+        config=HtmlExtractorSchema(),
+        min_replicas=0,
+        max_replicas=int(max(1, (default_cpu_count // 14))),  # 7% of available CPU cores
+    )
+    return stage_name
 def add_otel_tracer_stage(pipeline, default_cpu_count, stage_name="otel_tracer"):
     _ = default_cpu_count  # Placeholder for future use
     otel_endpoint = os.getenv("OTEL_EXPORTER_OTLP_ENDPOINT", "http://localhost:4317")

{nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.5.22.dev20250522
+Version: 2025.5.29.dev20250529
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License:                                  Apache License
@@ -225,7 +225,7 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: isodate>=0.7.2
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: minio>=7.2.12
-Requires-Dist: openai>=1.57.1
+Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
@@ -239,6 +239,7 @@ Requires-Dist: build>=1.2.2
 Requires-Dist: python-docx>=1.1.2
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: prometheus-client
 Requires-Dist: torch==2.4.1
 Requires-Dist: ray[all]>=2.37.0
 Requires-Dist: redis>=5.2.1
@@ -255,9 +256,10 @@ Requires-Dist: uvicorn
 Requires-Dist: pip
 Requires-Dist: llama-index-embeddings-nvidia
 Requires-Dist: opencv-python
-Requires-Dist: pymilvus>=2.5.0
+Requires-Dist: pymilvus>=2.5.10
 Requires-Dist: pymilvus[bulk_writer,model]
 Requires-Dist: tritonclient
 Requires-Dist: nvidia-riva-client>=2.18.0
 Requires-Dist: unstructured-client
+Requires-Dist: markitdown
 Dynamic: license-file

{nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/RECORD RENAMED Viewed

@@ -1,10 +1,11 @@
 nv_ingest/__init__.py,sha256=vJLPeuxiIHqbxXPJSu9qe3MS-GPavbOUExyRq83DxxM,895
 nv_ingest/version.py,sha256=Y9gMjlV_tnRSE3JbmS1rWIfVppM974_g0k30MRF3IQM,1352
 nv_ingest/api/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/api/main.py,sha256=HkvOCo3XlcgMVnHWeRTsMCt4JA7MmKvZRt1LecVU2Y4,1501
+nv_ingest/api/main.py,sha256=XE-p4lJp1E7CCDOB8ENtYFrf63Dtq2bzQiGxpRfL2LA,1603
 nv_ingest/api/v1/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/api/v1/health.py,sha256=zqu-isMRjh4NveS4XWh5FaAZGPIlBVxpCOg3Uu8nUHQ,4746
 nv_ingest/api/v1/ingest.py,sha256=LWk3LN4lBd3uO8h30EN42g3LHCVcO00avVd5ohVK7NI,19392
+nv_ingest/api/v1/metrics.py,sha256=ZGVRApYLnzc2f2C7wRgGd7deqiXan-jxfA-33a16clY,981
 nv_ingest/framework/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
@@ -19,14 +20,15 @@ nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py,sha25
 nv_ingest/framework/orchestration/ray/primitives/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/primitives/dataclasses.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py,sha256=L8ENPiF-lxqhIXVEQwQD5CCqQMb710ynj5D_Y4ixGhs,11077
-nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=MKwerM3saKAdXZDHXFb4nGSnnwr7rUcOZlDo5JxV45o,28441
-nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=CWRxtSCTLe4S42Asv2NCA1hDEoKeblQdCEOmSKOGS0U,56500
+nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py,sha256=gc9gZNqPmnP76M-u8sQXyJd5aTSlyY_0CjLYNa-zvzk,29106
+nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py,sha256=BEBLjkYFXIH396EUQcfuxhrWlIMs9i6z7YfeeqJ5cZg,59579
 nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py,sha256=yPIvOhxY42P-gf5dLkcPkfvfwL_I-ay0C8k5eNaU-VA,15811
 nv_ingest/framework/orchestration/ray/stages/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py,sha256=KV4hvY0NTGG8CjZviTgcFLQzaH8WJJGkkb9PFYbROww,3417
 nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py,sha256=tydluNNXfZYSo-0eqqafB59icF3SaeLXWcMrZ6OzlyQ,3998
 nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py,sha256=tSa3Z4vK6sYJ6RBNMa7_FiuOwUaDUl0rTJ6agGbI5y0,3426
+nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py,sha256=fyr0oXokhuaGQrNu5rKyH_qNMD12AS1xPDxKgA26YHE,3426
 nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py,sha256=c-qlLGSizLOgKqH7wl_c8dGOVKYxLtXhZEHLXil4Jc4,3734
 nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py,sha256=dmgvzGMxVX81g7TpZO1ACnRh7sdtpc7YX5KK2QW26U4,2565
 nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py,sha256=BUVuYOCGyPdPpacVhL5rnvA56hydnBip7tPaWTXaT1c,4650
@@ -46,7 +48,7 @@ nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py,sha256=wQSlVx3T14
 nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py,sha256=0SQHJlFuXlP16YRWduX1fMKgjhUd7UhDAWQ8XZh4_0I,1471
 nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py,sha256=enylryvcPmzirpOjCahqYJbNSLsNvv1KpMnOzGqNZQQ,11509
 nv_ingest/framework/orchestration/ray/stages/sources/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=1OEms9l0GWoMuLP74mgEg-J8l_7ctJ3TY6ALT6Wphuo,20285
+nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py,sha256=srDsgp8ExMHZNI76ch3iX7S0drMXmQ3NkWC_udnwqmo,20286
 nv_ingest/framework/orchestration/ray/stages/storage/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py,sha256=6NkwQzseAnaj0Ptpr3oKvab2EnJdMwTjI2p4dS_HzsI,3901
 nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py,sha256=SMLHQElZkKldnjy0_VHIKS65DBAAtOhwhdoaFe1yb9I,3337
@@ -62,10 +64,10 @@ nv_ingest/framework/orchestration/ray/stages/utility/__init__.py,sha256=wQSlVx3T
 nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py,sha256=MB27CkoNeuirN6CUHgjsC5Wh958NF7m_N7HE4VKfx3k,2264
 nv_ingest/framework/orchestration/ray/util/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
-nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py,sha256=m13vysZhWo2MQYyjYODFesDkMDvh7w-vLcn28BPj6Ow,50651
-nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py,sha256=j-gMzfiDPMQPqn6my1xibNQOyxH5zBnEAHat6GtBb88,10195
-nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=5-UO_RTSvkbzZhO-rnefNm0eh1xqL4DIt2Z3RfMhia4,6656
-nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=hVHwzoIyr9nyBtu2LQeJ0mpWu9QwVTAvbHdp2Tc2n6k,20923
+nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py,sha256=AWyCFPP41vp1NOkO2urqm7vh-sTGKypJxwhdq8HxK6Q,50681
+nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py,sha256=jMYnVe_0rb1OIO9mlB4LH3uXtgaXBbUG-rDPx6fe6J8,10456
+nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py,sha256=3aSYSxyunm-eKUYErDArQTHXSoNKlNJMUr9o5Ui6VTk,14037
+nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py,sha256=_MPUbOVTo9CjkBdDA--mcpu2plQ9qFY_TCBXbfpbB_A,21477
 nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/orchestration/ray/util/system_tools/memory.py,sha256=ICqY0LLB3hFTZk03iX5yffMSKFH2q_aQomtDVzS_mKw,2228
 nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py,sha256=2oHZdO_3L1LGuzpyNmZBDh19n0E-APAaHk4MEwBwSHs,12895
@@ -93,8 +95,8 @@ nv_ingest/framework/util/service/meta/ingest/__init__.py,sha256=wQSlVx3T14ZgQAt-
 nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py,sha256=QS3uNxWBl5dIcmIpJKNe8_TLcTUuN2vcKyHeAwa-eSo,1589
 nv_ingest/framework/util/telemetry/__init__.py,sha256=wQSlVx3T14ZgQAt-EPzEczQusXVW0W8yynnUaFFGE3s,143
 nv_ingest/framework/util/telemetry/global_stats.py,sha256=nq65pEEdiwjAfGiqsxG1CeQMC96O3CfQxsZuGFCY-ds,4554
-nv_ingest-2025.5.22.dev20250522.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
-nv_ingest-2025.5.22.dev20250522.dist-info/METADATA,sha256=LUbvIScRcL85fCyLAHdBYw9M3MmBHRLOJAVf0ri3ZMc,15082
-nv_ingest-2025.5.22.dev20250522.dist-info/WHEEL,sha256=zaaOINJESkSfm_4HQVc5ssNzHCPXhJm0kEUakpsEHaU,91
-nv_ingest-2025.5.22.dev20250522.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
-nv_ingest-2025.5.22.dev20250522.dist-info/RECORD,,
+nv_ingest-2025.5.29.dev20250529.dist-info/licenses/LICENSE,sha256=xx0jnfkXJvxRnG63LTGOxlggYnIysveWIZ6H3PNdCrQ,11357
+nv_ingest-2025.5.29.dev20250529.dist-info/METADATA,sha256=zMIjMLHJLCUg8DdH5oZUIyIK4BVkIZx6U7iQfE9TdxM,15142
+nv_ingest-2025.5.29.dev20250529.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+nv_ingest-2025.5.29.dev20250529.dist-info/top_level.txt,sha256=sjb0ajIsgn3YgftSjZHlYO0HjYAIIhNuXG_AmywCvaU,10
+nv_ingest-2025.5.29.dev20250529.dist-info/RECORD,,

{nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/WHEEL RENAMED Viewed

@@ -1,5 +1,5 @@
 Wheel-Version: 1.0
-Generator: setuptools (80.8.0)
+Generator: setuptools (80.9.0)
 Root-Is-Purelib: true
 Tag: py3-none-any

{nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/licenses/LICENSE RENAMED Viewed

File without changes

{nv_ingest-2025.5.22.dev20250522.dist-info → nv_ingest-2025.5.29.dev20250529.dist-info}/top_level.txt RENAMED Viewed

File without changes

nv-ingest 2025.5.22.dev20250522__py3-none-any.whl → 2025.5.29.dev20250529__py3-none-any.whl

Potentially problematic release.

nv-ingest 2025.5.22.dev20250522__py3-none-any.whl → 2025.5.29.dev20250529__py3-none-any.whl