nv-ingest 2025.5.21.dev20250521.tar.gz → 2025.5.29.dev20250529.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/PKG-INFO +6 -4
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/main.py +3 -1
- nv_ingest-2025.5.29.dev20250529/nv_ingest/api/v1/metrics.py +29 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +20 -3
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +233 -98
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +1 -1
- nv_ingest-2025.5.29.dev20250529/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +1 -1
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +33 -33
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +7 -2
- nv_ingest-2025.5.29.dev20250529/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +376 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +15 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/PKG-INFO +6 -4
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/SOURCES.txt +2 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/requires.txt +5 -3
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/pyproject.toml +5 -3
- nv_ingest-2025.5.21.dev20250521/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -170
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/LICENSE +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/MANIFEST.in +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/health.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/ingest.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/version.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/dependency_links.txt +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/top_level.txt +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/setup.cfg +0 -0
{nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.5.21.dev20250521
+Version: 2025.5.29.dev20250529
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -225,13 +225,13 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: isodate>=0.7.2
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: minio>=7.2.12
-Requires-Dist: openai>=1.
+Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
 Requires-Dist: pydantic>2.0.0
 Requires-Dist: pydantic-settings>2.0.0
-Requires-Dist: pypdfium2
+Requires-Dist: pypdfium2==4.30.1
 Requires-Dist: pytest>=8.0.2
 Requires-Dist: pytest-mock>=3.14.0
 Requires-Dist: pytest-cov>=6.0.0
@@ -239,6 +239,7 @@ Requires-Dist: build>=1.2.2
 Requires-Dist: python-docx>=1.1.2
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: prometheus-client
 Requires-Dist: torch==2.4.1
 Requires-Dist: ray[all]>=2.37.0
 Requires-Dist: redis>=5.2.1
@@ -255,9 +256,10 @@ Requires-Dist: uvicorn
 Requires-Dist: pip
 Requires-Dist: llama-index-embeddings-nvidia
 Requires-Dist: opencv-python
-Requires-Dist: pymilvus>=2.5.
+Requires-Dist: pymilvus>=2.5.10
 Requires-Dist: pymilvus[bulk_writer,model]
 Requires-Dist: tritonclient
 Requires-Dist: nvidia-riva-client>=2.18.0
 Requires-Dist: unstructured-client
+Requires-Dist: markitdown
 Dynamic: license-file
{nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/main.py

@@ -14,6 +14,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor

 from .v1.health import router as HealthApiRouter
 from .v1.ingest import router as IngestApiRouter
+from .v1.metrics import router as MetricsApiRouter

 logger = logging.getLogger(__name__)

@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
 app = FastAPI(
     title="NV-Ingest Microservice",
     description="Service for ingesting heterogenous datatypes",
-    version="25.
+    version="25.4.2",
     contact={
         "name": "NVIDIA Corporation",
         "url": "https://nvidia.com",
@@ -31,6 +32,7 @@ app = FastAPI(

 app.include_router(IngestApiRouter, prefix="/v1")
 app.include_router(HealthApiRouter, prefix="/v1/health")
+app.include_router(MetricsApiRouter, prefix="/v1")

 # Set up the tracer provider and add a processor for exporting traces
 resource = Resource(attributes={"service.name": "nv-ingest"})
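With MetricsApiRouter mounted under the "/v1" prefix, the new Prometheus endpoint is served at /v1/metrics. A minimal sketch of scraping it with httpx (already a declared dependency); the host and port below are assumptions for illustration, not values taken from this diff:

import httpx

resp = httpx.get("http://localhost:7670/v1/metrics", timeout=5.0)  # host/port assumed
print(resp.headers.get("content-type"))            # Prometheus text exposition format
print("\n".join(resp.text.splitlines()[:5]))       # first few metric lines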
nv_ingest-2025.5.29.dev20250529/nv_ingest/api/v1/metrics.py (new file)

@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+
+from fastapi import APIRouter, Response, status
+from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
+
+router = APIRouter()
+
+# logger = logging.getLogger("uvicorn")
+logger = logging.getLogger(__name__)
+
+# Prometheus metrics
+REQUEST_COUNT = Counter("http_requests_total", "Total HTTP Requests", ["method", "endpoint"])
+REQUEST_LATENCY = Histogram("http_request_duration_seconds", "Request latency", ["method", "endpoint"])
+
+
+@router.get(
+    "/metrics",
+    tags=["Health"],
+    summary="Provide prometheus formatted metrics for consumption",
+    description="""
+        Provide prometheus formatted metrics for consumption by a prometheus scraping server.
+    """,
+    status_code=status.HTTP_200_OK,
+)
+def metrics():
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
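The new module defines REQUEST_COUNT and REQUEST_LATENCY collectors, but the diff shown here does not wire them into request handling. A hypothetical sketch, not part of this release, of one way they could be fed from a FastAPI HTTP middleware:

import time

from fastapi import FastAPI, Request

from nv_ingest.api.v1.metrics import REQUEST_COUNT, REQUEST_LATENCY, router as metrics_router

app = FastAPI()
app.include_router(metrics_router, prefix="/v1")


@app.middleware("http")
async def record_request_metrics(request: Request, call_next):
    # Time the downstream handler and record count/latency per method and path.
    start = time.perf_counter()
    response = await call_next(request)
    labels = {"method": request.method, "endpoint": request.url.path}
    REQUEST_COUNT.labels(**labels).inc()
    REQUEST_LATENCY.labels(**labels).observe(time.perf_counter() - start)
    return response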
{nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py

@@ -74,9 +74,26 @@ class PipelineTopology:
         self._start_cleanup_thread()  # Start background cleanup on init

     def __del__(self):
-        """Ensure cleanup thread is stopped
-        logger.debug("PipelineTopology destructor called
-
+        """Ensure cleanup thread is stopped and internal actor references are released."""
+        logger.debug("PipelineTopology destructor called. Cleaning up thread and actor references.")
+
+        # Stop the background cleanup thread
+        try:
+            self._stop_cleanup_thread()
+        except Exception as e:
+            logger.warning(f"Error stopping cleanup thread during __del__: {e}")
+
+        # Clear references to actor handles and shutdown futures
+        try:
+            self._stage_actors.clear()
+            self._edge_queues.clear()
+            self._scaling_state.clear()
+            self._stage_memory_overhead.clear()
+            self._pending_removal_actors.clear()
+            self._stages.clear()
+            self._connections.clear()
+        except Exception as e:
+            logger.warning(f"Error clearing internal state during __del__: {e}")

     # --- Lock Context Manager ---
     @contextlib.contextmanager
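The expanded __del__ explicitly drops the containers that hold Ray actor handles. A small illustration of the behavior this relies on, assuming default (non-detached) actors: Ray actors are reference-counted, so releasing the last handle makes the actor eligible for teardown.

import ray


@ray.remote
class Worker:
    def ping(self):
        return "pong"


ray.init()
handles = [Worker.remote() for _ in range(2)]
print(ray.get([h.ping.remote() for h in handles]))  # ['pong', 'pong']
handles.clear()  # analogous to self._stage_actors.clear(); the actors can now be reclaimed
ray.shutdown()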
{nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py

@@ -2,7 +2,11 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

+import multiprocessing
+import os
+import signal
 import threading
+from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass

@@ -24,6 +28,35 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import P
 logger = logging.getLogger(__name__)


+class PipelineInterface(ABC):
+    """
+    Abstract base class for pipeline implementations.
+
+    Any concrete pipeline must implement start and stop methods.
+    """
+
+    @abstractmethod
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Start the pipeline.
+
+        Parameters
+        ----------
+        monitor_poll_interval : float
+            Interval in seconds for monitoring poll (default: 5.0).
+        scaling_poll_interval : float
+            Interval in seconds for scaling decisions (default: 30.0).
+        """
+        pass
+
+    @abstractmethod
+    def stop(self) -> None:
+        """
+        Stop the pipeline and perform any necessary cleanup.
+        """
+        pass
+
+
 # --- Configuration Objects ---


@@ -62,7 +95,90 @@ class StatsConfig:
     queue_timeout_seconds: float = 2.0


-class
+class RayPipelineSubprocessInterface(PipelineInterface):
+    """
+    Pipeline interface implementation for a subprocess-based Ray pipeline.
+    """
+
+    def __init__(self, process: multiprocessing.Process):
+        """
+        Parameters
+        ----------
+        process : multiprocessing.Process
+            A handle to the running subprocess.
+        """
+        self._process: multiprocessing.Process = process
+
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Start is not supported because the subprocess is assumed to already be running.
+        """
+        pass
+
+    def stop(self) -> None:
+        """
+        Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
+        """
+        if not self._process.is_alive():
+            return
+
+        try:
+            self._process.terminate()
+            self._process.join(timeout=5.0)
+        except Exception as e:
+            logger.warning(f"Failed to terminate process cleanly: {e}")
+
+        if self._process.is_alive():
+            try:
+                pgid = os.getpgid(self._process.pid)
+                os.killpg(pgid, signal.SIGKILL)
+            except Exception as e:
+                logger.error(f"Failed to force-kill process group: {e}")
+            self._process.join(timeout=3.0)
+
+
+class RayPipelineInterface(PipelineInterface):
+    """
+    Pipeline interface for an in-process RayPipeline instance.
+    """
+
+    def __init__(self, pipeline: "RayPipeline"):
+        """
+        Parameters
+        ----------
+        pipeline : RayPipeline
+            The instantiated pipeline to control.
+        """
+        self._pipeline = pipeline
+
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Starts the RayPipeline.
+
+        Parameters
+        ----------
+        monitor_poll_interval : float
+            Unused here; provided for interface compatibility.
+        scaling_poll_interval : float
+            Unused here; provided for interface compatibility.
+        """
+        self._pipeline.start(monitor_poll_interval, scaling_poll_interval)
+
+    def stop(self) -> None:
+        """
+        Stops the RayPipeline and shuts down Ray.
+        """
+        self._pipeline.stop()
+
+        try:
+            import ray
+
+            ray.shutdown()
+        except Exception:
+            pass
+
+
+class RayPipeline(PipelineInterface):
     """
     A structured pipeline supporting dynamic scaling and queue flushing.
     Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
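RayPipelineSubprocessInterface.stop() escalates from Process.terminate() to os.killpg with SIGKILL, which only isolates the pipeline cleanly if the child process leads its own process group. A hypothetical usage sketch under that assumption; run_pipeline_process and the os.setsid() call are illustrative and not taken from this diff (the release also adds a rewritten pipeline_runners.py, not shown here, that owns the real launch path):

import multiprocessing
import os

from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
    RayPipelineSubprocessInterface,
)


def run_pipeline_process() -> None:
    os.setsid()  # become a process-group/session leader so killpg() targets only the pipeline
    # ... build and start the Ray pipeline inside this process ...


if __name__ == "__main__":
    proc = multiprocessing.Process(target=run_pipeline_process, daemon=False)
    proc.start()

    pipeline = RayPipelineSubprocessInterface(proc)
    # ... interact with the running service ...
    pipeline.stop()  # terminate(), then SIGKILL to the whole process group if it is still alive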
@@ -91,6 +207,8 @@ class RayPipeline:
         # --- State ---
         # self.scaling_state: Dict[str, str] = {}
         self.prev_global_memory_usage: Optional[int] = None
+        self._state_lock: threading.Lock = threading.Lock()
+        self._stopping = False

         # --- Build Time Config & State ---
         # Use scaling_config for these
@@ -149,10 +267,17 @@ class RayPipeline:
             actor_timeout=self.stats_config.actor_timeout_seconds,
             queue_timeout=self.stats_config.queue_timeout_seconds,
         )
+
         logger.info("RayStatsCollector initialized using StatsConfig.")

     # --- Accessor Methods for Stats Collector (and internal use) ---

+    def __del__(self):
+        try:
+            self.stop()
+        except Exception as e:
+            logger.error(f"Exception during RayPipeline cleanup: {e}")
+
     def get_stages_info(self) -> List[StageInfo]:
         """Returns a snapshot of the current stage information."""
         return self.topology.get_stages_info()
@@ -514,7 +639,9 @@ class RayPipeline:
         """
         current_count = len(current_replicas)
         num_to_remove = current_count - target_count
-        logger.
+        logger.debug(
+            f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
+        )

         # Basic validation
         if num_to_remove <= 0:
@@ -562,7 +689,7 @@ class RayPipeline:
             logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")

         total_attempted = len(actors_to_remove)
-        logger.
+        logger.debug(
             f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
             f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
         )
@@ -645,9 +772,6 @@ class RayPipeline:
         # Activity check
         is_quiet = global_in_flight <= self.quiet_period_threshold

-        if is_quiet:
-            logger.info(f"Pipeline IS quiet. In-Flight: {global_in_flight} <= Threshold: {self.quiet_period_threshold}")
-
         return is_quiet

     def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
@@ -668,7 +792,6 @@ class RayPipeline:
             return False

         # --- Trigger immediate stats collection via the collector instance ---
-        drain_stats = {}
         drain_success = False
         collection_error = None

@@ -687,19 +810,18 @@ class RayPipeline:
                 if not collection_error
                 else f"Collection Error: {type(collection_error).__name__}"
             )
-            logger.
-                f"[
+            logger.debug(
+                f"[Drain] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
             )
             last_in_flight = global_in_flight

             # --- Check for successful drain ---
             # Requires BOTH in-flight=0 AND the collection reporting it was successful
             if global_in_flight == 0 and drain_success and not collection_error:
-                logger.info(f"Pipeline confirmed drained (In-Flight=0) in {elapsed_time:.1f}s.")
                 return True
             elif global_in_flight == 0:  # Saw zero, but collection wasn't fully successful
                 logger.warning(
-                    "[
+                    "[Drain] In-Flight reached 0, but stats collection had errors/timeouts."
                     " Cannot confirm drain yet."
                 )

@@ -711,13 +833,12 @@ class RayPipeline:

     def _execute_queue_flush(self) -> bool:
         """Executes queue flush, using topology for state and structure."""
-        if self.topology.get_is_flushing():  # Check topology state
-            logger.warning("Queue flush requested but already in progress. Ignoring.")
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology state
+            logger.warning("Queue flush requested but already in progress or pipeline is stopping. Ignoring.")
             return False

         # Set flushing state in topology
         self.topology.set_flushing(True)
-        logger.info("--- Starting Queue Flush ---")
         overall_success = False
         source_actors_paused = []
         pause_refs = []
@@ -732,7 +853,7 @@ class RayPipeline:
         current_connections = self.topology.get_connections()

         # --- 1. Pause Source Stages (using snapshots) ---
-        logger.
+        logger.debug("Pausing source stages...")
         pause_timeout = 60.0
         for stage in current_stages:
             if stage.is_source:
@@ -745,22 +866,22 @@ class RayPipeline:
                     except Exception as e:
                         logger.error(f"Failed sending pause to {actor}: {e}")
         if pause_refs:
-            logger.
+            logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
             try:
                 ray.get(pause_refs, timeout=pause_timeout)
-                logger.
+                logger.debug(f"{len(pause_refs)} sources acknowledged pause.")
             except GetTimeoutError:
                 logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
             except Exception as e:
                 logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")

         # --- 2. Wait for Drain ---
-        logger.
+        logger.debug("Waiting for pipeline to drain...")
         if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
             raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")

         # --- 3. Create New Queues (using snapshot) ---
-        logger.
+        logger.debug("Creating new replacement queues...")
         new_edge_queues_map = {}
         for queue_name, (_, queue_size) in current_edge_queues.items():
             try:
@@ -773,7 +894,7 @@ class RayPipeline:
                 raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e

         # --- 4. Re-wire Actors to New Queues (using snapshots) ---
-        logger.
+        logger.debug("Re-wiring actors to new queues...")
         wiring_refs = []
         wiring_timeout = 120.0
         for from_stage_name, conns in current_connections.items():
@@ -809,7 +930,7 @@ class RayPipeline:
             raise RuntimeError("Actor re-wiring failed.") from e

         # --- 5. Update Topology State (Commit Point) ---
-        logger.
+        logger.debug("Committing new queues to pipeline topology.")
         self.topology.set_edge_queues(new_edge_queues_map)  # Commit the change
         overall_success = True

@@ -820,7 +941,7 @@ class RayPipeline:
         finally:
             # --- 6. Resume Source Stages (Always attempt) ---
             if source_actors_paused:
-                logger.
+                logger.debug(f"Attempting to resume {len(source_actors_paused)} source actors...")
                 resume_timeout = 30.0
                 resume_refs = []
                 for actor in source_actors_paused:
@@ -829,10 +950,10 @@ class RayPipeline:
                     except Exception as e:
                         logger.error(f"Failed sending resume to {actor}: {e}")
                 if resume_refs:
-                    logger.
+                    logger.debug(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
                     try:
                         ray.get(resume_refs, timeout=resume_timeout)
-                        logger.
+                        logger.debug(f"{len(resume_refs)} sources resumed.")
                     except GetTimeoutError:
                         logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
                     except Exception as e:
@@ -841,9 +962,6 @@ class RayPipeline:
             # Update flush timestamp only on success
             if overall_success:
                 self._last_queue_flush_time = time.time()
-                logger.info("--- Queue Flush Completed Successfully ---")
-            else:
-                logger.error("--- Queue Flush Failed ---")

             # Reset flushing state in topology
             self.topology.set_flushing(False)
@@ -853,8 +971,9 @@ class RayPipeline:
     def request_queue_flush(self, force: bool = False) -> None:
         """Requests a queue flush, checking topology state."""
         logger.info(f"Manual queue flush requested (force={force}).")
-
-
+
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology
+            logger.warning("Flush already in progress or pipeline is stopping.")
             return
         if force or self._is_pipeline_quiet():
             # Consider running _execute_queue_flush in a separate thread
@@ -974,7 +1093,7 @@ class RayPipeline:

             if target_replica_count != current_count:
                 stages_needing_action.append((stage_name, target_replica_count))
-                logger.
+                logger.debug(
                     f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
                     f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
                 )
@@ -1016,69 +1135,80 @@ class RayPipeline:
         completed = sum(1 for r in action_results.values() if r["status"] == "completed")
         errors = sum(1 for r in action_results.values() if r["status"] == "error")
         timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
-        logger.
+        logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")

     def _perform_scaling_and_maintenance(self) -> None:
         """Orchestrates scaling/maintenance using topology and stats collector."""
-
+
+        if self._stopping:
+            logger.debug("Pipeline is stopping. Skipping scaling cycle.")
+            return

         if not self.dynamic_memory_scaling:
             logger.debug("Dynamic memory scaling disabled. Skipping cycle.")
             return

-        cycle_start_time = time.time()
-
-        # Check flushing state via topology
         if self.topology.get_is_flushing():
             logger.debug("Skipping scaling cycle: Queue flush in progress (topology state).")
             return

-
+        got_lock = self._state_lock.acquire(timeout=0.1)
+        if not got_lock:
+            logger.debug("Could not acquire lock for maintenance; skipping cycle.")
+            return
+
+        cycle_start_time = time.time()
         try:
+            if self._stopping:
+                logger.debug("Pipeline began stopping after acquiring lock. Skipping maintenance logic.")
+                return
+
+            logger.debug("--- Performing Scaling & Maintenance Cycle ---")
+
             if self._is_pipeline_quiet():
-                logger.info("Pipeline quiet, initiating queue flush.")
-                flush_success = self._execute_queue_flush()
-                logger.info(f"Automatic queue flush completed. Success: {flush_success}")
-                return
-        except Exception as e:
-            logger.error(f"Error during quiet check or flush: {e}. Skipping cycle.", exc_info=True)
-            return
+                logger.info("[Drain] Pipeline quiet, initiating queue flush.")
+                flush_success = self._execute_queue_flush()
+                logger.info(f"[Drain] Automatic queue flush completed. Success: {flush_success}")
+                return

-
-
-
-            )
+            # Fast return check if stopping occurred while flushing or checking flush status
+            if self._stopping:
+                return

-
-
-            if not current_stage_stats or not stats_were_successful or last_update_age > max_stats_age_for_scaling:
-                status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
-                logger.warning(
-                    f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+            current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
+                self.stats_collector.get_latest_stats()
             )
-                return

-
-
-
-
-
+            last_update_age = time.time() - last_update_time
+            max_age = max(15.0, self._stats_collection_interval_seconds)
+            if not current_stage_stats or not stats_were_successful or last_update_age > max_age:
+                status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
+                logger.warning(
+                    f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+                )
+                return

-
-
+            current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
+            if not current_stage_metrics:
+                logger.error("[Scaling] Failed to gather metrics. Skipping.")
+                return

-
-
-
-
+            current_global_memory_mb = self._get_current_global_memory()
+            final_adjustments = self._calculate_scaling_adjustments(
+                current_stage_metrics, global_in_flight, current_global_memory_mb
+            )
+            self.prev_global_memory_usage = current_global_memory_mb
+            self._apply_scaling_actions(final_adjustments)

-
-
+            logger.debug(
+                f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---"
+            )

-
-
+        except Exception as e:  # noqa
+            logger.error("Exception during maintenance cycle", exc_info=True)

-
+        finally:
+            self._state_lock.release()

     # --- Lifecycle Methods for Monitoring/Scaling Threads ---
     def _scaling_loop(self, interval: float) -> None:
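The maintenance hunk above guards each cycle with a non-blocking lock acquisition and a re-checked _stopping flag. A distilled sketch of that pattern (illustrative, not a verbatim excerpt from the release):

import threading

state_lock = threading.Lock()
stopping = False


def maintenance_cycle() -> None:
    if stopping:
        return
    # Skip the cycle instead of blocking if stop() or another cycle holds the lock.
    if not state_lock.acquire(timeout=0.1):
        return
    try:
        if stopping:  # re-check: stop() may have started while we waited on the lock
            return
        # ... collect stats, compute scaling adjustments, apply them ...
    finally:
        state_lock.release()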
@@ -1149,39 +1279,44 @@ class RayPipeline:
         """Stops background threads and actors (via topology)."""
         logger.info("Stopping pipeline...")

+        if self._stopping:
+            return
+        self._stopping = True
+
         # 1. Stop background threads first
-        self.
-
+        with self._state_lock:
+            self._stop_scaling()
+            self.stats_collector.stop()
+
+        # 2. Stop actors (using topology)
+        logger.debug("Stopping all stage actors...")
+        stop_refs_map: Dict[ray.ObjectRef, Any] = {}

-
-
-        stop_refs_map: Dict[ray.ObjectRef, Any] = {}
-        actors_to_kill = []
+        # Get actors snapshot from topology
+        current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}

-
-
+        for stage_name, actors in current_actors.items():
+            for actor in actors:
+                try:
+                    stop_refs_map[actor.stop.remote()] = actor
+                except Exception as e:
+                    logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Skipping.")

-
-
+        if stop_refs_map:
+            stop_refs = list(stop_refs_map.keys())
+            logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
             try:
-
+                ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
+                if not_ready:
+                    logger.warning(
+                        f"Timeout waiting for {len(not_ready)} actors to stop. Allowing Ray to clean up."
+                    )
+                logger.info(f"{len(ready)} actors stopped via stop().")
             except Exception as e:
-                logger.
-
-        if stop_refs_map:
-            stop_refs = list(stop_refs_map.keys())
-            logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
-            try:
-                ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
-                if not_ready:
-                    logger.warning(f"Timeout waiting for {len(not_ready)} actors to stop. Will kill.")
-                    actors_to_kill.extend(stop_refs_map.get(ref) for ref in not_ready if stop_refs_map.get(ref))
-                logger.info(f"{len(ready)} actors stopped via stop().")
-            except Exception as e:
-                logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
-                actors_to_kill.extend(a for a in stop_refs_map.values() if a not in actors_to_kill)  # Add all on error
+                logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)

-
-
+        # Clear runtime state in topology
+        self.topology.clear_runtime_state()
+        del self.topology

-
+        logger.info("Pipeline stopped.")