nv-ingest 2025.5.21.dev20250521.tar.gz → 2025.5.29.dev20250529.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/PKG-INFO +6 -4
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/main.py +3 -1
- nv_ingest-2025.5.29.dev20250529/nv_ingest/api/v1/metrics.py +29 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +20 -3
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +233 -98
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +1 -1
- nv_ingest-2025.5.29.dev20250529/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +1 -1
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +33 -33
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +7 -2
- nv_ingest-2025.5.29.dev20250529/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +376 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +15 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/PKG-INFO +6 -4
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/SOURCES.txt +2 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/requires.txt +5 -3
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/pyproject.toml +5 -3
- nv_ingest-2025.5.21.dev20250521/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -170
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/LICENSE +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/MANIFEST.in +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/health.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/ingest.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/version.py +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/dependency_links.txt +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/top_level.txt +0 -0
- {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/setup.cfg +0 -0
{nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.5.21.dev20250521
+Version: 2025.5.29.dev20250529
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -225,13 +225,13 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: isodate>=0.7.2
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: minio>=7.2.12
-Requires-Dist: openai>=1.
+Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
 Requires-Dist: pydantic>2.0.0
 Requires-Dist: pydantic-settings>2.0.0
-Requires-Dist: pypdfium2
+Requires-Dist: pypdfium2==4.30.1
 Requires-Dist: pytest>=8.0.2
 Requires-Dist: pytest-mock>=3.14.0
 Requires-Dist: pytest-cov>=6.0.0
@@ -239,6 +239,7 @@ Requires-Dist: build>=1.2.2
 Requires-Dist: python-docx>=1.1.2
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: prometheus-client
 Requires-Dist: torch==2.4.1
 Requires-Dist: ray[all]>=2.37.0
 Requires-Dist: redis>=5.2.1
@@ -255,9 +256,10 @@ Requires-Dist: uvicorn
 Requires-Dist: pip
 Requires-Dist: llama-index-embeddings-nvidia
 Requires-Dist: opencv-python
-Requires-Dist: pymilvus>=2.5.
+Requires-Dist: pymilvus>=2.5.10
 Requires-Dist: pymilvus[bulk_writer,model]
 Requires-Dist: tritonclient
 Requires-Dist: nvidia-riva-client>=2.18.0
 Requires-Dist: unstructured-client
+Requires-Dist: markitdown
 Dynamic: license-file
{nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/main.py

@@ -14,6 +14,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor

 from .v1.health import router as HealthApiRouter
 from .v1.ingest import router as IngestApiRouter
+from .v1.metrics import router as MetricsApiRouter

 logger = logging.getLogger(__name__)

@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
 app = FastAPI(
     title="NV-Ingest Microservice",
     description="Service for ingesting heterogenous datatypes",
-    version="25.
+    version="25.4.2",
     contact={
         "name": "NVIDIA Corporation",
         "url": "https://nvidia.com",
@@ -31,6 +32,7 @@ app = FastAPI(

 app.include_router(IngestApiRouter, prefix="/v1")
 app.include_router(HealthApiRouter, prefix="/v1/health")
+app.include_router(MetricsApiRouter, prefix="/v1")

 # Set up the tracer provider and add a processor for exporting traces
 resource = Resource(attributes={"service.name": "nv-ingest"})
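With MetricsApiRouter mounted under the "/v1" prefix, the new Prometheus endpoint is served at /v1/metrics. A minimal sketch of scraping it with httpx (already a declared dependency); the host and port below are assumptions for illustration, not values taken from this diff:

import httpx

resp = httpx.get("http://localhost:7670/v1/metrics", timeout=5.0)  # host/port assumed
print(resp.headers.get("content-type"))            # Prometheus text exposition format
print("\n".join(resp.text.splitlines()[:5]))       # first few metric lines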
nv_ingest-2025.5.29.dev20250529/nv_ingest/api/v1/metrics.py (new file)

@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+
+from fastapi import APIRouter, Response, status
+from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
+
+router = APIRouter()
+
+# logger = logging.getLogger("uvicorn")
+logger = logging.getLogger(__name__)
+
+# Prometheus metrics
+REQUEST_COUNT = Counter("http_requests_total", "Total HTTP Requests", ["method", "endpoint"])
+REQUEST_LATENCY = Histogram("http_request_duration_seconds", "Request latency", ["method", "endpoint"])
+
+
+@router.get(
+    "/metrics",
+    tags=["Health"],
+    summary="Provide prometheus formatted metrics for consumption",
+    description="""
+        Provide prometheus formatted metrics for consumption by a prometheus scraping server.
+    """,
+    status_code=status.HTTP_200_OK,
+)
+def metrics():
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
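The new module defines REQUEST_COUNT and REQUEST_LATENCY collectors, but the diff shown here does not wire them into request handling. A hypothetical sketch, not part of this release, of one way they could be fed from a FastAPI HTTP middleware:

import time

from fastapi import FastAPI, Request

from nv_ingest.api.v1.metrics import REQUEST_COUNT, REQUEST_LATENCY, router as metrics_router

app = FastAPI()
app.include_router(metrics_router, prefix="/v1")


@app.middleware("http")
async def record_request_metrics(request: Request, call_next):
    # Time the downstream handler and record count/latency per method and path.
    start = time.perf_counter()
    response = await call_next(request)
    labels = {"method": request.method, "endpoint": request.url.path}
    REQUEST_COUNT.labels(**labels).inc()
    REQUEST_LATENCY.labels(**labels).observe(time.perf_counter() - start)
    return response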
{nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py

@@ -74,9 +74,26 @@ class PipelineTopology:
         self._start_cleanup_thread()  # Start background cleanup on init

     def __del__(self):
-        """Ensure cleanup thread is stopped
-        logger.debug("PipelineTopology destructor called
-
+        """Ensure cleanup thread is stopped and internal actor references are released."""
+        logger.debug("PipelineTopology destructor called. Cleaning up thread and actor references.")
+
+        # Stop the background cleanup thread
+        try:
+            self._stop_cleanup_thread()
+        except Exception as e:
+            logger.warning(f"Error stopping cleanup thread during __del__: {e}")
+
+        # Clear references to actor handles and shutdown futures
+        try:
+            self._stage_actors.clear()
+            self._edge_queues.clear()
+            self._scaling_state.clear()
+            self._stage_memory_overhead.clear()
+            self._pending_removal_actors.clear()
+            self._stages.clear()
+            self._connections.clear()
+        except Exception as e:
+            logger.warning(f"Error clearing internal state during __del__: {e}")

     # --- Lock Context Manager ---
     @contextlib.contextmanager
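The expanded __del__ explicitly drops the containers that hold Ray actor handles. A small illustration of the behavior this relies on, assuming default (non-detached) actors: Ray actors are reference-counted, so releasing the last handle makes the actor eligible for teardown.

import ray


@ray.remote
class Worker:
    def ping(self):
        return "pong"


ray.init()
handles = [Worker.remote() for _ in range(2)]
print(ray.get([h.ping.remote() for h in handles]))  # ['pong', 'pong']
handles.clear()  # analogous to self._stage_actors.clear(); the actors can now be reclaimed
ray.shutdown()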
{nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py

@@ -2,7 +2,11 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

+import multiprocessing
+import os
+import signal
 import threading
+from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass

@@ -24,6 +28,35 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import P
 logger = logging.getLogger(__name__)


+class PipelineInterface(ABC):
+    """
+    Abstract base class for pipeline implementations.
+
+    Any concrete pipeline must implement start and stop methods.
+    """
+
+    @abstractmethod
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Start the pipeline.
+
+        Parameters
+        ----------
+        monitor_poll_interval : float
+            Interval in seconds for monitoring poll (default: 5.0).
+        scaling_poll_interval : float
+            Interval in seconds for scaling decisions (default: 30.0).
+        """
+        pass
+
+    @abstractmethod
+    def stop(self) -> None:
+        """
+        Stop the pipeline and perform any necessary cleanup.
+        """
+        pass
+
+
 # --- Configuration Objects ---


@@ -62,7 +95,90 @@ class StatsConfig:
     queue_timeout_seconds: float = 2.0


-class
+class RayPipelineSubprocessInterface(PipelineInterface):
+    """
+    Pipeline interface implementation for a subprocess-based Ray pipeline.
+    """
+
+    def __init__(self, process: multiprocessing.Process):
+        """
+        Parameters
+        ----------
+        process : multiprocessing.Process
+            A handle to the running subprocess.
+        """
+        self._process: multiprocessing.Process = process
+
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Start is not supported because the subprocess is assumed to already be running.
+        """
+        pass
+
+    def stop(self) -> None:
+        """
+        Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
+        """
+        if not self._process.is_alive():
+            return
+
+        try:
+            self._process.terminate()
+            self._process.join(timeout=5.0)
+        except Exception as e:
+            logger.warning(f"Failed to terminate process cleanly: {e}")
+
+        if self._process.is_alive():
+            try:
+                pgid = os.getpgid(self._process.pid)
+                os.killpg(pgid, signal.SIGKILL)
+            except Exception as e:
+                logger.error(f"Failed to force-kill process group: {e}")
+            self._process.join(timeout=3.0)
+
+
+class RayPipelineInterface(PipelineInterface):
+    """
+    Pipeline interface for an in-process RayPipeline instance.
+    """
+
+    def __init__(self, pipeline: "RayPipeline"):
+        """
+        Parameters
+        ----------
+        pipeline : RayPipeline
+            The instantiated pipeline to control.
+        """
+        self._pipeline = pipeline
+
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Starts the RayPipeline.
+
+        Parameters
+        ----------
+        monitor_poll_interval : float
+            Unused here; provided for interface compatibility.
+        scaling_poll_interval : float
+            Unused here; provided for interface compatibility.
+        """
+        self._pipeline.start(monitor_poll_interval, scaling_poll_interval)
+
+    def stop(self) -> None:
+        """
+        Stops the RayPipeline and shuts down Ray.
+        """
+        self._pipeline.stop()
+
+        try:
+            import ray
+
+            ray.shutdown()
+        except Exception:
+            pass
+
+
+class RayPipeline(PipelineInterface):
     """
     A structured pipeline supporting dynamic scaling and queue flushing.
     Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
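RayPipelineSubprocessInterface.stop() escalates from Process.terminate() to os.killpg with SIGKILL, which only isolates the pipeline cleanly if the child process leads its own process group. A hypothetical usage sketch under that assumption; run_pipeline_process and the os.setsid() call are illustrative and not taken from this diff (the release also adds a rewritten pipeline_runners.py, not shown here, that owns the real launch path):

import multiprocessing
import os

from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
    RayPipelineSubprocessInterface,
)


def run_pipeline_process() -> None:
    os.setsid()  # become a process-group/session leader so killpg() targets only the pipeline
    # ... build and start the Ray pipeline inside this process ...


if __name__ == "__main__":
    proc = multiprocessing.Process(target=run_pipeline_process, daemon=False)
    proc.start()

    pipeline = RayPipelineSubprocessInterface(proc)
    # ... interact with the running service ...
    pipeline.stop()  # terminate(), then SIGKILL to the whole process group if it is still alive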
@@ -91,6 +207,8 @@ class RayPipeline:
         # --- State ---
         # self.scaling_state: Dict[str, str] = {}
         self.prev_global_memory_usage: Optional[int] = None
+        self._state_lock: threading.Lock = threading.Lock()
+        self._stopping = False

         # --- Build Time Config & State ---
         # Use scaling_config for these
@@ -149,10 +267,17 @@ class RayPipeline:
             actor_timeout=self.stats_config.actor_timeout_seconds,
             queue_timeout=self.stats_config.queue_timeout_seconds,
         )
+
         logger.info("RayStatsCollector initialized using StatsConfig.")

     # --- Accessor Methods for Stats Collector (and internal use) ---

+    def __del__(self):
+        try:
+            self.stop()
+        except Exception as e:
+            logger.error(f"Exception during RayPipeline cleanup: {e}")
+
     def get_stages_info(self) -> List[StageInfo]:
         """Returns a snapshot of the current stage information."""
         return self.topology.get_stages_info()
@@ -514,7 +639,9 @@ class RayPipeline:
         """
         current_count = len(current_replicas)
         num_to_remove = current_count - target_count
-        logger.
+        logger.debug(
+            f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
+        )

         # Basic validation
         if num_to_remove <= 0:
@@ -562,7 +689,7 @@ class RayPipeline:
             logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")

         total_attempted = len(actors_to_remove)
-        logger.
+        logger.debug(
             f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
             f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
         )
@@ -645,9 +772,6 @@ class RayPipeline:
         # Activity check
         is_quiet = global_in_flight <= self.quiet_period_threshold

-        if is_quiet:
-            logger.info(f"Pipeline IS quiet. In-Flight: {global_in_flight} <= Threshold: {self.quiet_period_threshold}")
-
         return is_quiet

     def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
@@ -668,7 +792,6 @@ class RayPipeline:
             return False

         # --- Trigger immediate stats collection via the collector instance ---
-        drain_stats = {}
         drain_success = False
         collection_error = None

@@ -687,19 +810,18 @@ class RayPipeline:
                 if not collection_error
                 else f"Collection Error: {type(collection_error).__name__}"
             )
-            logger.
-                f"[
+            logger.debug(
+                f"[Drain] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
             )
             last_in_flight = global_in_flight

             # --- Check for successful drain ---
             # Requires BOTH in-flight=0 AND the collection reporting it was successful
             if global_in_flight == 0 and drain_success and not collection_error:
-                logger.info(f"Pipeline confirmed drained (In-Flight=0) in {elapsed_time:.1f}s.")
                 return True
             elif global_in_flight == 0:  # Saw zero, but collection wasn't fully successful
                 logger.warning(
-                    "[
+                    "[Drain] In-Flight reached 0, but stats collection had errors/timeouts."
                     " Cannot confirm drain yet."
                 )

@@ -711,13 +833,12 @@ class RayPipeline:

     def _execute_queue_flush(self) -> bool:
         """Executes queue flush, using topology for state and structure."""
-        if self.topology.get_is_flushing():  # Check topology state
-            logger.warning("Queue flush requested but already in progress. Ignoring.")
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology state
+            logger.warning("Queue flush requested but already in progress or pipeline is stopping. Ignoring.")
             return False

         # Set flushing state in topology
         self.topology.set_flushing(True)
-        logger.info("--- Starting Queue Flush ---")
         overall_success = False
         source_actors_paused = []
         pause_refs = []
@@ -732,7 +853,7 @@ class RayPipeline:
         current_connections = self.topology.get_connections()

         # --- 1. Pause Source Stages (using snapshots) ---
-        logger.
+        logger.debug("Pausing source stages...")
         pause_timeout = 60.0
         for stage in current_stages:
             if stage.is_source:
@@ -745,22 +866,22 @@ class RayPipeline:
                     except Exception as e:
                         logger.error(f"Failed sending pause to {actor}: {e}")
         if pause_refs:
-            logger.
+            logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
             try:
                 ray.get(pause_refs, timeout=pause_timeout)
-                logger.
+                logger.debug(f"{len(pause_refs)} sources acknowledged pause.")
             except GetTimeoutError:
                 logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
             except Exception as e:
                 logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")

         # --- 2. Wait for Drain ---
-        logger.
+        logger.debug("Waiting for pipeline to drain...")
         if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
             raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")

         # --- 3. Create New Queues (using snapshot) ---
-        logger.
+        logger.debug("Creating new replacement queues...")
         new_edge_queues_map = {}
         for queue_name, (_, queue_size) in current_edge_queues.items():
             try:
@@ -773,7 +894,7 @@ class RayPipeline:
                 raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e

         # --- 4. Re-wire Actors to New Queues (using snapshots) ---
-        logger.
+        logger.debug("Re-wiring actors to new queues...")
         wiring_refs = []
         wiring_timeout = 120.0
         for from_stage_name, conns in current_connections.items():
@@ -809,7 +930,7 @@ class RayPipeline:
             raise RuntimeError("Actor re-wiring failed.") from e

         # --- 5. Update Topology State (Commit Point) ---
-        logger.
+        logger.debug("Committing new queues to pipeline topology.")
         self.topology.set_edge_queues(new_edge_queues_map)  # Commit the change
         overall_success = True

@@ -820,7 +941,7 @@ class RayPipeline:
         finally:
             # --- 6. Resume Source Stages (Always attempt) ---
             if source_actors_paused:
-                logger.
+                logger.debug(f"Attempting to resume {len(source_actors_paused)} source actors...")
                 resume_timeout = 30.0
                 resume_refs = []
                 for actor in source_actors_paused:
@@ -829,10 +950,10 @@ class RayPipeline:
                     except Exception as e:
                         logger.error(f"Failed sending resume to {actor}: {e}")
                 if resume_refs:
-                    logger.
+                    logger.debug(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
                     try:
                         ray.get(resume_refs, timeout=resume_timeout)
-                        logger.
+                        logger.debug(f"{len(resume_refs)} sources resumed.")
                     except GetTimeoutError:
                         logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
                     except Exception as e:
@@ -841,9 +962,6 @@ class RayPipeline:
             # Update flush timestamp only on success
             if overall_success:
                 self._last_queue_flush_time = time.time()
-                logger.info("--- Queue Flush Completed Successfully ---")
-            else:
-                logger.error("--- Queue Flush Failed ---")

             # Reset flushing state in topology
             self.topology.set_flushing(False)
@@ -853,8 +971,9 @@ class RayPipeline:
     def request_queue_flush(self, force: bool = False) -> None:
         """Requests a queue flush, checking topology state."""
         logger.info(f"Manual queue flush requested (force={force}).")
-
-
+
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology
+            logger.warning("Flush already in progress or pipeline is stopping.")
             return
         if force or self._is_pipeline_quiet():
             # Consider running _execute_queue_flush in a separate thread
@@ -974,7 +1093,7 @@ class RayPipeline:

             if target_replica_count != current_count:
                 stages_needing_action.append((stage_name, target_replica_count))
-                logger.
+                logger.debug(
                     f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
                     f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
                 )
@@ -1016,69 +1135,80 @@ class RayPipeline:
         completed = sum(1 for r in action_results.values() if r["status"] == "completed")
         errors = sum(1 for r in action_results.values() if r["status"] == "error")
         timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
-        logger.
+        logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")

     def _perform_scaling_and_maintenance(self) -> None:
         """Orchestrates scaling/maintenance using topology and stats collector."""
-
+
+        if self._stopping:
+            logger.debug("Pipeline is stopping. Skipping scaling cycle.")
+            return

         if not self.dynamic_memory_scaling:
             logger.debug("Dynamic memory scaling disabled. Skipping cycle.")
             return

-        cycle_start_time = time.time()
-
-        # Check flushing state via topology
         if self.topology.get_is_flushing():
             logger.debug("Skipping scaling cycle: Queue flush in progress (topology state).")
             return

-
+        got_lock = self._state_lock.acquire(timeout=0.1)
+        if not got_lock:
+            logger.debug("Could not acquire lock for maintenance; skipping cycle.")
+            return
+
+        cycle_start_time = time.time()
         try:
+            if self._stopping:
+                logger.debug("Pipeline began stopping after acquiring lock. Skipping maintenance logic.")
+                return
+
+            logger.debug("--- Performing Scaling & Maintenance Cycle ---")
+
             if self._is_pipeline_quiet():
-                logger.info("Pipeline quiet, initiating queue flush.")
-                flush_success = self._execute_queue_flush()
-                logger.info(f"Automatic queue flush completed. Success: {flush_success}")
-                return
-        except Exception as e:
-            logger.error(f"Error during quiet check or flush: {e}. Skipping cycle.", exc_info=True)
-            return
+                logger.info("[Drain] Pipeline quiet, initiating queue flush.")
+                flush_success = self._execute_queue_flush()
+                logger.info(f"[Drain] Automatic queue flush completed. Success: {flush_success}")
+                return

-
-
-
-            )
+            # Fast return check if stopping occurred while flushing or checking flush status
+            if self._stopping:
+                return

-
-
-            if not current_stage_stats or not stats_were_successful or last_update_age > max_stats_age_for_scaling:
-                status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
-                logger.warning(
-                    f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+            current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
+                self.stats_collector.get_latest_stats()
             )
-                return

-
-
-
-
-
+            last_update_age = time.time() - last_update_time
+            max_age = max(15.0, self._stats_collection_interval_seconds)
+            if not current_stage_stats or not stats_were_successful or last_update_age > max_age:
+                status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
+                logger.warning(
+                    f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+                )
+                return

-
-
+            current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
+            if not current_stage_metrics:
+                logger.error("[Scaling] Failed to gather metrics. Skipping.")
+                return

-
-
-
-
+            current_global_memory_mb = self._get_current_global_memory()
+            final_adjustments = self._calculate_scaling_adjustments(
+                current_stage_metrics, global_in_flight, current_global_memory_mb
+            )
+            self.prev_global_memory_usage = current_global_memory_mb
+            self._apply_scaling_actions(final_adjustments)

-
-
+            logger.debug(
+                f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---"
+            )

-
-
+        except Exception as e:  # noqa
+            logger.error("Exception during maintenance cycle", exc_info=True)

-
+        finally:
+            self._state_lock.release()

     # --- Lifecycle Methods for Monitoring/Scaling Threads ---
     def _scaling_loop(self, interval: float) -> None:
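The maintenance hunk above guards each cycle with a non-blocking lock acquisition and a re-checked _stopping flag. A distilled sketch of that pattern (illustrative, not a verbatim excerpt from the release):

import threading

state_lock = threading.Lock()
stopping = False


def maintenance_cycle() -> None:
    if stopping:
        return
    # Skip the cycle instead of blocking if stop() or another cycle holds the lock.
    if not state_lock.acquire(timeout=0.1):
        return
    try:
        if stopping:  # re-check: stop() may have started while we waited on the lock
            return
        # ... collect stats, compute scaling adjustments, apply them ...
    finally:
        state_lock.release()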
@@ -1149,39 +1279,44 @@ class RayPipeline:
         """Stops background threads and actors (via topology)."""
         logger.info("Stopping pipeline...")

+        if self._stopping:
+            return
+        self._stopping = True
+
         # 1. Stop background threads first
-        self.
-
+        with self._state_lock:
+            self._stop_scaling()
+            self.stats_collector.stop()
+
+        # 2. Stop actors (using topology)
+        logger.debug("Stopping all stage actors...")
+        stop_refs_map: Dict[ray.ObjectRef, Any] = {}

-
-
-        stop_refs_map: Dict[ray.ObjectRef, Any] = {}
-        actors_to_kill = []
+        # Get actors snapshot from topology
+        current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}

-
-
+        for stage_name, actors in current_actors.items():
+            for actor in actors:
+                try:
+                    stop_refs_map[actor.stop.remote()] = actor
+                except Exception as e:
+                    logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Skipping.")

-
-
+        if stop_refs_map:
+            stop_refs = list(stop_refs_map.keys())
+            logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
             try:
-
+                ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
+                if not_ready:
+                    logger.warning(
+                        f"Timeout waiting for {len(not_ready)} actors to stop. Allowing Ray to clean up."
+                    )
+                logger.info(f"{len(ready)} actors stopped via stop().")
             except Exception as e:
-                logger.
-
-        if stop_refs_map:
-            stop_refs = list(stop_refs_map.keys())
-            logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
-            try:
-                ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
-                if not_ready:
-                    logger.warning(f"Timeout waiting for {len(not_ready)} actors to stop. Will kill.")
-                    actors_to_kill.extend(stop_refs_map.get(ref) for ref in not_ready if stop_refs_map.get(ref))
-                logger.info(f"{len(ready)} actors stopped via stop().")
-            except Exception as e:
-                logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
-                actors_to_kill.extend(a for a in stop_refs_map.values() if a not in actors_to_kill)  # Add all on error
+                logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)

-
-
+        # Clear runtime state in topology
+        self.topology.clear_runtime_state()
+        del self.topology

-
+        logger.info("Pipeline stopped.")