nv-ingest 2025.5.21.dev20250521.tar.gz → 2025.5.29.dev20250529.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of nv-ingest might be problematic.

Files changed (108)
  1. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/PKG-INFO +6 -4
  2. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/main.py +3 -1
  3. nv_ingest-2025.5.29.dev20250529/nv_ingest/api/v1/metrics.py +29 -0
  4. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +20 -3
  5. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +233 -98
  6. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +1 -1
  7. nv_ingest-2025.5.29.dev20250529/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
  8. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +1 -1
  9. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +33 -33
  10. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +7 -2
  11. nv_ingest-2025.5.29.dev20250529/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +376 -0
  12. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +15 -0
  13. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/PKG-INFO +6 -4
  14. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/SOURCES.txt +2 -0
  15. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/requires.txt +5 -3
  16. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/pyproject.toml +5 -3
  17. nv_ingest-2025.5.21.dev20250521/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -170
  18. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/LICENSE +0 -0
  19. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/MANIFEST.in +0 -0
  20. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/__init__.py +0 -0
  21. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/__init__.py +0 -0
  22. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/__init__.py +0 -0
  23. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/health.py +0 -0
  24. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/ingest.py +0 -0
  25. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/__init__.py +0 -0
  26. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/__init__.py +0 -0
  27. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  28. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  29. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  30. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  31. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  32. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  33. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
  34. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  35. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  36. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  37. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  38. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  39. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  40. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  41. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  42. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  43. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  44. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  45. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  46. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  47. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  48. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  49. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  50. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
  51. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  52. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  53. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  54. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  55. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  56. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  57. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  58. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  59. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  60. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  61. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  62. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  63. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  64. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
  65. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  66. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  67. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  68. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  69. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  70. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  71. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  72. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  73. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  74. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  75. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  76. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  77. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  78. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  79. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  80. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  81. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/__init__.py +0 -0
  82. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  83. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  84. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  85. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  86. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  87. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  88. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  89. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  90. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  91. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  92. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  93. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/__init__.py +0 -0
  94. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  95. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  96. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/__init__.py +0 -0
  97. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  98. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  99. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  100. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  101. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  102. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  103. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  104. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  105. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/version.py +0 -0
  106. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/dependency_links.txt +0 -0
  107. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/top_level.txt +0 -0
  108. {nv_ingest-2025.5.21.dev20250521 → nv_ingest-2025.5.29.dev20250529}/setup.cfg +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.5.21.dev20250521
+Version: 2025.5.29.dev20250529
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -225,13 +225,13 @@ Requires-Dist: httpx>=0.28.1
 Requires-Dist: isodate>=0.7.2
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: minio>=7.2.12
-Requires-Dist: openai>=1.57.1
+Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
 Requires-Dist: pydantic>2.0.0
 Requires-Dist: pydantic-settings>2.0.0
-Requires-Dist: pypdfium2>=4.30.0
+Requires-Dist: pypdfium2==4.30.1
 Requires-Dist: pytest>=8.0.2
 Requires-Dist: pytest-mock>=3.14.0
 Requires-Dist: pytest-cov>=6.0.0
@@ -239,6 +239,7 @@ Requires-Dist: build>=1.2.2
 Requires-Dist: python-docx>=1.1.2
 Requires-Dist: python-dotenv>=1.0.1
 Requires-Dist: python-pptx>=1.0.2
+Requires-Dist: prometheus-client
 Requires-Dist: torch==2.4.1
 Requires-Dist: ray[all]>=2.37.0
 Requires-Dist: redis>=5.2.1
@@ -255,9 +256,10 @@ Requires-Dist: uvicorn
 Requires-Dist: pip
 Requires-Dist: llama-index-embeddings-nvidia
 Requires-Dist: opencv-python
-Requires-Dist: pymilvus>=2.5.0
+Requires-Dist: pymilvus>=2.5.10
 Requires-Dist: pymilvus[bulk_writer,model]
 Requires-Dist: tritonclient
 Requires-Dist: nvidia-riva-client>=2.18.0
 Requires-Dist: unstructured-client
+Requires-Dist: markitdown
 Dynamic: license-file

nv_ingest/api/main.py

@@ -14,6 +14,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
 
 from .v1.health import router as HealthApiRouter
 from .v1.ingest import router as IngestApiRouter
+from .v1.metrics import router as MetricsApiRouter
 
 logger = logging.getLogger(__name__)
 
@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
 app = FastAPI(
     title="NV-Ingest Microservice",
     description="Service for ingesting heterogenous datatypes",
-    version="25.3.0",
+    version="25.4.2",
     contact={
         "name": "NVIDIA Corporation",
         "url": "https://nvidia.com",
@@ -31,6 +32,7 @@ app = FastAPI(
 
 app.include_router(IngestApiRouter, prefix="/v1")
 app.include_router(HealthApiRouter, prefix="/v1/health")
+app.include_router(MetricsApiRouter, prefix="/v1")
 
 # Set up the tracer provider and add a processor for exporting traces
 resource = Resource(attributes={"service.name": "nv-ingest"})

nv_ingest/api/v1/metrics.py (new file)

@@ -0,0 +1,29 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+import logging
+
+from fastapi import APIRouter, Response, status
+from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
+
+router = APIRouter()
+
+# logger = logging.getLogger("uvicorn")
+logger = logging.getLogger(__name__)
+
+# Prometheus metrics
+REQUEST_COUNT = Counter("http_requests_total", "Total HTTP Requests", ["method", "endpoint"])
+REQUEST_LATENCY = Histogram("http_request_duration_seconds", "Request latency", ["method", "endpoint"])
+
+
+@router.get(
+    "/metrics",
+    tags=["Health"],
+    summary="Provide prometheus formatted metrics for consumption",
+    description="""
+        Provide prometheus formatted metrics for consumption by a prometheus scraping server.
+    """,
+    status_code=status.HTTP_200_OK,
+)
+def metrics():
+    return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
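
The new route simply serializes whatever the default prometheus_client registry holds at request time. A minimal sketch of scraping it, assuming the API is reachable at http://localhost:7670 (host and port are an assumption, not part of this diff); httpx is already a declared dependency of the package:

    # Sketch: fetch the Prometheus exposition text from the new /v1/metrics endpoint.
    import httpx

    resp = httpx.get("http://localhost:7670/v1/metrics")  # assumed host/port
    resp.raise_for_status()
    # Each line is Prometheus text format, e.g. 'http_requests_total{method="GET",endpoint="/v1/ingest"} 42.0'
    for line in resp.text.splitlines()[:10]:
        print(line)
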
nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py

@@ -74,9 +74,26 @@ class PipelineTopology:
         self._start_cleanup_thread()  # Start background cleanup on init
 
     def __del__(self):
-        """Ensure cleanup thread is stopped when topology object is destroyed."""
-        logger.debug("PipelineTopology destructor called, ensuring cleanup thread is stopped.")
-        self._stop_cleanup_thread()
+        """Ensure cleanup thread is stopped and internal actor references are released."""
+        logger.debug("PipelineTopology destructor called. Cleaning up thread and actor references.")
+
+        # Stop the background cleanup thread
+        try:
+            self._stop_cleanup_thread()
+        except Exception as e:
+            logger.warning(f"Error stopping cleanup thread during __del__: {e}")
+
+        # Clear references to actor handles and shutdown futures
+        try:
+            self._stage_actors.clear()
+            self._edge_queues.clear()
+            self._scaling_state.clear()
+            self._stage_memory_overhead.clear()
+            self._pending_removal_actors.clear()
+            self._stages.clear()
+            self._connections.clear()
+        except Exception as e:
+            logger.warning(f"Error clearing internal state during __del__: {e}")
 
     # --- Lock Context Manager ---
     @contextlib.contextmanager

nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py

@@ -2,7 +2,11 @@
 # All rights reserved.
 # SPDX-License-Identifier: Apache-2.0
 
+import multiprocessing
+import os
+import signal
 import threading
+from abc import ABC, abstractmethod
 from collections import defaultdict
 from dataclasses import dataclass
 
@@ -24,6 +28,35 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import P
 logger = logging.getLogger(__name__)
 
 
+class PipelineInterface(ABC):
+    """
+    Abstract base class for pipeline implementations.
+
+    Any concrete pipeline must implement start and stop methods.
+    """
+
+    @abstractmethod
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Start the pipeline.
+
+        Parameters
+        ----------
+        monitor_poll_interval : float
+            Interval in seconds for monitoring poll (default: 5.0).
+        scaling_poll_interval : float
+            Interval in seconds for scaling decisions (default: 30.0).
+        """
+        pass
+
+    @abstractmethod
+    def stop(self) -> None:
+        """
+        Stop the pipeline and perform any necessary cleanup.
+        """
+        pass
+
+
 # --- Configuration Objects ---
 
 
@@ -62,7 +95,90 @@ class StatsConfig:
     queue_timeout_seconds: float = 2.0
 
 
-class RayPipeline:
+class RayPipelineSubprocessInterface(PipelineInterface):
+    """
+    Pipeline interface implementation for a subprocess-based Ray pipeline.
+    """
+
+    def __init__(self, process: multiprocessing.Process):
+        """
+        Parameters
+        ----------
+        process : multiprocessing.Process
+            A handle to the running subprocess.
+        """
+        self._process: multiprocessing.Process = process
+
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Start is not supported because the subprocess is assumed to already be running.
+        """
+        pass
+
+    def stop(self) -> None:
+        """
+        Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
+        """
+        if not self._process.is_alive():
+            return
+
+        try:
+            self._process.terminate()
+            self._process.join(timeout=5.0)
+        except Exception as e:
+            logger.warning(f"Failed to terminate process cleanly: {e}")
+
+        if self._process.is_alive():
+            try:
+                pgid = os.getpgid(self._process.pid)
+                os.killpg(pgid, signal.SIGKILL)
+            except Exception as e:
+                logger.error(f"Failed to force-kill process group: {e}")
+            self._process.join(timeout=3.0)
+
+
+class RayPipelineInterface(PipelineInterface):
+    """
+    Pipeline interface for an in-process RayPipeline instance.
+    """
+
+    def __init__(self, pipeline: "RayPipeline"):
+        """
+        Parameters
+        ----------
+        pipeline : RayPipeline
+            The instantiated pipeline to control.
+        """
+        self._pipeline = pipeline
+
+    def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
+        """
+        Starts the RayPipeline.
+
+        Parameters
+        ----------
+        monitor_poll_interval : float
+            Unused here; provided for interface compatibility.
+        scaling_poll_interval : float
+            Unused here; provided for interface compatibility.
+        """
+        self._pipeline.start(monitor_poll_interval, scaling_poll_interval)
+
+    def stop(self) -> None:
+        """
+        Stops the RayPipeline and shuts down Ray.
+        """
+        self._pipeline.stop()
+
+        try:
+            import ray
+
+            ray.shutdown()
+        except Exception:
+            pass
+
+
+class RayPipeline(PipelineInterface):
     """
     A structured pipeline supporting dynamic scaling and queue flushing.
     Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
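
The two wrappers above give callers a single start/stop surface whether the pipeline runs in-process or in a child process. A rough usage sketch follows; the import path matches the diffed module, while the driver functions and their arguments are illustrative assumptions, not part of this release:

    # Illustrative only: driving the two PipelineInterface implementations added above.
    import multiprocessing

    from nv_ingest.framework.orchestration.ray.primitives.ray_pipeline import (
        RayPipelineInterface,
        RayPipelineSubprocessInterface,
    )

    def drive_in_process(pipeline) -> None:
        # `pipeline` is assumed to be an already constructed RayPipeline
        handle = RayPipelineInterface(pipeline)
        handle.start(monitor_poll_interval=5.0, scaling_poll_interval=30.0)
        try:
            ...  # submit work / wait for results
        finally:
            handle.stop()  # stops stages, then ray.shutdown()

    def drive_subprocess(run_pipeline_forever) -> None:
        # `run_pipeline_forever` is a placeholder target that builds and runs a pipeline
        proc = multiprocessing.Process(target=run_pipeline_forever)
        proc.start()
        handle = RayPipelineSubprocessInterface(proc)  # start() is a no-op; process already runs
        try:
            ...  # interact with the running service
        finally:
            handle.stop()  # terminate(), escalating to SIGKILL of the process group if needed
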
@@ -91,6 +207,8 @@ class RayPipeline:
         # --- State ---
         # self.scaling_state: Dict[str, str] = {}
         self.prev_global_memory_usage: Optional[int] = None
+        self._state_lock: threading.Lock = threading.Lock()
+        self._stopping = False
 
         # --- Build Time Config & State ---
         # Use scaling_config for these
@@ -149,10 +267,17 @@
             actor_timeout=self.stats_config.actor_timeout_seconds,
             queue_timeout=self.stats_config.queue_timeout_seconds,
         )
+
         logger.info("RayStatsCollector initialized using StatsConfig.")
 
     # --- Accessor Methods for Stats Collector (and internal use) ---
 
+    def __del__(self):
+        try:
+            self.stop()
+        except Exception as e:
+            logger.error(f"Exception during RayPipeline cleanup: {e}")
+
     def get_stages_info(self) -> List[StageInfo]:
         """Returns a snapshot of the current stage information."""
         return self.topology.get_stages_info()
@@ -514,7 +639,9 @@
         """
         current_count = len(current_replicas)
         num_to_remove = current_count - target_count
-        logger.info(f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove}).")
+        logger.debug(
+            f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
+        )
 
         # Basic validation
         if num_to_remove <= 0:
@@ -562,7 +689,7 @@
             logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")
 
         total_attempted = len(actors_to_remove)
-        logger.info(
+        logger.debug(
             f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
             f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
         )
@@ -645,9 +772,6 @@
         # Activity check
         is_quiet = global_in_flight <= self.quiet_period_threshold
 
-        if is_quiet:
-            logger.info(f"Pipeline IS quiet. In-Flight: {global_in_flight} <= Threshold: {self.quiet_period_threshold}")
-
         return is_quiet
 
     def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
@@ -668,7 +792,6 @@
             return False
 
         # --- Trigger immediate stats collection via the collector instance ---
-        drain_stats = {}
         drain_success = False
         collection_error = None
 
@@ -687,19 +810,18 @@
                 if not collection_error
                 else f"Collection Error: {type(collection_error).__name__}"
             )
-            logger.info(
-                f"[DrainWait] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
+            logger.debug(
+                f"[Drain] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
             )
             last_in_flight = global_in_flight
 
             # --- Check for successful drain ---
             # Requires BOTH in-flight=0 AND the collection reporting it was successful
             if global_in_flight == 0 and drain_success and not collection_error:
-                logger.info(f"Pipeline confirmed drained (In-Flight=0) in {elapsed_time:.1f}s.")
                 return True
             elif global_in_flight == 0:  # Saw zero, but collection wasn't fully successful
                 logger.warning(
-                    "[DrainWait] In-Flight reached 0, but stats collection had errors/timeouts."
+                    "[Drain] In-Flight reached 0, but stats collection had errors/timeouts."
                     " Cannot confirm drain yet."
                 )
 
@@ -711,13 +833,12 @@
 
     def _execute_queue_flush(self) -> bool:
         """Executes queue flush, using topology for state and structure."""
-        if self.topology.get_is_flushing():  # Check topology state
-            logger.warning("Queue flush requested but already in progress. Ignoring.")
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology state
+            logger.warning("Queue flush requested but already in progress or pipeline is stopping. Ignoring.")
             return False
 
         # Set flushing state in topology
         self.topology.set_flushing(True)
-        logger.info("--- Starting Queue Flush ---")
         overall_success = False
         source_actors_paused = []
         pause_refs = []
@@ -732,7 +853,7 @@
         current_connections = self.topology.get_connections()
 
         # --- 1. Pause Source Stages (using snapshots) ---
-        logger.info("Pausing source stages...")
+        logger.debug("Pausing source stages...")
         pause_timeout = 60.0
         for stage in current_stages:
             if stage.is_source:
@@ -745,22 +866,22 @@
                 except Exception as e:
                     logger.error(f"Failed sending pause to {actor}: {e}")
         if pause_refs:
-            logger.info(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
+            logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
             try:
                 ray.get(pause_refs, timeout=pause_timeout)
-                logger.info(f"{len(pause_refs)} sources acknowledged pause.")
+                logger.debug(f"{len(pause_refs)} sources acknowledged pause.")
             except GetTimeoutError:
                 logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
             except Exception as e:
                 logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")
 
         # --- 2. Wait for Drain ---
-        logger.info("Waiting for pipeline to drain...")
+        logger.debug("Waiting for pipeline to drain...")
         if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
             raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")
 
         # --- 3. Create New Queues (using snapshot) ---
-        logger.info("Creating new replacement queues...")
+        logger.debug("Creating new replacement queues...")
         new_edge_queues_map = {}
         for queue_name, (_, queue_size) in current_edge_queues.items():
             try:
@@ -773,7 +894,7 @@
                 raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e
 
         # --- 4. Re-wire Actors to New Queues (using snapshots) ---
-        logger.info("Re-wiring actors to new queues...")
+        logger.debug("Re-wiring actors to new queues...")
         wiring_refs = []
         wiring_timeout = 120.0
         for from_stage_name, conns in current_connections.items():
@@ -809,7 +930,7 @@
             raise RuntimeError("Actor re-wiring failed.") from e
 
         # --- 5. Update Topology State (Commit Point) ---
-        logger.info("Committing new queues to pipeline topology.")
+        logger.debug("Committing new queues to pipeline topology.")
         self.topology.set_edge_queues(new_edge_queues_map)  # Commit the change
         overall_success = True
 
@@ -820,7 +941,7 @@
         finally:
             # --- 6. Resume Source Stages (Always attempt) ---
             if source_actors_paused:
-                logger.info(f"Attempting to resume {len(source_actors_paused)} source actors...")
+                logger.debug(f"Attempting to resume {len(source_actors_paused)} source actors...")
                 resume_timeout = 30.0
                 resume_refs = []
                 for actor in source_actors_paused:
@@ -829,10 +950,10 @@
                     except Exception as e:
                         logger.error(f"Failed sending resume to {actor}: {e}")
                 if resume_refs:
-                    logger.info(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
+                    logger.debug(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
                     try:
                         ray.get(resume_refs, timeout=resume_timeout)
-                        logger.info(f"{len(resume_refs)} sources resumed.")
+                        logger.debug(f"{len(resume_refs)} sources resumed.")
                     except GetTimeoutError:
                         logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
                     except Exception as e:
@@ -841,9 +962,6 @@
             # Update flush timestamp only on success
             if overall_success:
                 self._last_queue_flush_time = time.time()
-                logger.info("--- Queue Flush Completed Successfully ---")
-            else:
-                logger.error("--- Queue Flush Failed ---")
 
             # Reset flushing state in topology
             self.topology.set_flushing(False)
@@ -853,8 +971,9 @@
     def request_queue_flush(self, force: bool = False) -> None:
        """Requests a queue flush, checking topology state."""
        logger.info(f"Manual queue flush requested (force={force}).")
-        if self.topology.get_is_flushing():  # Check topology
-            logger.warning("Flush already in progress.")
+
+        if self.topology.get_is_flushing() or self._stopping:  # Check topology
+            logger.warning("Flush already in progress or pipeline is stopping.")
        return
        if force or self._is_pipeline_quiet():
            # Consider running _execute_queue_flush in a separate thread
@@ -974,7 +1093,7 @@
 
             if target_replica_count != current_count:
                 stages_needing_action.append((stage_name, target_replica_count))
-                logger.info(
+                logger.debug(
                     f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
                     f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
                 )
@@ -1016,69 +1135,80 @@
         completed = sum(1 for r in action_results.values() if r["status"] == "completed")
         errors = sum(1 for r in action_results.values() if r["status"] == "error")
         timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
-        logger.info(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
+        logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
 
     def _perform_scaling_and_maintenance(self) -> None:
         """Orchestrates scaling/maintenance using topology and stats collector."""
-        logger.debug("--- Performing Scaling & Maintenance Cycle ---")
+
+        if self._stopping:
+            logger.debug("Pipeline is stopping. Skipping scaling cycle.")
+            return
 
         if not self.dynamic_memory_scaling:
             logger.debug("Dynamic memory scaling disabled. Skipping cycle.")
             return
 
-        cycle_start_time = time.time()
-
-        # Check flushing state via topology
         if self.topology.get_is_flushing():
             logger.debug("Skipping scaling cycle: Queue flush in progress (topology state).")
             return
 
-        # --- Check for quietness for flushing (uses topology state via helper) ---
+        got_lock = self._state_lock.acquire(timeout=0.1)
+        if not got_lock:
+            logger.debug("Could not acquire lock for maintenance; skipping cycle.")
+            return
+
+        cycle_start_time = time.time()
         try:
+            if self._stopping:
+                logger.debug("Pipeline began stopping after acquiring lock. Skipping maintenance logic.")
+                return
+
+            logger.debug("--- Performing Scaling & Maintenance Cycle ---")
+
             if self._is_pipeline_quiet():
-                logger.info("Pipeline quiet, initiating queue flush.")
-                flush_success = self._execute_queue_flush()  # Uses topology internally
-                logger.info(f"Automatic queue flush completed. Success: {flush_success}")
-                return  # Skip scaling if flush occurred
-        except Exception as e:
-            logger.error(f"Error during quiet check or flush: {e}. Skipping cycle.", exc_info=True)
-            return
+                logger.info("[Drain] Pipeline quiet, initiating queue flush.")
+                flush_success = self._execute_queue_flush()
+                logger.info(f"[Drain] Automatic queue flush completed. Success: {flush_success}")
+                return
 
-        # --- Get & Validate Stats ---
-        current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
-            self.stats_collector.get_latest_stats()
-        )
+            # Fast return check if stopping occurred while flushing or checking flush status
+            if self._stopping:
+                return
 
-        last_update_age = time.time() - last_update_time
-        max_stats_age_for_scaling = max(15.0, self._stats_collection_interval_seconds)
-        if not current_stage_stats or not stats_were_successful or last_update_age > max_stats_age_for_scaling:
-            status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
-            logger.warning(
-                f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+            current_stage_stats, global_in_flight, last_update_time, stats_were_successful = (
+                self.stats_collector.get_latest_stats()
             )
-            return
 
-        # --- Gather Metrics (uses topology via helper) ---
-        current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
-        if not current_stage_metrics:
-            logger.error("[Scaling] Failed gather metrics. Skipping.")
-            return
+            last_update_age = time.time() - last_update_time
+            max_age = max(15.0, self._stats_collection_interval_seconds)
+            if not current_stage_stats or not stats_were_successful or last_update_age > max_age:
+                status = "No stats" if not current_stage_stats else "Failed" if not stats_were_successful else "Stale"
+                logger.warning(
+                    f"[Scaling] Cannot scale reliably: Stats {status} (Age: {last_update_age:.1f}s). Skipping cycle."
+                )
+                return
 
-        # --- Get Memory Usage ---
-        current_global_memory_mb = self._get_current_global_memory()
+            current_stage_metrics = self._gather_controller_metrics(current_stage_stats, global_in_flight)
+            if not current_stage_metrics:
+                logger.error("[Scaling] Failed to gather metrics. Skipping.")
+                return
 
-        # --- Calculate Scaling Adjustments (uses topology via helper) ---
-        final_adjustments = self._calculate_scaling_adjustments(
-            current_stage_metrics, global_in_flight, current_global_memory_mb
-        )
+            current_global_memory_mb = self._get_current_global_memory()
+            final_adjustments = self._calculate_scaling_adjustments(
+                current_stage_metrics, global_in_flight, current_global_memory_mb
+            )
+            self.prev_global_memory_usage = current_global_memory_mb
+            self._apply_scaling_actions(final_adjustments)
 
-        # --- Update Memory Usage *After* Decision ---
-        self.prev_global_memory_usage = current_global_memory_mb
+            logger.debug(
+                f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---"
+            )
 
-        # --- Apply Scaling Actions (uses topology via helper) ---
-        self._apply_scaling_actions(final_adjustments)
+        except Exception as e:  # noqa
+            logger.error("Exception during maintenance cycle", exc_info=True)
 
-        logger.debug(f"--- Scaling & Maintenance Cycle Complete (Duration: {time.time() - cycle_start_time:.2f}s) ---")
+        finally:
+            self._state_lock.release()
 
     # --- Lifecycle Methods for Monitoring/Scaling Threads ---
     def _scaling_loop(self, interval: float) -> None:
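
The refactored maintenance cycle acquires self._state_lock with a short timeout and re-checks self._stopping after winning it, while stop() (in the next hunk) sets the flag before taking the same lock. A standalone sketch of that guard pattern, with illustrative names only, not the module's API:

    # Sketch of the guard pattern: maintenance cycles skip when they cannot take the lock,
    # and stop() is safe to call more than once.
    import threading

    class MaintenanceGuard:
        def __init__(self) -> None:
            self._state_lock = threading.Lock()
            self._stopping = False

        def run_cycle(self) -> None:
            if self._stopping:
                return  # fast path: shutting down
            if not self._state_lock.acquire(timeout=0.1):
                return  # another cycle (or stop()) holds the lock; try again next interval
            try:
                if self._stopping:  # re-check after winning the lock
                    return
                ...  # scaling / flush work goes here
            finally:
                self._state_lock.release()

        def stop(self) -> None:
            if self._stopping:  # idempotent: second call is a no-op
                return
            self._stopping = True
            with self._state_lock:  # wait for any in-flight cycle to finish
                ...  # tear down threads, actors, queues
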
@@ -1149,39 +1279,44 @@
         """Stops background threads and actors (via topology)."""
         logger.info("Stopping pipeline...")
 
+        if self._stopping:
+            return
+        self._stopping = True
+
         # 1. Stop background threads first
-        self._stop_scaling()
-        self.stats_collector.stop()
-
-        # 2. Stop actors (using topology)
-        logger.debug("Stopping all stage actors...")
-        stop_refs_map: Dict[ray.ObjectRef, Any] = {}
-        actors_to_kill = []
-
-        # Get actors snapshot from topology
-        current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}
-
-        for stage_name, actors in current_actors.items():
-            for actor in actors:
-                try:
-                    stop_refs_map[actor.stop.remote()] = actor
-                except Exception as e:
-                    logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Will kill.")
-
-        if stop_refs_map:
-            stop_refs = list(stop_refs_map.keys())
-            logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
-            try:
-                ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
-                if not_ready:
-                    logger.warning(f"Timeout waiting for {len(not_ready)} actors to stop. Will kill.")
-                    actors_to_kill.extend(stop_refs_map.get(ref) for ref in not_ready if stop_refs_map.get(ref))
-                logger.info(f"{len(ready)} actors stopped via stop().")
-            except Exception as e:
-                logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
-                actors_to_kill.extend(a for a in stop_refs_map.values() if a not in actors_to_kill)  # Add all on error
-
-        # Clear runtime state in topology
-        self.topology.clear_runtime_state()
-
-        logger.info("Pipeline stopped.")
+        with self._state_lock:
+            self._stop_scaling()
+            self.stats_collector.stop()
+
+            # 2. Stop actors (using topology)
+            logger.debug("Stopping all stage actors...")
+            stop_refs_map: Dict[ray.ObjectRef, Any] = {}
+
+            # Get actors snapshot from topology
+            current_actors = {name: list(actors) for name, actors in self.topology.get_stage_actors().items()}
+
+            for stage_name, actors in current_actors.items():
+                for actor in actors:
+                    try:
+                        stop_refs_map[actor.stop.remote()] = actor
+                    except Exception as e:
+                        logger.warning(f"Error initiating stop for {actor} in {stage_name}: {e}. Skipping.")
+
+            if stop_refs_map:
+                stop_refs = list(stop_refs_map.keys())
+                logger.debug(f"Waiting up to 60s for {len(stop_refs)} actors to stop gracefully...")
+                try:
+                    ready, not_ready = ray.wait(stop_refs, num_returns=len(stop_refs), timeout=60.0)
+                    if not_ready:
+                        logger.warning(
+                            f"Timeout waiting for {len(not_ready)} actors to stop. Allowing Ray to clean up."
+                        )
+                    logger.info(f"{len(ready)} actors stopped via stop().")
+                except Exception as e:
+                    logger.error(f"Error during actor stop confirmation: {e}", exc_info=True)
+
+            # Clear runtime state in topology
+            self.topology.clear_runtime_state()
+            del self.topology
+
+            logger.info("Pipeline stopped.")