nv-ingest 2025.5.22.dev20250522__tar.gz → 2025.5.29.dev20250529__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Note: this release has been flagged as potentially problematic.


This version of nv-ingest might be problematic; consult the package's registry page for more details.

Files changed (108):
  1. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/PKG-INFO +5 -3
  2. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/main.py +3 -1
  3. nv_ingest-2025.5.29.dev20250529/nv_ingest/api/v1/metrics.py +29 -0
  4. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +20 -3
  5. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +146 -29
  6. nv_ingest-2025.5.29.dev20250529/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +82 -0
  7. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +1 -1
  8. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +33 -33
  9. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +7 -2
  10. nv_ingest-2025.5.29.dev20250529/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +376 -0
  11. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +15 -0
  12. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/PKG-INFO +5 -3
  13. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/SOURCES.txt +2 -0
  14. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/requires.txt +4 -2
  15. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/pyproject.toml +4 -2
  16. nv_ingest-2025.5.22.dev20250522/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -170
  17. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/LICENSE +0 -0
  18. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/MANIFEST.in +0 -0
  19. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/__init__.py +0 -0
  20. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/__init__.py +0 -0
  21. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/__init__.py +0 -0
  22. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/health.py +0 -0
  23. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/api/v1/ingest.py +0 -0
  24. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/__init__.py +0 -0
  25. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/__init__.py +0 -0
  26. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  27. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  28. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  29. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  30. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  31. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  32. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
  33. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  34. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  35. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  36. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  37. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  38. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
  39. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  40. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  41. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  42. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  43. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  44. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  45. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  46. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  47. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  48. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  49. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  50. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
  51. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  52. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  53. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  54. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  55. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  56. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  57. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  58. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  59. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  60. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  61. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  62. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  63. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  64. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
  65. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  66. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  67. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  68. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  69. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  70. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  71. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  72. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  73. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  74. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  75. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  76. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  77. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  78. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  79. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  80. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  81. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/__init__.py +0 -0
  82. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  83. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  84. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  85. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  86. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  87. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  88. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  89. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  90. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  91. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  92. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  93. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/__init__.py +0 -0
  94. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  95. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  96. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/__init__.py +0 -0
  97. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  98. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  99. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  100. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  101. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  102. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  103. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  104. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  105. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest/version.py +0 -0
  106. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/dependency_links.txt +0 -0
  107. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/nv_ingest.egg-info/top_level.txt +0 -0
  108. {nv_ingest-2025.5.22.dev20250522 → nv_ingest-2025.5.29.dev20250529}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.5.22.dev20250522
3
+ Version: 2025.5.29.dev20250529
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -225,7 +225,7 @@ Requires-Dist: httpx>=0.28.1
225
225
  Requires-Dist: isodate>=0.7.2
226
226
  Requires-Dist: langdetect>=1.0.9
227
227
  Requires-Dist: minio>=7.2.12
228
- Requires-Dist: openai>=1.57.1
228
+ Requires-Dist: openai>=1.82.0
229
229
  Requires-Dist: opentelemetry-api>=1.27.0
230
230
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
231
231
  Requires-Dist: opentelemetry-sdk>=1.27.0
@@ -239,6 +239,7 @@ Requires-Dist: build>=1.2.2
239
239
  Requires-Dist: python-docx>=1.1.2
240
240
  Requires-Dist: python-dotenv>=1.0.1
241
241
  Requires-Dist: python-pptx>=1.0.2
242
+ Requires-Dist: prometheus-client
242
243
  Requires-Dist: torch==2.4.1
243
244
  Requires-Dist: ray[all]>=2.37.0
244
245
  Requires-Dist: redis>=5.2.1
@@ -255,9 +256,10 @@ Requires-Dist: uvicorn
255
256
  Requires-Dist: pip
256
257
  Requires-Dist: llama-index-embeddings-nvidia
257
258
  Requires-Dist: opencv-python
258
- Requires-Dist: pymilvus>=2.5.0
259
+ Requires-Dist: pymilvus>=2.5.10
259
260
  Requires-Dist: pymilvus[bulk_writer,model]
260
261
  Requires-Dist: tritonclient
261
262
  Requires-Dist: nvidia-riva-client>=2.18.0
262
263
  Requires-Dist: unstructured-client
264
+ Requires-Dist: markitdown
263
265
  Dynamic: license-file
@@ -14,6 +14,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
14
14
 
15
15
  from .v1.health import router as HealthApiRouter
16
16
  from .v1.ingest import router as IngestApiRouter
17
+ from .v1.metrics import router as MetricsApiRouter
17
18
 
18
19
  logger = logging.getLogger(__name__)
19
20
 
@@ -21,7 +22,7 @@ logger = logging.getLogger(__name__)
21
22
  app = FastAPI(
22
23
  title="NV-Ingest Microservice",
23
24
  description="Service for ingesting heterogenous datatypes",
24
- version="25.3.0",
25
+ version="25.4.2",
25
26
  contact={
26
27
  "name": "NVIDIA Corporation",
27
28
  "url": "https://nvidia.com",
@@ -31,6 +32,7 @@ app = FastAPI(
31
32
 
32
33
  app.include_router(IngestApiRouter, prefix="/v1")
33
34
  app.include_router(HealthApiRouter, prefix="/v1/health")
35
+ app.include_router(MetricsApiRouter, prefix="/v1")
34
36
 
35
37
  # Set up the tracer provider and add a processor for exporting traces
36
38
  resource = Resource(attributes={"service.name": "nv-ingest"})
@@ -0,0 +1,29 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+ import logging
5
+
6
+ from fastapi import APIRouter, Response, status
7
+ from prometheus_client import Counter, Histogram, generate_latest, CONTENT_TYPE_LATEST
8
+
9
+ router = APIRouter()
10
+
11
+ # logger = logging.getLogger("uvicorn")
12
+ logger = logging.getLogger(__name__)
13
+
14
+ # Prometheus metrics
15
+ REQUEST_COUNT = Counter("http_requests_total", "Total HTTP Requests", ["method", "endpoint"])
16
+ REQUEST_LATENCY = Histogram("http_request_duration_seconds", "Request latency", ["method", "endpoint"])
17
+
18
+
19
+ @router.get(
20
+ "/metrics",
21
+ tags=["Health"],
22
+ summary="Provide prometheus formatted metrics for consumption",
23
+ description="""
24
+ Provide prometheus formatted metrics for consumption by a prometheus scraping server.
25
+ """,
26
+ status_code=status.HTTP_200_OK,
27
+ )
28
+ def metrics():
29
+ return Response(generate_latest(), media_type=CONTENT_TYPE_LATEST)
@@ -74,9 +74,26 @@ class PipelineTopology:
74
74
  self._start_cleanup_thread() # Start background cleanup on init
75
75
 
76
76
  def __del__(self):
77
- """Ensure cleanup thread is stopped when topology object is destroyed."""
78
- logger.debug("PipelineTopology destructor called, ensuring cleanup thread is stopped.")
79
- self._stop_cleanup_thread()
77
+ """Ensure cleanup thread is stopped and internal actor references are released."""
78
+ logger.debug("PipelineTopology destructor called. Cleaning up thread and actor references.")
79
+
80
+ # Stop the background cleanup thread
81
+ try:
82
+ self._stop_cleanup_thread()
83
+ except Exception as e:
84
+ logger.warning(f"Error stopping cleanup thread during __del__: {e}")
85
+
86
+ # Clear references to actor handles and shutdown futures
87
+ try:
88
+ self._stage_actors.clear()
89
+ self._edge_queues.clear()
90
+ self._scaling_state.clear()
91
+ self._stage_memory_overhead.clear()
92
+ self._pending_removal_actors.clear()
93
+ self._stages.clear()
94
+ self._connections.clear()
95
+ except Exception as e:
96
+ logger.warning(f"Error clearing internal state during __del__: {e}")
80
97
 
81
98
  # --- Lock Context Manager ---
82
99
  @contextlib.contextmanager
@@ -2,7 +2,11 @@
2
2
  # All rights reserved.
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
+ import multiprocessing
6
+ import os
7
+ import signal
5
8
  import threading
9
+ from abc import ABC, abstractmethod
6
10
  from collections import defaultdict
7
11
  from dataclasses import dataclass
8
12
 
@@ -24,6 +28,35 @@ from nv_ingest.framework.orchestration.ray.util.pipeline.pid_controller import P
24
28
  logger = logging.getLogger(__name__)
25
29
 
26
30
 
31
+ class PipelineInterface(ABC):
32
+ """
33
+ Abstract base class for pipeline implementations.
34
+
35
+ Any concrete pipeline must implement start and stop methods.
36
+ """
37
+
38
+ @abstractmethod
39
+ def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
40
+ """
41
+ Start the pipeline.
42
+
43
+ Parameters
44
+ ----------
45
+ monitor_poll_interval : float
46
+ Interval in seconds for monitoring poll (default: 5.0).
47
+ scaling_poll_interval : float
48
+ Interval in seconds for scaling decisions (default: 30.0).
49
+ """
50
+ pass
51
+
52
+ @abstractmethod
53
+ def stop(self) -> None:
54
+ """
55
+ Stop the pipeline and perform any necessary cleanup.
56
+ """
57
+ pass
58
+
59
+
27
60
  # --- Configuration Objects ---
28
61
 
29
62
 
@@ -62,7 +95,90 @@ class StatsConfig:
62
95
  queue_timeout_seconds: float = 2.0
63
96
 
64
97
 
65
- class RayPipeline:
98
+ class RayPipelineSubprocessInterface(PipelineInterface):
99
+ """
100
+ Pipeline interface implementation for a subprocess-based Ray pipeline.
101
+ """
102
+
103
+ def __init__(self, process: multiprocessing.Process):
104
+ """
105
+ Parameters
106
+ ----------
107
+ process : multiprocessing.Process
108
+ A handle to the running subprocess.
109
+ """
110
+ self._process: multiprocessing.Process = process
111
+
112
+ def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
113
+ """
114
+ Start is not supported because the subprocess is assumed to already be running.
115
+ """
116
+ pass
117
+
118
+ def stop(self) -> None:
119
+ """
120
+ Stops the subprocess pipeline. Tries terminate(), then escalates to SIGKILL on the process group if needed.
121
+ """
122
+ if not self._process.is_alive():
123
+ return
124
+
125
+ try:
126
+ self._process.terminate()
127
+ self._process.join(timeout=5.0)
128
+ except Exception as e:
129
+ logger.warning(f"Failed to terminate process cleanly: {e}")
130
+
131
+ if self._process.is_alive():
132
+ try:
133
+ pgid = os.getpgid(self._process.pid)
134
+ os.killpg(pgid, signal.SIGKILL)
135
+ except Exception as e:
136
+ logger.error(f"Failed to force-kill process group: {e}")
137
+ self._process.join(timeout=3.0)
138
+
139
+
140
+ class RayPipelineInterface(PipelineInterface):
141
+ """
142
+ Pipeline interface for an in-process RayPipeline instance.
143
+ """
144
+
145
+ def __init__(self, pipeline: "RayPipeline"):
146
+ """
147
+ Parameters
148
+ ----------
149
+ pipeline : RayPipeline
150
+ The instantiated pipeline to control.
151
+ """
152
+ self._pipeline = pipeline
153
+
154
+ def start(self, monitor_poll_interval: float = 5.0, scaling_poll_interval: float = 30.0) -> None:
155
+ """
156
+ Starts the RayPipeline.
157
+
158
+ Parameters
159
+ ----------
160
+ monitor_poll_interval : float
161
+ Unused here; provided for interface compatibility.
162
+ scaling_poll_interval : float
163
+ Unused here; provided for interface compatibility.
164
+ """
165
+ self._pipeline.start(monitor_poll_interval, scaling_poll_interval)
166
+
167
+ def stop(self) -> None:
168
+ """
169
+ Stops the RayPipeline and shuts down Ray.
170
+ """
171
+ self._pipeline.stop()
172
+
173
+ try:
174
+ import ray
175
+
176
+ ray.shutdown()
177
+ except Exception:
178
+ pass
179
+
180
+
181
+ class RayPipeline(PipelineInterface):
66
182
  """
67
183
  A structured pipeline supporting dynamic scaling and queue flushing.
68
184
  Uses PIDController and ResourceConstraintManager. Supports optional GUI display.
@@ -151,10 +267,17 @@ class RayPipeline:
151
267
  actor_timeout=self.stats_config.actor_timeout_seconds,
152
268
  queue_timeout=self.stats_config.queue_timeout_seconds,
153
269
  )
270
+
154
271
  logger.info("RayStatsCollector initialized using StatsConfig.")
155
272
 
156
273
  # --- Accessor Methods for Stats Collector (and internal use) ---
157
274
 
275
+ def __del__(self):
276
+ try:
277
+ self.stop()
278
+ except Exception as e:
279
+ logger.error(f"Exception during RayPipeline cleanup: {e}")
280
+
158
281
  def get_stages_info(self) -> List[StageInfo]:
159
282
  """Returns a snapshot of the current stage information."""
160
283
  return self.topology.get_stages_info()
@@ -516,7 +639,9 @@ class RayPipeline:
516
639
  """
517
640
  current_count = len(current_replicas)
518
641
  num_to_remove = current_count - target_count
519
- logger.info(f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove}).")
642
+ logger.debug(
643
+ f"[ScaleDown-{stage_name}] Scaling down from {current_count} to {target_count} (-{num_to_remove})."
644
+ )
520
645
 
521
646
  # Basic validation
522
647
  if num_to_remove <= 0:
@@ -564,7 +689,7 @@ class RayPipeline:
564
689
  logger.warning(f"[ScaleDown-{stage_name}] No actors successfully initiated stop for registration.")
565
690
 
566
691
  total_attempted = len(actors_to_remove)
567
- logger.info(
692
+ logger.debug(
568
693
  f"[ScaleDown-{stage_name}] Scale down initiation process complete for {total_attempted} actors "
569
694
  f"(Skipped/Failed Initiation: {stop_initiation_failures}). Topology cleanup will handle final removal."
570
695
  )
@@ -647,9 +772,6 @@ class RayPipeline:
647
772
  # Activity check
648
773
  is_quiet = global_in_flight <= self.quiet_period_threshold
649
774
 
650
- if is_quiet:
651
- logger.info(f"Pipeline IS quiet. In-Flight: {global_in_flight} <= Threshold: {self.quiet_period_threshold}")
652
-
653
775
  return is_quiet
654
776
 
655
777
  def _wait_for_pipeline_drain(self, timeout_seconds: int) -> bool:
@@ -670,7 +792,6 @@ class RayPipeline:
670
792
  return False
671
793
 
672
794
  # --- Trigger immediate stats collection via the collector instance ---
673
- drain_stats = {}
674
795
  drain_success = False
675
796
  collection_error = None
676
797
 
@@ -689,19 +810,18 @@ class RayPipeline:
689
810
  if not collection_error
690
811
  else f"Collection Error: {type(collection_error).__name__}"
691
812
  )
692
- logger.info(
693
- f"[DrainWait] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
813
+ logger.debug(
814
+ f"[Drain] Check at {elapsed_time:.1f}s: Global In-Flight={global_in_flight} ({status_msg})"
694
815
  )
695
816
  last_in_flight = global_in_flight
696
817
 
697
818
  # --- Check for successful drain ---
698
819
  # Requires BOTH in-flight=0 AND the collection reporting it was successful
699
820
  if global_in_flight == 0 and drain_success and not collection_error:
700
- logger.info(f"Pipeline confirmed drained (In-Flight=0) in {elapsed_time:.1f}s.")
701
821
  return True
702
822
  elif global_in_flight == 0: # Saw zero, but collection wasn't fully successful
703
823
  logger.warning(
704
- "[DrainWait] In-Flight reached 0, but stats collection had errors/timeouts."
824
+ "[Drain] In-Flight reached 0, but stats collection had errors/timeouts."
705
825
  " Cannot confirm drain yet."
706
826
  )
707
827
 
@@ -719,7 +839,6 @@ class RayPipeline:
719
839
 
720
840
  # Set flushing state in topology
721
841
  self.topology.set_flushing(True)
722
- logger.info("--- Starting Queue Flush ---")
723
842
  overall_success = False
724
843
  source_actors_paused = []
725
844
  pause_refs = []
@@ -734,7 +853,7 @@ class RayPipeline:
734
853
  current_connections = self.topology.get_connections()
735
854
 
736
855
  # --- 1. Pause Source Stages (using snapshots) ---
737
- logger.info("Pausing source stages...")
856
+ logger.debug("Pausing source stages...")
738
857
  pause_timeout = 60.0
739
858
  for stage in current_stages:
740
859
  if stage.is_source:
@@ -747,22 +866,22 @@ class RayPipeline:
747
866
  except Exception as e:
748
867
  logger.error(f"Failed sending pause to {actor}: {e}")
749
868
  if pause_refs:
750
- logger.info(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
869
+ logger.debug(f"Waiting up to {pause_timeout}s for {len(pause_refs)} sources to pause...")
751
870
  try:
752
871
  ray.get(pause_refs, timeout=pause_timeout)
753
- logger.info(f"{len(pause_refs)} sources acknowledged pause.")
872
+ logger.debug(f"{len(pause_refs)} sources acknowledged pause.")
754
873
  except GetTimeoutError:
755
874
  logger.warning(f"Timeout waiting for {len(pause_refs)} sources to pause.")
756
875
  except Exception as e:
757
876
  logger.error(f"Error waiting for sources pause: {e}. Proceeding cautiously.")
758
877
 
759
878
  # --- 2. Wait for Drain ---
760
- logger.info("Waiting for pipeline to drain...")
879
+ logger.debug("Waiting for pipeline to drain...")
761
880
  if not self._wait_for_pipeline_drain(self.queue_flush_drain_timeout_seconds):
762
881
  raise RuntimeError("Pipeline drain failed or timed out, aborting flush.")
763
882
 
764
883
  # --- 3. Create New Queues (using snapshot) ---
765
- logger.info("Creating new replacement queues...")
884
+ logger.debug("Creating new replacement queues...")
766
885
  new_edge_queues_map = {}
767
886
  for queue_name, (_, queue_size) in current_edge_queues.items():
768
887
  try:
@@ -775,7 +894,7 @@ class RayPipeline:
775
894
  raise RuntimeError(f"Failed to create new queue '{queue_name}'.") from e
776
895
 
777
896
  # --- 4. Re-wire Actors to New Queues (using snapshots) ---
778
- logger.info("Re-wiring actors to new queues...")
897
+ logger.debug("Re-wiring actors to new queues...")
779
898
  wiring_refs = []
780
899
  wiring_timeout = 120.0
781
900
  for from_stage_name, conns in current_connections.items():
@@ -811,7 +930,7 @@ class RayPipeline:
811
930
  raise RuntimeError("Actor re-wiring failed.") from e
812
931
 
813
932
  # --- 5. Update Topology State (Commit Point) ---
814
- logger.info("Committing new queues to pipeline topology.")
933
+ logger.debug("Committing new queues to pipeline topology.")
815
934
  self.topology.set_edge_queues(new_edge_queues_map) # Commit the change
816
935
  overall_success = True
817
936
 
@@ -822,7 +941,7 @@ class RayPipeline:
822
941
  finally:
823
942
  # --- 6. Resume Source Stages (Always attempt) ---
824
943
  if source_actors_paused:
825
- logger.info(f"Attempting to resume {len(source_actors_paused)} source actors...")
944
+ logger.debug(f"Attempting to resume {len(source_actors_paused)} source actors...")
826
945
  resume_timeout = 30.0
827
946
  resume_refs = []
828
947
  for actor in source_actors_paused:
@@ -831,10 +950,10 @@ class RayPipeline:
831
950
  except Exception as e:
832
951
  logger.error(f"Failed sending resume to {actor}: {e}")
833
952
  if resume_refs:
834
- logger.info(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
953
+ logger.debug(f"Waiting up to {resume_timeout}s for {len(resume_refs)} actors to resume...")
835
954
  try:
836
955
  ray.get(resume_refs, timeout=resume_timeout)
837
- logger.info(f"{len(resume_refs)} sources resumed.")
956
+ logger.debug(f"{len(resume_refs)} sources resumed.")
838
957
  except GetTimeoutError:
839
958
  logger.warning(f"Timeout waiting for {len(resume_refs)} sources to resume.")
840
959
  except Exception as e:
@@ -843,9 +962,6 @@ class RayPipeline:
843
962
  # Update flush timestamp only on success
844
963
  if overall_success:
845
964
  self._last_queue_flush_time = time.time()
846
- logger.info("--- Queue Flush Completed Successfully ---")
847
- else:
848
- logger.error("--- Queue Flush Failed ---")
849
965
 
850
966
  # Reset flushing state in topology
851
967
  self.topology.set_flushing(False)
@@ -977,7 +1093,7 @@ class RayPipeline:
977
1093
 
978
1094
  if target_replica_count != current_count:
979
1095
  stages_needing_action.append((stage_name, target_replica_count))
980
- logger.info(
1096
+ logger.debug(
981
1097
  f"[ScalingApply-{stage_name}] Action: Current={current_count}, "
982
1098
  f"Target={target_replica_count} (Min={stage_info.min_replicas}, Max={stage_info.max_replicas})"
983
1099
  )
@@ -1019,7 +1135,7 @@ class RayPipeline:
1019
1135
  completed = sum(1 for r in action_results.values() if r["status"] == "completed")
1020
1136
  errors = sum(1 for r in action_results.values() if r["status"] == "error")
1021
1137
  timeouts = sum(1 for r in action_results.values() if r["status"] == "timeout")
1022
- logger.info(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
1138
+ logger.debug(f"[ScalingApply] Summary: {completed} completed, {errors} errors, {timeouts} timeouts.")
1023
1139
 
1024
1140
  def _perform_scaling_and_maintenance(self) -> None:
1025
1141
  """Orchestrates scaling/maintenance using topology and stats collector."""
@@ -1050,9 +1166,9 @@ class RayPipeline:
1050
1166
  logger.debug("--- Performing Scaling & Maintenance Cycle ---")
1051
1167
 
1052
1168
  if self._is_pipeline_quiet():
1053
- logger.info("Pipeline quiet, initiating queue flush.")
1169
+ logger.info("[Drain] Pipeline quiet, initiating queue flush.")
1054
1170
  flush_success = self._execute_queue_flush()
1055
- logger.info(f"Automatic queue flush completed. Success: {flush_success}")
1171
+ logger.info(f"[Drain] Automatic queue flush completed. Success: {flush_success}")
1056
1172
  return
1057
1173
 
1058
1174
  # Fast return check if stopping occurred while flushing or checking flush status
@@ -1201,5 +1317,6 @@ class RayPipeline:
1201
1317
 
1202
1318
  # Clear runtime state in topology
1203
1319
  self.topology.clear_runtime_state()
1320
+ del self.topology
1204
1321
 
1205
1322
  logger.info("Pipeline stopped.")
@@ -0,0 +1,82 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+
6
+ import logging
7
+
8
+ import ray
9
+
10
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
11
+ from nv_ingest.framework.util.flow_control import filter_by_task
12
+ from nv_ingest_api.internal.extract.html.html_extractor import extract_markdown_from_html_internal
13
+ from nv_ingest_api.internal.primitives.ingest_control_message import remove_task_by_type, IngestControlMessage
14
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
15
+ from nv_ingest_api.internal.schemas.extract.extract_html_schema import HtmlExtractorSchema
16
+ from nv_ingest_api.util.exception_handlers.decorators import (
17
+ nv_ingest_node_failure_try_except,
18
+ )
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
+ @ray.remote
24
+ class HtmlExtractorStage(RayActorStage):
25
+ """
26
+ A Ray actor stage that extracts text in markdown format from html content.
27
+
28
+ It expects an IngestControlMessage containing a DataFrame with html content. It then:
29
+ 1. Removes the "html_content_extract" task from the message.
30
+ 2. Calls the html extraction logic (via extract_markdown_from_html_internal) using a validated configuration.
31
+ 3. Updates the message payload with the extracted text DataFrame.
32
+ """
33
+
34
+ def __init__(self, config: HtmlExtractorSchema) -> None:
35
+ super().__init__(config, log_to_stdout=False)
36
+ try:
37
+ self.validated_config = config
38
+ self._logger.info("HtmlExtractorStage configuration validated successfully.")
39
+ except Exception as e:
40
+ self._logger.exception(f"Error validating Html Extractor config: {e}")
41
+ raise
42
+
43
+ @traceable("html_extractor")
44
+ @filter_by_task(required_tasks=[("extract", {"document_type": "html"})])
45
+ @nv_ingest_node_failure_try_except(annotation_id="html_extractor", raise_on_failure=False)
46
+ def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
47
+ """
48
+ Process the control message by extracting content from html.
49
+
50
+ Parameters
51
+ ----------
52
+ control_message : IngestControlMessage
53
+ The message containing a DataFrame payload with html content.
54
+
55
+ Returns
56
+ -------
57
+ IngestControlMessage
58
+ The updated message with extracted content.
59
+ """
60
+ self._logger.debug("HtmlExtractorStage.on_data: Starting html extraction process.")
61
+
62
+ # Extract the DataFrame payload.
63
+ df_ledger = control_message.payload()
64
+ self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
65
+
66
+ # Remove the "html_content_extract" task from the message to obtain task-specific configuration.
67
+ task_config = remove_task_by_type(control_message, "extract")
68
+ self._logger.debug("Extracted task config: %s", task_config)
69
+
70
+ # Perform html content extraction.
71
+ new_df, extraction_info = extract_markdown_from_html_internal(
72
+ df_extraction_ledger=df_ledger,
73
+ task_config=task_config,
74
+ extraction_config=self.validated_config,
75
+ execution_trace_log=None,
76
+ )
77
+
78
+ # Update the message payload with the extracted text DataFrame.
79
+ control_message.payload(new_df)
80
+ control_message.set_metadata("html_extraction_info", extraction_info)
81
+
82
+ return control_message
@@ -495,7 +495,7 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
495
495
  server.serve_forever()
496
496
 
497
497
  p = multiprocessing.Process(target=broker_server)
498
- p.daemon = True
498
+ p.daemon = False
499
499
  p.start()
500
500
  logger.info(f"Started SimpleMessageBroker server in separate process on port {broker_client['port']}")
501
501