nv-ingest 2025.6.2.dev20250602__tar.gz → 2025.6.24.dev20250625__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (108) hide show
  1. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/PKG-INFO +4 -4
  2. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +2 -2
  3. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +2 -2
  4. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +1 -1
  5. nv_ingest-2025.6.24.dev20250625/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +161 -0
  6. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +4 -0
  7. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +1 -1
  8. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/version.py +0 -8
  9. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest.egg-info/PKG-INFO +4 -4
  10. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest.egg-info/requires.txt +3 -3
  11. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/pyproject.toml +3 -3
  12. nv_ingest-2025.6.2.dev20250602/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -97
  13. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/LICENSE +0 -0
  14. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/MANIFEST.in +0 -0
  15. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/__init__.py +0 -0
  16. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/api/__init__.py +0 -0
  17. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/api/main.py +0 -0
  18. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/api/v1/__init__.py +0 -0
  19. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/api/v1/health.py +0 -0
  20. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/api/v1/ingest.py +0 -0
  21. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/api/v1/metrics.py +0 -0
  22. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/__init__.py +0 -0
  23. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/__init__.py +0 -0
  24. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  25. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  26. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  27. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  28. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  29. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  30. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  31. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  32. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  33. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  34. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  35. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
  36. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  37. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  38. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  39. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  40. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  41. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
  42. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  43. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  44. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  45. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  46. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  47. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  48. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  49. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  50. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  51. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  52. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  53. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  54. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  55. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  56. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  57. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  58. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  59. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  60. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
  61. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  62. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
  63. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  64. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  65. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  66. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  67. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  68. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  69. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  70. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  71. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  72. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  73. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  74. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  75. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  76. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
  77. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -0
  78. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  79. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  80. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  81. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/__init__.py +0 -0
  82. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  83. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  84. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  85. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  86. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  87. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  88. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  89. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  90. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  91. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  92. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  93. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/__init__.py +0 -0
  94. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  95. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  96. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/service/__init__.py +0 -0
  97. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  98. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  99. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  100. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  101. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  102. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  103. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  104. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  105. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest.egg-info/SOURCES.txt +0 -0
  106. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest.egg-info/dependency_links.txt +0 -0
  107. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/nv_ingest.egg-info/top_level.txt +0 -0
  108. {nv_ingest-2025.6.2.dev20250602 → nv_ingest-2025.6.24.dev20250625}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.6.2.dev20250602
3
+ Version: 2025.6.24.dev20250625
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -240,13 +240,13 @@ Requires-Dist: python-docx>=1.1.2
240
240
  Requires-Dist: python-dotenv>=1.0.1
241
241
  Requires-Dist: python-pptx>=1.0.2
242
242
  Requires-Dist: prometheus-client
243
- Requires-Dist: torch==2.4.1
243
+ Requires-Dist: torch>=2.4.1
244
244
  Requires-Dist: ray[all]>=2.37.0
245
245
  Requires-Dist: redis>=5.2.1
246
246
  Requires-Dist: requests>=2.28.2
247
247
  Requires-Dist: scikit-learn>=1.6.0
248
248
  Requires-Dist: scipy>=1.15.1
249
- Requires-Dist: setuptools>=58.2.0
249
+ Requires-Dist: setuptools>=78.1.1
250
250
  Requires-Dist: tabulate>=0.9.0
251
251
  Requires-Dist: torchvision
252
252
  Requires-Dist: torchaudio
@@ -259,7 +259,7 @@ Requires-Dist: opencv-python
259
259
  Requires-Dist: pymilvus>=2.5.10
260
260
  Requires-Dist: pymilvus[bulk_writer,model]
261
261
  Requires-Dist: tritonclient
262
- Requires-Dist: nvidia-riva-client>=2.18.0
262
+ Requires-Dist: nvidia-riva-client==2.20.0
263
263
  Requires-Dist: unstructured-client
264
264
  Requires-Dist: markitdown
265
265
  Dynamic: license-file
@@ -155,7 +155,7 @@ if __name__ == "__main__":
155
155
  logger.info("Environment variables set.")
156
156
 
157
157
  image_caption_endpoint_url = "https://integrate.api.nvidia.com/v1/chat/completions"
158
- image_caption_model_name = "meta/llama-3.2-11b-vision-instruct"
158
+ model_name = "meta/llama-3.2-11b-vision-instruct"
159
159
  yolox_grpc, yolox_http, yolox_auth, yolox_protocol = get_nim_service("yolox")
160
160
  (
161
161
  yolox_table_structure_grpc,
@@ -228,7 +228,7 @@ if __name__ == "__main__":
228
228
  image_caption_config = {
229
229
  "api_key": yolox_auth,
230
230
  "endpoint_url": image_caption_endpoint_url,
231
- "image_caption_model_name": image_caption_model_name,
231
+ "model_name": model_name,
232
232
  "prompt": "Caption the content of this image:",
233
233
  }
234
234
  logger.info("Service configuration retrieved from get_nim_service and environment variables.")
@@ -555,7 +555,7 @@ class PipelineTopology:
555
555
  return None
556
556
 
557
557
  def get_connections(self) -> Dict[str, List[Tuple[str, int]]]:
558
- """Returns a shallow copy of the connections dictionary."""
558
+ """Returns a shallow copy of the connection dictionary."""
559
559
  with self._lock:
560
560
  # Shallow copy is usually sufficient here as tuples are immutable
561
561
  return self._connections.copy()
@@ -571,7 +571,7 @@ class PipelineTopology:
571
571
  return len(self._stage_actors.get(stage_name, []))
572
572
 
573
573
  def get_edge_queues(self) -> Dict[str, Tuple[Any, int]]:
574
- """Returns a shallow copy of the edge queues dictionary."""
574
+ """Returns a shallow copy of the edge queues' dictionary."""
575
575
  with self._lock:
576
576
  return self._edge_queues.copy()
577
577
 
@@ -40,7 +40,7 @@ class RayStatsCollector:
40
40
  - `get_edge_queues() -> Dict[str, Tuple[Any, int]]`
41
41
  These methods should return snapshots suitable for iteration.
42
42
  interval : float, optional
43
- The interval in seconds between stats collection attempts, by default 5.0.
43
+ The interval in seconds between stat collection attempts, by default 5.0.
44
44
  actor_timeout : float, optional
45
45
  Timeout in seconds for waiting for stats from a single actor, by default 5.0.
46
46
  queue_timeout : float, optional
@@ -0,0 +1,161 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ from datetime import datetime
6
+ import logging
7
+ import pandas as pd
8
+ from typing import Any
9
+ from pydantic import BaseModel
10
+ import ray
11
+
12
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
13
+ from nv_ingest_api.internal.enums.common import (
14
+ DocumentTypeEnum,
15
+ ContentTypeEnum,
16
+ AccessLevelEnum,
17
+ TextTypeEnum,
18
+ LanguageEnum,
19
+ )
20
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
21
+ from nv_ingest_api.internal.schemas.meta.metadata_schema import ContentHierarchySchema
22
+ from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
23
+ from nv_ingest_api.util.exception_handlers.decorators import (
24
+ nv_ingest_node_failure_try_except,
25
+ )
26
+
27
+ # logging.basicConfig(level=logging.DEBUG)
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ @ray.remote
32
+ class MetadataInjectionStage(RayActorStage):
33
+ """
34
+ A Ray actor stage that performs metadata injection on IngestControlMessages.
35
+
36
+ This stage iterates over the rows of the DataFrame payload, checks if metadata
37
+ injection is required, and if so, injects the appropriate metadata.
38
+ """
39
+
40
+ def __init__(self, config: BaseModel) -> None:
41
+ # Call the base initializer to set attributes like self._running.
42
+ super().__init__(config)
43
+ # Additional initialization can be added here if necessary.
44
+ logger.info("MetadataInjectionStage initialized with config: %s", config)
45
+
46
+ @traceable("metadata_injector")
47
+ @nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
48
+ def on_data(self, message: Any) -> Any:
49
+ """
50
+ Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
51
+
52
+ Parameters
53
+ ----------
54
+ message : IngestControlMessage
55
+ The incoming message containing the payload DataFrame.
56
+
57
+ Returns
58
+ -------
59
+ IngestControlMessage
60
+ The message with updated metadata if injection was required.
61
+ """
62
+ df = message.payload()
63
+ update_required = False
64
+ rows = []
65
+ logger.info("Starting metadata injection on DataFrame with %d rows", len(df))
66
+
67
+ for _, row in df.iterrows():
68
+ try:
69
+ # Convert document type to content type using enums.
70
+ content_type = doc_type_to_content_type(DocumentTypeEnum(row["document_type"]))
71
+ # Check if metadata is missing or doesn't contain 'content'
72
+ if (
73
+ "metadata" not in row
74
+ or not isinstance(row["metadata"], dict)
75
+ or "content" not in row["metadata"].keys()
76
+ ):
77
+ update_required = True
78
+
79
+ # Initialize default structures based on MetaDataSchema
80
+ default_source_metadata = {
81
+ "source_id": row.get("source_id"),
82
+ "source_name": row.get("source_name"),
83
+ "source_type": row["document_type"],
84
+ "source_location": "",
85
+ "collection_id": "",
86
+ "date_created": datetime.now().isoformat(),
87
+ "last_modified": datetime.now().isoformat(),
88
+ "summary": "",
89
+ "partition_id": -1,
90
+ "access_level": AccessLevelEnum.UNKNOWN.value,
91
+ }
92
+
93
+ default_content_metadata = {
94
+ "type": content_type.name.lower(),
95
+ "page_number": -1,
96
+ "description": "",
97
+ "hierarchy": ContentHierarchySchema().model_dump(),
98
+ "subtype": "",
99
+ "start_time": -1,
100
+ "end_time": -1,
101
+ }
102
+
103
+ default_audio_metadata = None
104
+ if content_type == ContentTypeEnum.AUDIO:
105
+ default_audio_metadata = {
106
+ "audio_type": row["document_type"],
107
+ "audio_transcript": "",
108
+ }
109
+
110
+ default_image_metadata = None
111
+ if content_type == ContentTypeEnum.IMAGE:
112
+ default_image_metadata = {
113
+ "image_type": row["document_type"],
114
+ "structured_image_type": ContentTypeEnum.NONE.value,
115
+ "caption": "",
116
+ "text": "",
117
+ "image_location": (0, 0, 0, 0),
118
+ "image_location_max_dimensions": (0, 0),
119
+ "uploaded_image_url": "",
120
+ "width": 0,
121
+ "height": 0,
122
+ }
123
+
124
+ default_text_metadata = None
125
+ if content_type == ContentTypeEnum.TEXT:
126
+ default_text_metadata = {
127
+ "text_type": TextTypeEnum.DOCUMENT.value,
128
+ "summary": "",
129
+ "keywords": "",
130
+ "language": LanguageEnum.UNKNOWN.value,
131
+ "text_location": (0, 0, 0, 0),
132
+ "text_location_max_dimensions": (0, 0, 0, 0),
133
+ }
134
+
135
+ row["metadata"] = {
136
+ "content": row["content"],
137
+ "content_metadata": default_content_metadata,
138
+ "error_metadata": None,
139
+ "audio_metadata": default_audio_metadata,
140
+ "image_metadata": default_image_metadata,
141
+ "source_metadata": default_source_metadata,
142
+ "text_metadata": default_text_metadata,
143
+ }
144
+ logger.info(
145
+ f"METADATA_INJECTOR_DEBUG: Rebuilt metadata for source_id='{row.get('source_id', 'N/A')}'. "
146
+ f"Metadata keys: {list(row['metadata'].keys())}."
147
+ f"'content' present: {'content' in row['metadata']}"
148
+ )
149
+ except Exception as inner_e:
150
+ logger.exception("Failed to process row during metadata injection")
151
+ raise inner_e
152
+ rows.append(row)
153
+
154
+ if update_required:
155
+ docs = pd.DataFrame(rows)
156
+ message.payload(docs)
157
+ logger.info("Metadata injection updated payload with %d rows", len(docs))
158
+ else:
159
+ logger.info("No metadata update was necessary during metadata injection")
160
+
161
+ return message
@@ -331,6 +331,10 @@ def run_pipeline(
331
331
  """
332
332
  if run_in_subprocess:
333
333
  logger.info("Launching pipeline in Python subprocess using multiprocessing.")
334
+ if (ingest_config.ngc_api_key is None or ingest_config.ngc_api_key == "") and (
335
+ ingest_config.nvidia_build_api_key is None or ingest_config.nvidia_build_api_key == ""
336
+ ):
337
+ logger.warning("NGC_API_KEY or NVIDIA_BUILD_API_KEY are not set. NIM Related functions will not work.")
334
338
 
335
339
  ctx = multiprocessing.get_context("fork")
336
340
  process = ctx.Process(
@@ -479,7 +479,7 @@ def add_image_caption_stage(pipeline, default_cpu_count, stage_name="image_capti
479
479
  **{
480
480
  "api_key": auth_token,
481
481
  "endpoint_url": endpoint_url,
482
- "image_caption_model_name": model_name,
482
+ "model_name": model_name,
483
483
  "prompt": "Caption the content of this image:",
484
484
  }
485
485
  )
@@ -5,7 +5,6 @@
5
5
 
6
6
  import datetime
7
7
  import os
8
- import re
9
8
 
10
9
 
11
10
  def get_version():
@@ -16,13 +15,6 @@ def get_version():
16
15
  if not version:
17
16
  version = f"{datetime.datetime.now().strftime('%Y.%m.%d')}"
18
17
 
19
- # We only check this for dev, we assume for release the user knows what they are doing
20
- if release_type != "release":
21
- # Ensure the version is PEP 440 compatible
22
- pep440_regex = r"^\d{4}\.\d{1,2}\.\d{1,2}$"
23
- if not re.match(pep440_regex, version):
24
- raise ValueError(f"Version '{version}' is not PEP 440 compatible")
25
-
26
18
  # Construct the final version string
27
19
  if release_type == "dev":
28
20
  # If rev is not specified and defaults to 0 lets create a more meaningful development
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.6.2.dev20250602
3
+ Version: 2025.6.24.dev20250625
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -240,13 +240,13 @@ Requires-Dist: python-docx>=1.1.2
240
240
  Requires-Dist: python-dotenv>=1.0.1
241
241
  Requires-Dist: python-pptx>=1.0.2
242
242
  Requires-Dist: prometheus-client
243
- Requires-Dist: torch==2.4.1
243
+ Requires-Dist: torch>=2.4.1
244
244
  Requires-Dist: ray[all]>=2.37.0
245
245
  Requires-Dist: redis>=5.2.1
246
246
  Requires-Dist: requests>=2.28.2
247
247
  Requires-Dist: scikit-learn>=1.6.0
248
248
  Requires-Dist: scipy>=1.15.1
249
- Requires-Dist: setuptools>=58.2.0
249
+ Requires-Dist: setuptools>=78.1.1
250
250
  Requires-Dist: tabulate>=0.9.0
251
251
  Requires-Dist: torchvision
252
252
  Requires-Dist: torchaudio
@@ -259,7 +259,7 @@ Requires-Dist: opencv-python
259
259
  Requires-Dist: pymilvus>=2.5.10
260
260
  Requires-Dist: pymilvus[bulk_writer,model]
261
261
  Requires-Dist: tritonclient
262
- Requires-Dist: nvidia-riva-client>=2.18.0
262
+ Requires-Dist: nvidia-riva-client==2.20.0
263
263
  Requires-Dist: unstructured-client
264
264
  Requires-Dist: markitdown
265
265
  Dynamic: license-file
@@ -25,13 +25,13 @@ python-docx>=1.1.2
25
25
  python-dotenv>=1.0.1
26
26
  python-pptx>=1.0.2
27
27
  prometheus-client
28
- torch==2.4.1
28
+ torch>=2.4.1
29
29
  ray[all]>=2.37.0
30
30
  redis>=5.2.1
31
31
  requests>=2.28.2
32
32
  scikit-learn>=1.6.0
33
33
  scipy>=1.15.1
34
- setuptools>=58.2.0
34
+ setuptools>=78.1.1
35
35
  tabulate>=0.9.0
36
36
  torchvision
37
37
  torchaudio
@@ -44,6 +44,6 @@ opencv-python
44
44
  pymilvus>=2.5.10
45
45
  pymilvus[bulk_writer,model]
46
46
  tritonclient
47
- nvidia-riva-client>=2.18.0
47
+ nvidia-riva-client==2.20.0
48
48
  unstructured-client
49
49
  markitdown
@@ -44,13 +44,13 @@ dependencies = [
44
44
  "python-dotenv>=1.0.1",
45
45
  "python-pptx>=1.0.2",
46
46
  "prometheus-client",
47
- "torch==2.4.1",
47
+ "torch>=2.4.1",
48
48
  "ray[all]>=2.37.0",
49
49
  "redis>=5.2.1",
50
50
  "requests>=2.28.2",
51
51
  "scikit-learn>=1.6.0",
52
52
  "scipy>=1.15.1",
53
- "setuptools>=58.2.0",
53
+ "setuptools>=78.1.1",
54
54
  "tabulate>=0.9.0",
55
55
  "torchvision",
56
56
  "torchaudio",
@@ -63,7 +63,7 @@ dependencies = [
63
63
  "pymilvus>=2.5.10",
64
64
  "pymilvus[bulk_writer, model]",
65
65
  "tritonclient",
66
- "nvidia-riva-client>=2.18.0",
66
+ "nvidia-riva-client==2.20.0",
67
67
  "unstructured-client",
68
68
  "markitdown",
69
69
  ]
@@ -1,97 +0,0 @@
1
- # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
- # All rights reserved.
3
- # SPDX-License-Identifier: Apache-2.0
4
-
5
- import logging
6
- import pandas as pd
7
- from typing import Any
8
- from pydantic import BaseModel
9
- import ray
10
-
11
- from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
12
- from nv_ingest_api.internal.enums.common import DocumentTypeEnum, ContentTypeEnum
13
- from nv_ingest_api.internal.primitives.tracing.tagging import traceable
14
- from nv_ingest_api.util.converters.type_mappings import doc_type_to_content_type
15
- from nv_ingest_api.util.exception_handlers.decorators import (
16
- nv_ingest_node_failure_try_except,
17
- )
18
-
19
- # logging.basicConfig(level=logging.DEBUG)
20
- logger = logging.getLogger(__name__)
21
-
22
-
23
- @ray.remote
24
- class MetadataInjectionStage(RayActorStage):
25
- """
26
- A Ray actor stage that performs metadata injection on IngestControlMessages.
27
-
28
- This stage iterates over the rows of the DataFrame payload, checks if metadata
29
- injection is required, and if so, injects the appropriate metadata.
30
- """
31
-
32
- def __init__(self, config: BaseModel) -> None:
33
- # Call the base initializer to set attributes like self._running.
34
- super().__init__(config)
35
- # Additional initialization can be added here if necessary.
36
- logger.info("MetadataInjectionStage initialized with config: %s", config)
37
-
38
- @traceable("metadata_injector")
39
- @nv_ingest_node_failure_try_except(annotation_id="metadata_injector", raise_on_failure=False)
40
- def on_data(self, message: Any) -> Any:
41
- """
42
- Process an incoming IngestControlMessage by injecting metadata into its DataFrame payload.
43
-
44
- Parameters
45
- ----------
46
- message : IngestControlMessage
47
- The incoming message containing the payload DataFrame.
48
-
49
- Returns
50
- -------
51
- IngestControlMessage
52
- The message with updated metadata if injection was required.
53
- """
54
- df = message.payload()
55
- update_required = False
56
- rows = []
57
- logger.info("Starting metadata injection on DataFrame with %d rows", len(df))
58
-
59
- for _, row in df.iterrows():
60
- try:
61
- # Convert document type to content type using enums.
62
- content_type = doc_type_to_content_type(DocumentTypeEnum(row["document_type"]))
63
- # Check if metadata is missing or doesn't contain 'content'
64
- if "metadata" not in row or not isinstance(row["metadata"], dict) or "content" not in row["metadata"]:
65
- update_required = True
66
- row["metadata"] = {
67
- "content": row.get("content"),
68
- "content_metadata": {
69
- "type": content_type.name.lower(),
70
- },
71
- "error_metadata": None,
72
- "audio_metadata": (
73
- None if content_type != ContentTypeEnum.AUDIO else {"audio_type": row["document_type"]}
74
- ),
75
- "image_metadata": (
76
- None if content_type != ContentTypeEnum.IMAGE else {"image_type": row["document_type"]}
77
- ),
78
- "source_metadata": {
79
- "source_id": row.get("source_id"),
80
- "source_name": row.get("source_name"),
81
- "source_type": row["document_type"],
82
- },
83
- "text_metadata": (None if content_type != ContentTypeEnum.TEXT else {"text_type": "document"}),
84
- }
85
- except Exception as inner_e:
86
- logger.exception("Failed to process row during metadata injection")
87
- raise inner_e
88
- rows.append(row)
89
-
90
- if update_required:
91
- docs = pd.DataFrame(rows)
92
- message.payload(docs)
93
- logger.info("Metadata injection updated payload with %d rows", len(docs))
94
- else:
95
- logger.info("No metadata update was necessary during metadata injection")
96
-
97
- return message