nv-ingest 2025.11.29.dev20251129__tar.gz → 2025.12.8.dev20251208__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (130)
  1. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/PKG-INFO +3 -1
  2. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +72 -6
  3. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +17 -3
  4. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/pipeline/default_pipeline_impl.py +18 -1
  5. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest.egg-info/PKG-INFO +3 -1
  6. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest.egg-info/requires.txt +2 -0
  7. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/pyproject.toml +2 -0
  8. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/LICENSE +0 -0
  9. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/MANIFEST.in +0 -0
  10. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/__init__.py +0 -0
  11. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/__init__.py +0 -0
  12. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/main.py +0 -0
  13. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/tracing.py +0 -0
  14. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/v1/__init__.py +0 -0
  15. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/v1/health.py +0 -0
  16. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/v1/ingest.py +0 -0
  17. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/v1/metrics.py +0 -0
  18. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/v2/README.md +0 -0
  19. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/v2/__init__.py +0 -0
  20. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/api/v2/ingest.py +0 -0
  21. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/__init__.py +0 -0
  22. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/__init__.py +0 -0
  23. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
  24. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
  25. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/execution/options.py +0 -0
  26. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
  27. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/process/dependent_services.py +0 -0
  28. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/process/execution.py +0 -0
  29. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
  30. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/process/strategies.py +0 -0
  31. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/process/termination.py +0 -0
  32. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  33. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  34. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  35. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  36. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  37. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  38. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
  39. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  40. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  41. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  42. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  43. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  44. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
  45. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
  46. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
  47. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  48. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  49. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  50. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  51. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  52. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
  53. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  54. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  55. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +0 -0
  56. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  57. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  58. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  59. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  60. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
  61. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  62. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  63. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  64. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  65. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  66. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  67. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  68. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  69. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  70. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  71. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  72. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  73. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
  74. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  75. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  76. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  77. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  78. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  79. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  80. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  81. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  82. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  83. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  84. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  85. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  86. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  87. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
  88. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  89. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
  90. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -0
  91. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
  92. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  93. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  94. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  95. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/__init__.py +0 -0
  96. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  97. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  98. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  99. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  100. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  101. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  102. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  103. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  104. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  105. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  106. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  107. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/__init__.py +0 -0
  108. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  109. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  110. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
  111. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/service/__init__.py +0 -0
  112. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  113. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  114. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  115. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  116. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  117. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  118. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  119. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  120. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/pipeline/__init__.py +0 -0
  121. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/pipeline/config/__init__.py +0 -0
  122. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/pipeline/config/loaders.py +0 -0
  123. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/pipeline/config/replica_resolver.py +0 -0
  124. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
  125. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/pipeline/pipeline_schema.py +0 -0
  126. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest/version.py +0 -0
  127. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest.egg-info/SOURCES.txt +0 -0
  128. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest.egg-info/dependency_links.txt +0 -0
  129. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/nv_ingest.egg-info/top_level.txt +0 -0
  130. {nv_ingest-2025.11.29.dev20251129 → nv_ingest-2025.12.8.dev20251208}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.11.29.dev20251129
3
+ Version: 2025.12.8.dev20251208
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -219,6 +219,8 @@ Requires-Dist: diskcache>=5.6.3
219
219
  Requires-Dist: fastapi>=0.115.6
220
220
  Requires-Dist: fastparquet>=2024.11.0
221
221
  Requires-Dist: fsspec>=2024.10.0
222
+ Requires-Dist: universal_pathlib>=0.2.6
223
+ Requires-Dist: s3fs>=2024.10.0
222
224
  Requires-Dist: gunicorn
223
225
  Requires-Dist: h11>=0.16.0
224
226
  Requires-Dist: httpx>=0.28.1
@@ -3,7 +3,9 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ import os
6
7
  from typing import Dict, Any, Optional
8
+ from urllib.parse import urlparse
7
9
 
8
10
  import pandas as pd
9
11
  import ray
@@ -26,7 +28,8 @@ logger = logging.getLogger(__name__)
26
28
  @ray.remote
27
29
  class ImageStorageStage(RayActorStage):
28
30
  """
29
- A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.
31
+ A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
32
+ metadata with storage URLs.
30
33
 
31
34
  This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
32
35
  payload and updates the control message accordingly.
@@ -69,8 +72,16 @@ class ImageStorageStage(RayActorStage):
69
72
  task_config = remove_task_by_type(control_message, "store")
70
73
  # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
71
74
 
72
- store_structured: bool = task_config.get("structured", True)
73
- store_unstructured: bool = task_config.get("images", False)
75
+ stage_defaults = {
76
+ "structured": self.validated_config.structured,
77
+ "images": self.validated_config.images,
78
+ "storage_uri": self.validated_config.storage_uri,
79
+ "storage_options": self.validated_config.storage_options,
80
+ "public_base_url": self.validated_config.public_base_url,
81
+ }
82
+
83
+ store_structured: bool = task_config.get("structured", stage_defaults["structured"])
84
+ store_unstructured: bool = task_config.get("images", stage_defaults["images"])
74
85
 
75
86
  content_types: Dict[Any, Any] = {}
76
87
  if store_structured:
@@ -80,14 +91,34 @@ class ImageStorageStage(RayActorStage):
80
91
  content_types[ContentTypeEnum.IMAGE] = store_unstructured
81
92
 
82
93
  params: Dict[str, Any] = task_config.get("params", {})
83
- params["content_types"] = content_types
84
94
 
85
- logger.debug(f"Processing storage task with parameters: {params}")
95
+ storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
96
+ storage_options = {
97
+ **(stage_defaults["storage_options"] or {}),
98
+ **(task_config.get("storage_options") or {}),
99
+ **params.get("storage_options", {}),
100
+ }
101
+ if "public_base_url" in task_config:
102
+ public_base_url = task_config["public_base_url"]
103
+ else:
104
+ public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
105
+
106
+ storage_options = self._inject_storage_defaults(storage_uri, storage_options)
107
+
108
+ storage_params: Dict[str, Any] = {
109
+ "content_types": content_types,
110
+ "storage_uri": storage_uri,
111
+ "storage_options": storage_options,
112
+ }
113
+ if public_base_url:
114
+ storage_params["public_base_url"] = public_base_url
115
+
116
+ logger.debug("Processing storage task with parameters: %s", storage_params)
86
117
 
87
118
  # Store images or structured content.
88
119
  df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
89
120
  df_storage_ledger=df_payload,
90
- task_config=params,
121
+ task_config=storage_params,
91
122
  storage_config={},
92
123
  execution_trace_log=None,
93
124
  )
@@ -98,3 +129,38 @@ class ImageStorageStage(RayActorStage):
98
129
  control_message.payload(df_storage_ledger)
99
130
 
100
131
  return control_message
132
+
133
+ @staticmethod
134
+ def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
135
+ """
136
+ Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
137
+ """
138
+ parsed_scheme = urlparse(storage_uri).scheme.lower()
139
+ merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
140
+
141
+ if parsed_scheme not in {"s3", "s3a", "s3n"}:
142
+ return merged_options
143
+
144
+ def _set_if_absent(key: str, env_var: str) -> None:
145
+ if key not in merged_options and env_var in os.environ:
146
+ merged_options[key] = os.environ[env_var]
147
+
148
+ _set_if_absent("key", "MINIO_ACCESS_KEY")
149
+ _set_if_absent("secret", "MINIO_SECRET_KEY")
150
+ if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
151
+ merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
152
+
153
+ client_kwargs = dict(merged_options.get("client_kwargs", {}))
154
+ endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
155
+ if not endpoint:
156
+ endpoint = "http://minio:9000"
157
+ if endpoint and not endpoint.startswith(("http://", "https://")):
158
+ endpoint = f"http://{endpoint}"
159
+ client_kwargs.setdefault("endpoint_url", endpoint)
160
+ region = os.environ.get("MINIO_REGION")
161
+ if region:
162
+ client_kwargs.setdefault("region_name", region)
163
+ if client_kwargs:
164
+ merged_options["client_kwargs"] = client_kwargs
165
+
166
+ return merged_options
@@ -128,6 +128,13 @@ stages:
128
128
  ]
129
129
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
130
130
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
131
+ pdfium_config:
132
+ yolox_endpoints: [
133
+ $YOLOX_GRPC_ENDPOINT|"",
134
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
135
+ ]
136
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
137
+ auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
131
138
  replicas:
132
139
  min_replicas: 0
133
140
  max_replicas:
@@ -149,6 +156,13 @@ stages:
149
156
  ]
150
157
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
151
158
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
159
+ pdfium_config:
160
+ yolox_endpoints: [
161
+ $YOLOX_GRPC_ENDPOINT|"",
162
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
163
+ ]
164
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
165
+ auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
152
166
  replicas:
153
167
  min_replicas: 0
154
168
  max_replicas:
@@ -201,7 +215,7 @@ stages:
201
215
  endpoint_config:
202
216
  ocr_endpoints: [
203
217
  $OCR_GRPC_ENDPOINT|"",
204
- $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
218
+ $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
205
219
  ]
206
220
  ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
207
221
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -227,7 +241,7 @@ stages:
227
241
  yolox_infer_protocol: $YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL|"http"
228
242
  ocr_endpoints: [
229
243
  $OCR_GRPC_ENDPOINT|"",
230
- $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
244
+ $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
231
245
  ]
232
246
  ocr_infer_protocol: $PADDLE_INFER_PROTOCOL|"http"
233
247
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -254,7 +268,7 @@ stages:
254
268
  yolox_infer_protocol: $YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL|"http"
255
269
  ocr_endpoints: [
256
270
  $OCR_GRPC_ENDPOINT|"",
257
- $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
271
+ $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
258
272
  ]
259
273
  ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
260
274
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -123,7 +123,14 @@ stages:
123
123
  docx_extraction_config:
124
124
  yolox_endpoints: [
125
125
  $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
126
- $YOLOX_HTTP_ENDPOINT|"",
126
+ $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
127
+ ]
128
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
129
+ auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
130
+ pdfium_config:
131
+ yolox_endpoints: [
132
+ $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
133
+ $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
127
134
  ]
128
135
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
129
136
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -148,6 +155,13 @@ stages:
148
155
  ]
149
156
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
150
157
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
158
+ pdfium_config:
159
+ yolox_endpoints: [
160
+ $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
161
+ $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
162
+ ]
163
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
164
+ auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
151
165
  replicas:
152
166
  min_replicas: 0
153
167
  max_replicas:
@@ -372,6 +386,9 @@ stages:
372
386
  type: "stage"
373
387
  phase: 5 # RESPONSE
374
388
  actor: "nv_ingest.framework.orchestration.ray.stages.storage.image_storage:ImageStorageStage"
389
+ config:
390
+ storage_uri: $IMAGE_STORAGE_URI|"s3://nv-ingest/artifacts/store/images"
391
+ public_base_url: $IMAGE_STORAGE_PUBLIC_BASE_URL|""
375
392
  replicas:
376
393
  min_replicas: 0
377
394
  max_replicas:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.11.29.dev20251129
3
+ Version: 2025.12.8.dev20251208
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -219,6 +219,8 @@ Requires-Dist: diskcache>=5.6.3
219
219
  Requires-Dist: fastapi>=0.115.6
220
220
  Requires-Dist: fastparquet>=2024.11.0
221
221
  Requires-Dist: fsspec>=2024.10.0
222
+ Requires-Dist: universal_pathlib>=0.2.6
223
+ Requires-Dist: s3fs>=2024.10.0
222
224
  Requires-Dist: gunicorn
223
225
  Requires-Dist: h11>=0.16.0
224
226
  Requires-Dist: httpx>=0.28.1
@@ -4,6 +4,8 @@ diskcache>=5.6.3
4
4
  fastapi>=0.115.6
5
5
  fastparquet>=2024.11.0
6
6
  fsspec>=2024.10.0
7
+ universal_pathlib>=0.2.6
8
+ s3fs>=2024.10.0
7
9
  gunicorn
8
10
  h11>=0.16.0
9
11
  httpx>=0.28.1
@@ -26,6 +26,8 @@ dependencies = [
26
26
  "fastapi>=0.115.6",
27
27
  "fastparquet>=2024.11.0",
28
28
  "fsspec>=2024.10.0",
29
+ "universal_pathlib>=0.2.6",
30
+ "s3fs>=2024.10.0",
29
31
  "gunicorn",
30
32
  "h11>=0.16.0", # Must pin at or above 0.16.0 for CVE mitigation
31
33
  "httpx>=0.28.1",