nv-ingest 2025.11.22.dev20251122__tar.gz → 2026.1.6.dev20260106__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (131)
  1. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/PKG-INFO +4 -2
  2. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/main.py +1 -1
  3. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/ingest.py +12 -7
  4. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/execution.py +6 -0
  5. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +9 -9
  6. nv_ingest-2026.1.6.dev20260106/nv_ingest/framework/orchestration/ray/stages/meta/udf_parallel_helper.py +64 -0
  7. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +72 -6
  8. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +40 -0
  9. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/config/replica_resolver.py +12 -2
  10. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +32 -17
  11. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/default_pipeline_impl.py +26 -8
  12. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/PKG-INFO +4 -2
  13. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/SOURCES.txt +1 -0
  14. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/requires.txt +3 -1
  15. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/pyproject.toml +3 -1
  16. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/LICENSE +0 -0
  17. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/MANIFEST.in +0 -0
  18. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/__init__.py +0 -0
  19. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/__init__.py +0 -0
  20. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/tracing.py +0 -0
  21. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/__init__.py +0 -0
  22. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/health.py +0 -0
  23. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/ingest.py +0 -0
  24. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v1/metrics.py +0 -0
  25. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/README.md +0 -0
  26. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/api/v2/__init__.py +0 -0
  27. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/__init__.py +0 -0
  28. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/__init__.py +0 -0
  29. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
  30. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
  31. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/execution/options.py +0 -0
  32. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
  33. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/dependent_services.py +0 -0
  34. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
  35. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/strategies.py +0 -0
  36. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/process/termination.py +0 -0
  37. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  38. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  39. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  40. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  41. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  42. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  43. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  44. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  45. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  46. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  47. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  48. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
  49. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
  50. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
  51. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  52. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  53. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  54. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  55. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  56. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
  57. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  58. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  59. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +0 -0
  60. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  61. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  62. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  63. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  64. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
  65. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  66. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  67. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  68. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  69. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  70. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  71. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  72. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  73. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  74. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  75. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  76. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  77. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
  78. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  79. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  80. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  81. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  82. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  83. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  84. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  85. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  86. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  87. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  88. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  89. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  90. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  91. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
  92. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  93. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
  94. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
  95. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  96. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  97. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  98. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/__init__.py +0 -0
  99. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  100. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  101. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  102. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  103. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  104. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  105. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  106. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  107. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  108. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  109. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  110. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/__init__.py +0 -0
  111. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  112. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  113. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
  114. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/__init__.py +0 -0
  115. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  116. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  117. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  118. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  119. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  120. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  121. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  122. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  123. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/__init__.py +0 -0
  124. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/config/__init__.py +0 -0
  125. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/config/loaders.py +0 -0
  126. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
  127. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/pipeline/pipeline_schema.py +0 -0
  128. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest/version.py +0 -0
  129. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/dependency_links.txt +0 -0
  130. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/nv_ingest.egg-info/top_level.txt +0 -0
  131. {nv_ingest-2025.11.22.dev20251122 → nv_ingest-2026.1.6.dev20260106}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.11.22.dev20251122
3
+ Version: 2026.1.6.dev20260106
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -219,13 +219,15 @@ Requires-Dist: diskcache>=5.6.3
219
219
  Requires-Dist: fastapi>=0.115.6
220
220
  Requires-Dist: fastparquet>=2024.11.0
221
221
  Requires-Dist: fsspec>=2024.10.0
222
+ Requires-Dist: universal_pathlib>=0.2.6
223
+ Requires-Dist: s3fs>=2024.10.0
222
224
  Requires-Dist: gunicorn
223
225
  Requires-Dist: h11>=0.16.0
224
226
  Requires-Dist: httpx>=0.28.1
225
227
  Requires-Dist: isodate>=0.7.2
226
228
  Requires-Dist: langdetect>=1.0.9
227
229
  Requires-Dist: minio>=7.2.12
228
- Requires-Dist: librosa>=0.10.2
230
+ Requires-Dist: librosa==0.10.2
229
231
  Requires-Dist: opentelemetry-api>=1.27.0
230
232
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
231
233
  Requires-Dist: opentelemetry-sdk>=1.27.0
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
23
23
  app = FastAPI(
24
24
  title="NV-Ingest Microservice",
25
25
  description="Service for ingesting heterogenous datatypes",
26
- version="25.4.2",
26
+ version="26.1.0",
27
27
  contact={
28
28
  "name": "NVIDIA Corporation",
29
29
  "url": "https://nvidia.com",
@@ -122,11 +122,16 @@ def get_pdf_split_page_count(client_override: Optional[int] = None) -> int:
122
122
  )
123
123
  return DEFAULT_PDF_SPLIT_PAGE_COUNT
124
124
 
125
- if parsed <= 0:
126
- logger.warning("PDF_SPLIT_PAGE_COUNT must be >= 1; received %s. Using 1.", parsed)
127
- return 1
128
-
129
- return parsed
125
+ clamped = max(MIN_PAGES, min(parsed, MAX_PAGES))
126
+ if clamped != parsed:
127
+ logger.warning(
128
+ "Env PDF_SPLIT_PAGE_COUNT=%s clamped to %s (min=%s, max=%s)",
129
+ parsed,
130
+ clamped,
131
+ MIN_PAGES,
132
+ MAX_PAGES,
133
+ )
134
+ return clamped
130
135
 
131
136
 
132
137
  def split_pdf_to_chunks(pdf_content: bytes, pages_per_chunk: int) -> List[Dict[str, Any]]:
@@ -955,7 +960,7 @@ async def submit_job_v2(
955
960
  "subjob_order": subjob_ids,
956
961
  }
957
962
  )
958
- elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
963
+ elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav", "mkv"]:
959
964
  document_type = document_types[0]
960
965
  upload_path = f"./{Path(original_source_id).name}"
961
966
  # dump the payload to a file, just came from client
@@ -1003,7 +1008,7 @@ async def submit_job_v2(
1003
1008
  "page_count": chunk.get("page_count", 0),
1004
1009
  }
1005
1010
  )
1006
- logger.error(f"Removing uploaded file {upload_path}")
1011
+ logger.debug(f"Removing uploaded file {upload_path}")
1007
1012
  os.remove(upload_path)
1008
1013
 
1009
1014
  if submission_items:
@@ -162,6 +162,11 @@ def build_logging_config_from_env() -> LoggingConfig:
162
162
  if key not in os.environ:
163
163
  os.environ[key] = default_value
164
164
 
165
+ # For PRODUCTION mode, also suppress nv-ingest module INFO logs
166
+ if preset_level == "PRODUCTION":
167
+ logging.getLogger("nv_ingest").setLevel(logging.WARNING)
168
+ logging.getLogger("nv_ingest_api").setLevel(logging.WARNING)
169
+
165
170
  logger.info(f"Applied Ray logging preset: {preset_level}")
166
171
 
167
172
  # Get log level from environment, default to INFO
@@ -324,6 +329,7 @@ def launch_pipeline(
324
329
  pipeline_config = resolve_static_replicas(pipeline_config)
325
330
 
326
331
  # Pretty print the final pipeline configuration (after replica resolution)
332
+ # INFO level so it shows in docker/helm deployments; quiet mode suppresses in library mode
327
333
  pretty_output = pretty_print_pipeline_config(pipeline_config, config_path=None)
328
334
  logger.info("\n" + pretty_output)
329
335
 
@@ -150,7 +150,7 @@ if __name__ == "__main__":
150
150
  os.environ["OCR_GRPC_ENDPOINT"] = "localhost:8010"
151
151
  os.environ["OCR_INFER_PROTOCOL"] = "grpc"
152
152
  os.environ["OCR_MODEL_NAME"] = "paddle"
153
- os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
153
+ os.environ["NEMOTRON_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
154
154
  os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
155
155
  os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
156
156
  logger.info("Environment variables set.")
@@ -170,23 +170,23 @@ if __name__ == "__main__":
170
170
  yolox_graphic_elements_auth,
171
171
  yolox_graphic_elements_protocol,
172
172
  ) = get_nim_service("yolox_graphic_elements")
173
- nemoretriever_parse_grpc, nemoretriever_parse_http, nemoretriever_parse_auth, nemoretriever_parse_protocol = (
174
- get_nim_service("nemoretriever_parse")
173
+ nemotron_parse_grpc, nemotron_parse_http, nemotron_parse_auth, nemotron_parse_protocol = get_nim_service(
174
+ "nemotron_parse"
175
175
  )
176
176
  ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
177
177
 
178
- model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")
178
+ model_name = os.environ.get("NEMOTRON_PARSE_MODEL_NAME", "nvidia/nemotron-parse")
179
179
  pdf_extractor_config = {
180
180
  "pdfium_config": {
181
181
  "auth_token": yolox_auth, # All auth tokens are the same for the moment
182
182
  "yolox_endpoints": (yolox_grpc, yolox_http),
183
183
  "yolox_infer_protocol": yolox_protocol,
184
184
  },
185
- "nemoretriever_parse_config": {
186
- "auth_token": nemoretriever_parse_auth,
187
- "nemoretriever_parse_endpoints": (nemoretriever_parse_grpc, nemoretriever_parse_http),
188
- "nemoretriever_parse_infer_protocol": nemoretriever_parse_protocol,
189
- "nemoretriever_parse_model_name": model_name,
185
+ "nemotron_parse_config": {
186
+ "auth_token": nemotron_parse_auth,
187
+ "nemotron_parse_endpoints": (nemotron_parse_grpc, nemotron_parse_http),
188
+ "nemotron_parse_infer_protocol": nemotron_parse_protocol,
189
+ "nemotron_parse_model_name": model_name,
190
190
  "yolox_endpoints": (yolox_grpc, yolox_http),
191
191
  "yolox_infer_protocol": yolox_protocol,
192
192
  },
@@ -0,0 +1,64 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ # Added this no-op UDF ray stage to the pipeline to help speed up the LLM api calls
6
+
7
+ """
8
+ UDF Parallel Stage - A high-concurrency no-op stage for parallel UDF execution.
9
+
10
+ This stage does nothing except pass messages through, but with high replica count
11
+ it provides a parallel execution pool for UDFs to achieve N-way concurrency.
12
+ """
13
+
14
+ import logging
15
+ from typing import Any, Optional
16
+ from pydantic import BaseModel
17
+ import ray
18
+
19
+ from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
20
+ from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
21
+ from nv_ingest_api.internal.primitives.tracing.tagging import traceable
22
+ from nv_ingest_api.util.exception_handlers.decorators import (
23
+ nv_ingest_node_failure_try_except,
24
+ )
25
+
26
+ logger = logging.getLogger(__name__)
27
+
28
+
29
+ @ray.remote
30
+ class UDFParallelStage(RayActorStage):
31
+ """
32
+ A no-op pass-through stage designed for parallel UDF execution.
33
+
34
+ This stage simply returns the input message unchanged, but when configured
35
+ with multiple replicas, it provides a high-concurrency pool for UDFs to
36
+ achieve parallel execution without blocking.
37
+ """
38
+
39
+ def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
40
+ super().__init__(config, stage_name=stage_name)
41
+ logger.info(f"UDFParallelStage initialized: {stage_name}")
42
+
43
+ @nv_ingest_node_failure_try_except()
44
+ @traceable()
45
+ @udf_intercept_hook()
46
+ def on_data(self, message: Any) -> Any:
47
+ """
48
+ Pass-through processing that simply returns the message unchanged.
49
+
50
+ The @udf_intercept_hook decorator allows UDFs to target this stage,
51
+ and multiple replicas provide parallel execution capacity.
52
+
53
+ Parameters
54
+ ----------
55
+ message : Any
56
+ The incoming control message.
57
+
58
+ Returns
59
+ -------
60
+ Any
61
+ The unmodified control message.
62
+ """
63
+ # No-op: just return the message
64
+ return message
@@ -3,7 +3,9 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ import os
6
7
  from typing import Dict, Any, Optional
8
+ from urllib.parse import urlparse
7
9
 
8
10
  import pandas as pd
9
11
  import ray
@@ -26,7 +28,8 @@ logger = logging.getLogger(__name__)
26
28
  @ray.remote
27
29
  class ImageStorageStage(RayActorStage):
28
30
  """
29
- A Ray actor stage that stores images or structured content in MinIO and updates metadata with storage URLs.
31
+ A Ray actor stage that stores images or structured content using an fsspec-compatible backend and updates
32
+ metadata with storage URLs.
30
33
 
31
34
  This stage uses the validated configuration (ImageStorageModuleSchema) to process and store the DataFrame
32
35
  payload and updates the control message accordingly.
@@ -69,8 +72,16 @@ class ImageStorageStage(RayActorStage):
69
72
  task_config = remove_task_by_type(control_message, "store")
70
73
  # logger.debug("ImageStorageStage: Task configuration extracted: %s", pprint.pformat(task_config))
71
74
 
72
- store_structured: bool = task_config.get("structured", True)
73
- store_unstructured: bool = task_config.get("images", False)
75
+ stage_defaults = {
76
+ "structured": self.validated_config.structured,
77
+ "images": self.validated_config.images,
78
+ "storage_uri": self.validated_config.storage_uri,
79
+ "storage_options": self.validated_config.storage_options,
80
+ "public_base_url": self.validated_config.public_base_url,
81
+ }
82
+
83
+ store_structured: bool = task_config.get("structured", stage_defaults["structured"])
84
+ store_unstructured: bool = task_config.get("images", stage_defaults["images"])
74
85
 
75
86
  content_types: Dict[Any, Any] = {}
76
87
  if store_structured:
@@ -80,14 +91,34 @@ class ImageStorageStage(RayActorStage):
80
91
  content_types[ContentTypeEnum.IMAGE] = store_unstructured
81
92
 
82
93
  params: Dict[str, Any] = task_config.get("params", {})
83
- params["content_types"] = content_types
84
94
 
85
- logger.debug(f"Processing storage task with parameters: {params}")
95
+ storage_uri = task_config.get("storage_uri") or params.get("storage_uri") or stage_defaults["storage_uri"]
96
+ storage_options = {
97
+ **(stage_defaults["storage_options"] or {}),
98
+ **(task_config.get("storage_options") or {}),
99
+ **params.get("storage_options", {}),
100
+ }
101
+ if "public_base_url" in task_config:
102
+ public_base_url = task_config["public_base_url"]
103
+ else:
104
+ public_base_url = params.get("public_base_url", stage_defaults["public_base_url"])
105
+
106
+ storage_options = self._inject_storage_defaults(storage_uri, storage_options)
107
+
108
+ storage_params: Dict[str, Any] = {
109
+ "content_types": content_types,
110
+ "storage_uri": storage_uri,
111
+ "storage_options": storage_options,
112
+ }
113
+ if public_base_url:
114
+ storage_params["public_base_url"] = public_base_url
115
+
116
+ logger.debug("Processing storage task with parameters: %s", storage_params)
86
117
 
87
118
  # Store images or structured content.
88
119
  df_storage_ledger: pd.DataFrame = store_images_to_minio_internal(
89
120
  df_storage_ledger=df_payload,
90
- task_config=params,
121
+ task_config=storage_params,
91
122
  storage_config={},
92
123
  execution_trace_log=None,
93
124
  )
@@ -98,3 +129,38 @@ class ImageStorageStage(RayActorStage):
98
129
  control_message.payload(df_storage_ledger)
99
130
 
100
131
  return control_message
132
+
133
+ @staticmethod
134
+ def _inject_storage_defaults(storage_uri: str, storage_options: Dict[str, Any]) -> Dict[str, Any]:
135
+ """
136
+ Populate storage options for common backends (e.g., MinIO/S3) using environment defaults.
137
+ """
138
+ parsed_scheme = urlparse(storage_uri).scheme.lower()
139
+ merged_options: Dict[str, Any] = {k: v for k, v in storage_options.items() if v is not None}
140
+
141
+ if parsed_scheme not in {"s3", "s3a", "s3n"}:
142
+ return merged_options
143
+
144
+ def _set_if_absent(key: str, env_var: str) -> None:
145
+ if key not in merged_options and env_var in os.environ:
146
+ merged_options[key] = os.environ[env_var]
147
+
148
+ _set_if_absent("key", "MINIO_ACCESS_KEY")
149
+ _set_if_absent("secret", "MINIO_SECRET_KEY")
150
+ if "token" not in merged_options and os.environ.get("MINIO_SESSION_TOKEN"):
151
+ merged_options["token"] = os.environ["MINIO_SESSION_TOKEN"]
152
+
153
+ client_kwargs = dict(merged_options.get("client_kwargs", {}))
154
+ endpoint = os.environ.get("MINIO_INTERNAL_ADDRESS")
155
+ if not endpoint:
156
+ endpoint = "http://minio:9000"
157
+ if endpoint and not endpoint.startswith(("http://", "https://")):
158
+ endpoint = f"http://{endpoint}"
159
+ client_kwargs.setdefault("endpoint_url", endpoint)
160
+ region = os.environ.get("MINIO_REGION")
161
+ if region:
162
+ client_kwargs.setdefault("region_name", region)
163
+ if client_kwargs:
164
+ merged_options["client_kwargs"] = client_kwargs
165
+
166
+ return merged_options
@@ -3,6 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ import os
6
7
  from typing import Union, Optional, TextIO
7
8
 
8
9
 
@@ -23,6 +24,34 @@ from nv_ingest.framework.orchestration.execution.helpers import (
23
24
  logger = logging.getLogger(__name__)
24
25
 
25
26
 
27
+ def _configure_quiet_mode():
28
+ """
29
+ Configure environment for quiet/production logging in library mode.
30
+
31
+ Sets INGEST_RAY_LOG_LEVEL=PRODUCTION if not already set by user, which:
32
+ - Sets Ray logging to ERROR level (suppresses INFO/WARNING)
33
+ - Disables Ray usage stats collection
34
+ - Disables Ray import warnings
35
+
36
+ Also silences other common warnings that are noisy in library mode.
37
+ """
38
+ # Only set if user hasn't explicitly configured
39
+ if "INGEST_RAY_LOG_LEVEL" not in os.environ:
40
+ os.environ["INGEST_RAY_LOG_LEVEL"] = "PRODUCTION"
41
+
42
+ # Silence Ray accelerator env var warning
43
+ if "RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO" not in os.environ:
44
+ os.environ["RAY_ACCEL_ENV_VAR_OVERRIDE_ON_ZERO"] = "0"
45
+
46
+ # Disable OTEL tracing export errors (no collector expected in library mode)
47
+ if "OTEL_SDK_DISABLED" not in os.environ:
48
+ os.environ["OTEL_SDK_DISABLED"] = "true"
49
+
50
+ # Set nv-ingest module loggers to WARNING to suppress INFO level startup messages
51
+ logging.getLogger("nv_ingest").setLevel(logging.WARNING)
52
+ logging.getLogger("nv_ingest_api").setLevel(logging.WARNING)
53
+
54
+
26
55
  def run_pipeline(
27
56
  pipeline_config: Optional[PipelineConfigSchema] = None,
28
57
  block: bool = True,
@@ -32,6 +61,7 @@ def run_pipeline(
32
61
  stdout: Optional[TextIO] = None,
33
62
  stderr: Optional[TextIO] = None,
34
63
  libmode: bool = True,
64
+ quiet: Optional[bool] = None,
35
65
  ) -> Union[RayPipelineInterface, float, RayPipelineSubprocessInterface]:
36
66
  """
37
67
  Launch and manage a pipeline using configuration.
@@ -65,6 +95,10 @@ def run_pipeline(
65
95
  libmode : bool, default=True
66
96
  If True and pipeline_config is None, loads the default libmode pipeline configuration.
67
97
  If False, requires pipeline_config to be provided.
98
+ quiet : Optional[bool], default=None
99
+ If True, configures logging for minimal output (PRODUCTION preset, suppresses
100
+ INFO-level startup messages). If None, defaults to True when libmode=True.
101
+ Set to False to see verbose startup logs even in library mode.
68
102
 
69
103
  Returns
70
104
  -------
@@ -83,6 +117,12 @@ def run_pipeline(
83
117
  Exception
84
118
  Any other exceptions raised during pipeline launch or configuration.
85
119
  """
120
+ # Configure quiet mode for library mode by default (unless explicitly disabled)
121
+ if quiet is None:
122
+ quiet = libmode
123
+ if quiet:
124
+ _configure_quiet_mode()
125
+
86
126
  # Resolve configuration
87
127
  config = resolve_pipeline_config(pipeline_config, libmode)
88
128
  overrides = create_runtime_overrides(disable_dynamic_scaling, dynamic_memory_threshold)
@@ -11,6 +11,7 @@ consumption stays within the static_memory_threshold.
11
11
  """
12
12
 
13
13
  import logging
14
+ import os
14
15
  from typing import List
15
16
  from copy import deepcopy
16
17
 
@@ -102,8 +103,17 @@ def resolve_static_replicas(pipeline_config: PipelineConfigSchema) -> PipelineCo
102
103
 
103
104
  logger.info(f"Total baseline memory demand: {total_memory_demand_mb}MB from {len(non_static_stages)} stages")
104
105
 
105
- # Check if we need to scale down
106
- if total_memory_demand_mb <= available_memory_mb:
106
+ # Optional bypass of global memory-based scale down via environment variable
107
+ bypass_env = os.getenv("NV_INGEST_BYPASS_STATIC_MEMORY_SCALE_DOWN", "").strip().lower()
108
+ bypass_scale_down = bypass_env in ("1", "true", "yes", "on")
109
+
110
+ # Check if we need to scale down (unless bypassed)
111
+ if bypass_scale_down:
112
+ logger.warning(
113
+ "Bypassing static memory-based replica scale-down due to NV_INGEST_BYPASS_STATIC_MEMORY_SCALE_DOWN"
114
+ )
115
+ scaling_factor = 1.0
116
+ elif total_memory_demand_mb <= available_memory_mb:
107
117
  logger.info("Memory demand within threshold, applying baseline replica counts")
108
118
  scaling_factor = 1.0
109
119
  else:
@@ -68,20 +68,20 @@ stages:
68
68
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
69
69
  yolox_endpoints: [
70
70
  $YOLOX_GRPC_ENDPOINT|"",
71
- $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
71
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
72
72
  ]
73
73
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
74
- nemoretriever_parse_config:
74
+ nemotron_parse_config:
75
75
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
76
- nemoretriever_parse_endpoints: [
77
- $NEMORETRIEVER_PARSE_GRPC_ENDPOINT|"",
78
- $NEMORETRIEVER_PARSE_HTTP_ENDPOINT|"https://integrate.api.nvidia.com/v1/chat/completions"
76
+ nemotron_parse_endpoints: [
77
+ $NEMOTRON_PARSE_GRPC_ENDPOINT|"",
78
+ $NEMOTRON_PARSE_HTTP_ENDPOINT|"https://integrate.api.nvidia.com/v1/chat/completions"
79
79
  ]
80
- nemoretriever_parse_infer_protocol: $NEMORETRIEVER_PARSE_INFER_PROTOCOL|http
81
- nemoretriever_parse_model_name: $NEMORETRIEVER_PARSE_MODEL_NAME|"nvidia/nemoretriever-parse"
80
+ nemotron_parse_infer_protocol: $NEMOTRON_PARSE_INFER_PROTOCOL|http
81
+ nemotron_parse_model_name: $NEMOTRON_PARSE_MODEL_NAME|"nvidia/nemotron-parse"
82
82
  yolox_endpoints: [
83
83
  $YOLOX_GRPC_ENDPOINT|"",
84
- $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
84
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
85
85
  ]
86
86
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
87
87
  replicas:
@@ -124,7 +124,14 @@ stages:
124
124
  docx_extraction_config:
125
125
  yolox_endpoints: [
126
126
  $YOLOX_GRPC_ENDPOINT|"",
127
- $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
127
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
128
+ ]
129
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
130
+ auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
131
+ pdfium_config:
132
+ yolox_endpoints: [
133
+ $YOLOX_GRPC_ENDPOINT|"",
134
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
128
135
  ]
129
136
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
130
137
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -145,7 +152,14 @@ stages:
145
152
  pptx_extraction_config:
146
153
  yolox_endpoints: [
147
154
  $YOLOX_GRPC_ENDPOINT|"",
148
- $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
155
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
156
+ ]
157
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
158
+ auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
159
+ pdfium_config:
160
+ yolox_endpoints: [
161
+ $YOLOX_GRPC_ENDPOINT|"",
162
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
149
163
  ]
150
164
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
151
165
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -166,7 +180,7 @@ stages:
166
180
  image_extraction_config:
167
181
  yolox_endpoints: [
168
182
  $YOLOX_GRPC_ENDPOINT|"",
169
- $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2"
183
+ $YOLOX_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v3"
170
184
  ]
171
185
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|http
172
186
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -201,7 +215,7 @@ stages:
201
215
  endpoint_config:
202
216
  ocr_endpoints: [
203
217
  $OCR_GRPC_ENDPOINT|"",
204
- $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
218
+ $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
205
219
  ]
206
220
  ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
207
221
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -227,9 +241,9 @@ stages:
227
241
  yolox_infer_protocol: $YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL|"http"
228
242
  ocr_endpoints: [
229
243
  $OCR_GRPC_ENDPOINT|"",
230
- $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
244
+ $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
231
245
  ]
232
- ocr_infer_protocol: $PADDLE_INFER_PROTOCOL|"http"
246
+ ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
233
247
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
234
248
  replicas:
235
249
  min_replicas: 0
@@ -254,7 +268,7 @@ stages:
254
268
  yolox_infer_protocol: $YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL|"http"
255
269
  ocr_endpoints: [
256
270
  $OCR_GRPC_ENDPOINT|"",
257
- $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/baidu/paddleocr"
271
+ $OCR_HTTP_ENDPOINT|"https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-ocr-v1"
258
272
  ]
259
273
  ocr_infer_protocol: $OCR_INFER_PROTOCOL|"http"
260
274
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -318,9 +332,10 @@ stages:
318
332
  actor: "nv_ingest.framework.orchestration.ray.stages.transforms.image_caption:ImageCaptionTransformStage"
319
333
  config:
320
334
  api_key: $NGC_API_KEY|$NVIDIA_API_KEY
321
- endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
335
+ endpoint_url: $VLM_CAPTION_ENDPOINT|"https://integrate.api.nvidia.com/v1/chat/completions"
322
336
  model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
323
- prompt: "Caption the content of this image:"
337
+ prompt: $VLM_CAPTION_PROMPT|"Caption the content of this image:"
338
+ system_prompt: $VLM_CAPTION_SYSTEM_PROMPT|"/no_think"
324
339
  replicas:
325
340
  min_replicas: 0
326
341
  max_replicas:
@@ -70,14 +70,14 @@ stages:
70
70
  $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
71
71
  ]
72
72
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
73
- nemoretriever_parse_config:
73
+ nemotron_parse_config:
74
74
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
75
- nemoretriever_parse_endpoints: [
76
- $NEMORETRIEVER_PARSE_GRPC_ENDPOINT|"",
77
- $NEMORETRIEVER_PARSE_HTTP_ENDPOINT|"http://nemoretriever-parse:8000/v1/chat/completions",
75
+ nemotron_parse_endpoints: [
76
+ $NEMOTRON_PARSE_GRPC_ENDPOINT|"",
77
+ $NEMOTRON_PARSE_HTTP_ENDPOINT|"http://nemotron-parse:8000/v1/chat/completions",
78
78
  ]
79
- nemoretriever_parse_infer_protocol: $NEMORETRIEVER_PARSE_INFER_PROTOCOL|http
80
- nemoretriever_parse_model_name: $NEMORETRIEVER_PARSE_MODEL_NAME|"nvidia/nemoretriever-parse"
79
+ nemotron_parse_infer_protocol: $NEMOTRON_PARSE_INFER_PROTOCOL|http
80
+ nemotron_parse_model_name: $NEMOTRON_PARSE_MODEL_NAME|"nvidia/nemotron-parse"
81
81
  yolox_endpoints: [
82
82
  $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
83
83
  $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
@@ -123,7 +123,14 @@ stages:
123
123
  docx_extraction_config:
124
124
  yolox_endpoints: [
125
125
  $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
126
- $YOLOX_HTTP_ENDPOINT|"",
126
+ $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
127
+ ]
128
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
129
+ auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
130
+ pdfium_config:
131
+ yolox_endpoints: [
132
+ $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
133
+ $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
127
134
  ]
128
135
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
129
136
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
@@ -148,6 +155,13 @@ stages:
148
155
  ]
149
156
  yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
150
157
  auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
158
+ pdfium_config:
159
+ yolox_endpoints: [
160
+ $YOLOX_GRPC_ENDPOINT|"page-elements:8001",
161
+ $YOLOX_HTTP_ENDPOINT|"http://page-elements:8000/v1/infer",
162
+ ]
163
+ yolox_infer_protocol: $YOLOX_INFER_PROTOCOL|grpc
164
+ auth_token: $NGC_API_KEY|$NVIDIA_API_KEY
151
165
  replicas:
152
166
  min_replicas: 0
153
167
  max_replicas:
@@ -340,7 +354,8 @@ stages:
340
354
  api_key: $NGC_API_KEY|$NVIDIA_API_KEY
341
355
  model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/nemotron-nano-12b-v2-vl"
342
356
  endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
343
- prompt: "Caption the content of this image:"
357
+ prompt: $VLM_CAPTION_PROMPT|"Caption the content of this image:"
358
+ system_prompt: $VLM_CAPTION_SYSTEM_PROMPT|"/no_think"
344
359
  replicas:
345
360
  min_replicas: 0
346
361
  max_replicas:
@@ -372,6 +387,9 @@ stages:
372
387
  type: "stage"
373
388
  phase: 5 # RESPONSE
374
389
  actor: "nv_ingest.framework.orchestration.ray.stages.storage.image_storage:ImageStorageStage"
390
+ config:
391
+ storage_uri: $IMAGE_STORAGE_URI|"s3://nv-ingest/artifacts/store/images"
392
+ public_base_url: $IMAGE_STORAGE_PUBLIC_BASE_URL|""
375
393
  replicas:
376
394
  min_replicas: 0
377
395
  max_replicas:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.11.22.dev20251122
3
+ Version: 2026.1.6.dev20260106
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -219,13 +219,15 @@ Requires-Dist: diskcache>=5.6.3
219
219
  Requires-Dist: fastapi>=0.115.6
220
220
  Requires-Dist: fastparquet>=2024.11.0
221
221
  Requires-Dist: fsspec>=2024.10.0
222
+ Requires-Dist: universal_pathlib>=0.2.6
223
+ Requires-Dist: s3fs>=2024.10.0
222
224
  Requires-Dist: gunicorn
223
225
  Requires-Dist: h11>=0.16.0
224
226
  Requires-Dist: httpx>=0.28.1
225
227
  Requires-Dist: isodate>=0.7.2
226
228
  Requires-Dist: langdetect>=1.0.9
227
229
  Requires-Dist: minio>=7.2.12
228
- Requires-Dist: librosa>=0.10.2
230
+ Requires-Dist: librosa==0.10.2
229
231
  Requires-Dist: opentelemetry-api>=1.27.0
230
232
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
231
233
  Requires-Dist: opentelemetry-sdk>=1.27.0
@@ -63,6 +63,7 @@ nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py
63
63
  nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py
64
64
  nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py
65
65
  nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py
66
+ nv_ingest/framework/orchestration/ray/stages/meta/udf_parallel_helper.py
66
67
  nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py
67
68
  nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py
68
69
  nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py
@@ -4,13 +4,15 @@ diskcache>=5.6.3
4
4
  fastapi>=0.115.6
5
5
  fastparquet>=2024.11.0
6
6
  fsspec>=2024.10.0
7
+ universal_pathlib>=0.2.6
8
+ s3fs>=2024.10.0
7
9
  gunicorn
8
10
  h11>=0.16.0
9
11
  httpx>=0.28.1
10
12
  isodate>=0.7.2
11
13
  langdetect>=1.0.9
12
14
  minio>=7.2.12
13
- librosa>=0.10.2
15
+ librosa==0.10.2
14
16
  opentelemetry-api>=1.27.0
15
17
  opentelemetry-exporter-otlp>=1.27.0
16
18
  opentelemetry-sdk>=1.27.0
@@ -26,13 +26,15 @@ dependencies = [
26
26
  "fastapi>=0.115.6",
27
27
  "fastparquet>=2024.11.0",
28
28
  "fsspec>=2024.10.0",
29
+ "universal_pathlib>=0.2.6",
30
+ "s3fs>=2024.10.0",
29
31
  "gunicorn",
30
32
  "h11>=0.16.0", # Must pin at or above 0.16.0 for CVE mitigation
31
33
  "httpx>=0.28.1",
32
34
  "isodate>=0.7.2",
33
35
  "langdetect>=1.0.9",
34
36
  "minio>=7.2.12",
35
- "librosa>=0.10.2",
37
+ "librosa==0.10.2",
36
38
  "opentelemetry-api>=1.27.0",
37
39
  "opentelemetry-exporter-otlp>=1.27.0",
38
40
  "opentelemetry-sdk>=1.27.0",