nv-ingest 2025.8.8.dev20250808__tar.gz → 2026.1.12.dev20260112__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (135) hide show
  1. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/PKG-INFO +6 -3
  2. nv_ingest-2026.1.12.dev20260112/nv_ingest/api/__init__.py +9 -0
  3. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/api/main.py +3 -1
  4. nv_ingest-2026.1.12.dev20260112/nv_ingest/api/tracing.py +82 -0
  5. nv_ingest-2026.1.12.dev20260112/nv_ingest/api/v2/README.md +203 -0
  6. nv_ingest-2026.1.12.dev20260112/nv_ingest/api/v2/ingest.py +1305 -0
  7. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/execution/helpers.py +85 -0
  8. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/execution/options.py +112 -0
  9. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/dependent_services.py +84 -0
  10. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/execution.py +501 -0
  11. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/lifecycle.py +214 -0
  12. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/strategies.py +218 -0
  13. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process/termination.py +147 -0
  14. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +12 -12
  15. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +4 -4
  16. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +32 -38
  17. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +5 -5
  18. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +10 -7
  19. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +17 -14
  20. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +11 -6
  21. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +10 -5
  22. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +12 -7
  23. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +22 -10
  24. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  25. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +19 -15
  26. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +10 -5
  27. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +16 -14
  28. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +16 -13
  29. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +3 -0
  30. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +3 -3
  31. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +92 -4
  32. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/meta/udf_parallel_helper.py +64 -0
  33. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +12 -8
  34. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +12 -9
  35. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +4 -4
  36. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +5 -2
  37. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +116 -69
  38. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +166 -0
  39. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +10 -5
  40. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +8 -4
  41. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +17 -7
  42. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +12 -6
  43. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +17 -14
  44. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +21 -14
  45. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +11 -3
  46. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +1 -2
  47. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +140 -0
  48. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +13 -3
  49. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/flow_control/udf_intercept.py +352 -0
  50. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +215 -11
  51. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service/meta/ingest/__init__.py +3 -0
  52. nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/telemetry/__init__.py +3 -0
  53. nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/__init__.py +3 -0
  54. nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/config/__init__.py +3 -0
  55. nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/config/loaders.py +229 -0
  56. nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/config/replica_resolver.py +237 -0
  57. nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/default_libmode_pipeline_impl.py +529 -0
  58. nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/default_pipeline_impl.py +558 -0
  59. nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/ingest_pipeline.py +389 -0
  60. nv_ingest-2026.1.12.dev20260112/nv_ingest/pipeline/pipeline_schema.py +398 -0
  61. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/PKG-INFO +6 -3
  62. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/SOURCES.txt +25 -3
  63. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/requires.txt +5 -2
  64. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/pyproject.toml +8 -2
  65. nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -98
  66. nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_builders.py +0 -359
  67. nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -393
  68. nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/pipeline/stage_builders.py +0 -649
  69. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/LICENSE +0 -0
  70. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/MANIFEST.in +0 -0
  71. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/__init__.py +0 -0
  72. {nv_ingest-2025.8.8.dev20250808/nv_ingest/api → nv_ingest-2026.1.12.dev20260112/nv_ingest/api/v1}/__init__.py +0 -0
  73. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/api/v1/health.py +0 -0
  74. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/api/v1/ingest.py +0 -0
  75. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/api/v1/metrics.py +0 -0
  76. {nv_ingest-2025.8.8.dev20250808/nv_ingest/api/v1 → nv_ingest-2026.1.12.dev20260112/nv_ingest/api/v2}/__init__.py +0 -0
  77. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/__init__.py +0 -0
  78. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/__init__.py +0 -0
  79. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/execution}/__init__.py +0 -0
  80. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/edges → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/process}/__init__.py +0 -0
  81. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/examples → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray}/__init__.py +0 -0
  82. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/primitives → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/edges}/__init__.py +0 -0
  83. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  84. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  85. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  86. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/examples}/__init__.py +0 -0
  87. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  88. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  89. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/extractors → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/primitives}/__init__.py +0 -0
  90. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  91. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  92. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/injectors → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages}/__init__.py +0 -0
  93. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/meta → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/extractors}/__init__.py +0 -0
  94. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/mutate → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/injectors}/__init__.py +0 -0
  95. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/sinks → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/meta}/__init__.py +0 -0
  96. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  97. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/sources → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/mutate}/__init__.py +0 -0
  98. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/storage → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/sinks}/__init__.py +0 -0
  99. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/telemetry → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/sources}/__init__.py +0 -0
  100. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/transforms → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/storage}/__init__.py +0 -0
  101. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/stages/utility → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/telemetry}/__init__.py +0 -0
  102. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  103. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/transforms}/__init__.py +0 -0
  104. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/pipeline → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/stages/utility}/__init__.py +0 -0
  105. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/orchestration/ray/util/system_tools → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/util}/__init__.py +0 -0
  106. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
  107. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/util/pipeline}/__init__.py +0 -0
  108. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/orchestration/ray/util/system_tools}/__init__.py +0 -0
  109. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  110. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  111. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/__init__.py +0 -0
  112. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  113. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  114. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  115. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  116. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  117. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  118. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  119. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  120. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  121. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  122. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  123. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service/impl → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util}/__init__.py +0 -0
  124. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  125. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  126. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service/impl/ingest → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service}/__init__.py +0 -0
  127. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service/meta → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service/impl}/__init__.py +0 -0
  128. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/service/meta → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service/impl}/ingest/__init__.py +0 -0
  129. {nv_ingest-2025.8.8.dev20250808/nv_ingest/framework/util/telemetry → nv_ingest-2026.1.12.dev20260112/nv_ingest/framework/util/service/meta}/__init__.py +0 -0
  130. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  131. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  132. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest/version.py +0 -0
  133. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/dependency_links.txt +0 -0
  134. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/nv_ingest.egg-info/top_level.txt +0 -0
  135. {nv_ingest-2025.8.8.dev20250808 → nv_ingest-2026.1.12.dev20260112}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.8.8.dev20250808
3
+ Version: 2026.1.12.dev20260112
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -219,16 +219,19 @@ Requires-Dist: diskcache>=5.6.3
219
219
  Requires-Dist: fastapi>=0.115.6
220
220
  Requires-Dist: fastparquet>=2024.11.0
221
221
  Requires-Dist: fsspec>=2024.10.0
222
+ Requires-Dist: universal_pathlib>=0.2.6
223
+ Requires-Dist: s3fs>=2024.10.0
222
224
  Requires-Dist: gunicorn
223
225
  Requires-Dist: h11>=0.16.0
224
226
  Requires-Dist: httpx>=0.28.1
225
227
  Requires-Dist: isodate>=0.7.2
226
228
  Requires-Dist: langdetect>=1.0.9
227
229
  Requires-Dist: minio>=7.2.12
228
- Requires-Dist: openai>=1.82.0
230
+ Requires-Dist: librosa==0.10.2
229
231
  Requires-Dist: opentelemetry-api>=1.27.0
230
232
  Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
231
233
  Requires-Dist: opentelemetry-sdk>=1.27.0
234
+ Requires-Dist: psutil>=7.1.0
232
235
  Requires-Dist: pydantic>2.0.0
233
236
  Requires-Dist: pydantic-settings>2.0.0
234
237
  Requires-Dist: pypdfium2==4.30.0
@@ -240,7 +243,7 @@ Requires-Dist: python-docx>=1.1.2
240
243
  Requires-Dist: python-dotenv>=1.0.1
241
244
  Requires-Dist: python-pptx>=1.0.2
242
245
  Requires-Dist: prometheus-client
243
- Requires-Dist: ray[all]>=2.37.0
246
+ Requires-Dist: ray[all]>=2.49.0
244
247
  Requires-Dist: redis>=5.2.1
245
248
  Requires-Dist: requests>=2.28.2
246
249
  Requires-Dist: scikit-learn>=1.6.0
@@ -0,0 +1,9 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """nv_ingest.api package."""
6
+
7
+ from .tracing import traced_endpoint # re-export for convenience
8
+
9
+ __all__ = ["traced_endpoint"]
@@ -15,6 +15,7 @@ from opentelemetry.sdk.trace.export import BatchSpanProcessor
15
15
  from .v1.health import router as HealthApiRouter
16
16
  from .v1.ingest import router as IngestApiRouter
17
17
  from .v1.metrics import router as MetricsApiRouter
18
+ from .v2.ingest import router as IngestApiRouterV2
18
19
 
19
20
  logger = logging.getLogger(__name__)
20
21
 
@@ -22,7 +23,7 @@ logger = logging.getLogger(__name__)
22
23
  app = FastAPI(
23
24
  title="NV-Ingest Microservice",
24
25
  description="Service for ingesting heterogenous datatypes",
25
- version="25.4.2",
26
+ version="26.1.0",
26
27
  contact={
27
28
  "name": "NVIDIA Corporation",
28
29
  "url": "https://nvidia.com",
@@ -33,6 +34,7 @@ app = FastAPI(
33
34
  app.include_router(IngestApiRouter, prefix="/v1")
34
35
  app.include_router(HealthApiRouter, prefix="/v1/health")
35
36
  app.include_router(MetricsApiRouter, prefix="/v1")
37
+ app.include_router(IngestApiRouterV2, prefix="/v2")
36
38
 
37
39
  # Set up the tracer provider and add a processor for exporting traces
38
40
  resource = Resource(attributes={"service.name": "nv-ingest"})
@@ -0,0 +1,82 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """HTTP endpoint tracing utilities."""
6
+
7
+ from functools import wraps
8
+ from inspect import iscoroutinefunction
9
+ from typing import Any, Callable, Optional, TypeVar
10
+
11
+ from fastapi import Request, Response
12
+ from opentelemetry import trace
13
+
14
+ F = TypeVar("F", bound=Callable[..., Any])
15
+
16
+ tracer = trace.get_tracer(__name__)
17
+
18
+
19
+ def traced_endpoint(name: Optional[str] = None) -> Callable[[F], F]:
20
+ """Wrap a FastAPI endpoint with a span whose name defaults to the function name.
21
+
22
+ The decorator preserves the wrapped callable's signature so FastAPI can continue
23
+ to perform dependency injection and generate OpenAPI documentation correctly.
24
+ """
25
+
26
+ def decorator(func: F) -> F:
27
+ span_name = name or func.__name__
28
+
29
+ if iscoroutinefunction(func):
30
+
31
+ @wraps(func)
32
+ async def async_wrapper(*args: Any, **kwargs: Any) -> Any:
33
+ with tracer.start_as_current_span(span_name) as span:
34
+ span.set_attribute("nv_ingest.endpoint", func.__qualname__)
35
+ _record_http_request(span, args, kwargs)
36
+ response = await func(*args, **kwargs)
37
+ _record_http_response(span, response)
38
+ return response
39
+
40
+ return async_wrapper # type: ignore[return-value]
41
+
42
+ @wraps(func)
43
+ def sync_wrapper(*args: Any, **kwargs: Any) -> Any:
44
+ with tracer.start_as_current_span(span_name) as span:
45
+ span.set_attribute("nv_ingest.endpoint", func.__qualname__)
46
+ _record_http_request(span, args, kwargs)
47
+ result = func(*args, **kwargs)
48
+ _record_http_response(span, result)
49
+ return result
50
+
51
+ return sync_wrapper # type: ignore[return-value]
52
+
53
+ return decorator
54
+
55
+
56
+ def _record_http_request(span, args: tuple[Any, ...], kwargs: dict[str, Any]) -> None:
57
+ request = _find_type(Request, args, kwargs)
58
+ if request is None:
59
+ return
60
+ span.set_attribute("http.method", request.method)
61
+ span.set_attribute("http.url", str(request.url))
62
+
63
+
64
+ def _record_http_response(span, response: Any) -> None:
65
+ maybe_response = response if isinstance(response, Response) else None
66
+ if maybe_response is None:
67
+ maybe_response = _find_type(Response, (response,), {})
68
+ if maybe_response is None:
69
+ return
70
+ span.set_attribute("http.status_code", maybe_response.status_code)
71
+
72
+
73
+ def _find_type(expected_type: type, args: tuple[Any, ...], kwargs: dict[str, Any]) -> Optional[Any]:
74
+ """Return the first argument matching ``expected_type`` from args or kwargs."""
75
+
76
+ for arg in args:
77
+ if isinstance(arg, expected_type):
78
+ return arg
79
+ for value in kwargs.values():
80
+ if isinstance(value, expected_type):
81
+ return value
82
+ return None
@@ -0,0 +1,203 @@
1
+ # NV-Ingest V2 API
2
+
3
+ ## Overview
4
+
5
+ The V2 API introduces automatic PDF splitting at the REST layer to improve processing throughput. When a multi-page PDF is submitted, it's automatically split into configurable multi-page chunks (default 32 pages) before being sent to the Redis service that then communicates with our Ray processing backend.
6
+
7
+ ## Key Changes from V1
8
+
9
+ 1. **Automatic PDF Splitting**: PDFs with more pages than the configured `PDF_SPLIT_PAGE_COUNT` are automatically split into multi-page chunks
10
+ 2. **Parent-Child Job Tracking**: Parent jobs maintain relationships with their subjobs via Redis
11
+ 3. **Transparent Aggregation**: Results are automatically aggregated when fetching parent jobs
12
+ 4. **Backward Compatible**: PDFs with page counts ≤ `PDF_SPLIT_PAGE_COUNT` behave identically to V1
13
+
14
+ ## How It Works
15
+
16
+ 1. **Submit**: When a PDF with pages exceeding `PDF_SPLIT_PAGE_COUNT` is submitted to `/v2/submit_job`:
17
+ - The PDF is split into page chunks (size determined by `PDF_SPLIT_PAGE_COUNT`)
18
+ - Each chunk becomes a subjob with a deterministic ID derived from the parent
19
+ - Source IDs are modified to maintain association: `document.pdf#page_1`
20
+ - Parent-child mapping is stored in Redis
21
+
22
+ 2. **Processing**: Each subjob is processed independently by Ray as a standalone PDF of at most `PDF_SPLIT_PAGE_COUNT` pages
23
+
24
+ 3. **Fetch**: When fetching the parent job via `/v2/fetch_job/{parent_id}`:
25
+ - Subjob states and results are retrieved concurrently (bounded by the Redis connection pool)
26
+ - If all complete, results are aggregated in original page order
27
+ - If any subjob is still pending, the endpoint returns HTTP 202 (processing)
28
+ - Failed chunks are noted without failing the entire job; metadata records which chunks failed
29
+
30
+
31
+ ## Client Library Features
32
+
33
+ ### Accessing Trace Metrics
34
+
35
+ The Python client library provides convenient access to trace metrics via the `return_traces` parameter:
36
+
37
+ ```python
38
+ from nv_ingest_client.client import Ingestor
39
+
40
+ ingestor = Ingestor(
41
+ message_client_hostname="localhost",
42
+ message_client_port=7670,
43
+ message_client_kwargs={"api_version": "v2"}
44
+ ).files("/path/to/pdfs").extract().embed()
45
+
46
+ # Get results with trace metrics
47
+ results, traces = ingestor.ingest(return_traces=True)
48
+
49
+ # Access timing for first document
50
+ pdf_time = traces[0]["trace::resident_time::pdf_extractor"] / 1e9
51
+ table_time = traces[0]["trace::resident_time::table_extractor"] / 1e9
52
+ print(f"PDF: {pdf_time:.2f}s, Tables: {table_time:.2f}s")
53
+ ```
54
+
55
+ **Note:** For split PDFs, `resident_time` represents aggregated compute time across all chunks. For non-split PDFs, it is computed client-side from entry/exit pairs.
56
+
57
+ ### Aggregated response
58
+
59
+ The fetch endpoint returns a JSON body shaped like the following:
60
+
61
+ ```json
62
+ {
63
+ "data": [...],
64
+ "status": "success",
65
+ "trace": {
66
+ "trace::entry::pdf_extractor": 1000,
67
+ "trace::exit::pdf_extractor": 2150,
68
+ "trace::resident_time::pdf_extractor": 250,
69
+ "trace::entry::table_extractor": 1200,
70
+ "trace::exit::table_extractor": 2300,
71
+ "trace::resident_time::table_extractor": 300
72
+ // ... parent-level aggregated traces only (clean, V1-compatible)
73
+ },
74
+ "annotations": {
75
+ "annotation::uuid-1": {"task_id": "pdf_extractor", "task_result": "SUCCESS"},
76
+ "annotation::uuid-2": {"task_id": "table_extractor", "task_result": "SUCCESS"}
77
+ // ... all annotations from all chunks (annotations have unique UUIDs)
78
+ },
79
+ "metadata": {
80
+ "parent_job_id": "<uuid>",
81
+ "total_pages": 320,
82
+ "pages_per_chunk": 32,
83
+ "original_source_id": "document.pdf",
84
+ "subjob_ids": ["...", "..."],
85
+ "subjobs_failed": 0,
86
+ "failed_subjobs": [],
87
+ "chunks": [
88
+ {
89
+ "job_id": "...",
90
+ "chunk_index": 1,
91
+ "start_page": 1,
92
+ "end_page": 32,
93
+ "page_count": 32
94
+ }
95
+ // ... additional chunks ...
96
+ ],
97
+ "trace_segments": [
98
+ {
99
+ "job_id": "...",
100
+ "chunk_index": 1,
101
+ "start_page": 1,
102
+ "end_page": 32,
103
+ "trace": {"trace::entry::pdf_extractor": 1.7599e18, ...}
104
+ }
105
+ // ... per-chunk trace details
106
+ ],
107
+ "annotation_segments": [
108
+ {
109
+ "job_id": "...",
110
+ "chunk_index": 1,
111
+ "start_page": 1,
112
+ "end_page": 32,
113
+ "annotations": {"annotation::uuid": {...}, ...}
114
+ }
115
+ // ... per-chunk annotation details
116
+ ]
117
+ }
118
+ }
119
+ ```
120
+
121
+ **Top-level trace and annotations** (V1 compatibility):
122
+ - `trace`: Contains **only parent-level aggregated traces** for clean V1 compatibility
123
+ - `trace::entry::<stage>` - Earliest entry time across all chunks
124
+ - `trace::exit::<stage>` - Latest exit time across all chunks
125
+ - `trace::resident_time::<stage>` - Sum of all chunk durations (total compute time)
126
+ - `annotations`: Merged annotations from all chunks (annotations have unique UUIDs so merge safely)
127
+ - These fields match V1 structure, allowing existing client code to work without modification
128
+
129
+ **Note:** Chunk-level trace details are available in `metadata.trace_segments[]` for granular analysis
130
+
131
+ **Parent-Level Trace Aggregation:**
132
+
133
+ For split PDFs, parent-level metrics are automatically computed for each stage (including nested stages):
134
+
135
+ - `trace::entry::<stage>` - Earliest entry time across all chunks (when first chunk entered stage)
136
+ - `trace::exit::<stage>` - Latest exit time across all chunks (when last chunk exited stage)
137
+ - `trace::resident_time::<stage>` - Sum of all chunk durations (total compute time in stage)
138
+
139
+ **Supports arbitrary nesting depth:**
140
+ - Simple: `trace::entry::pdf_extractor`
141
+ - Nested: `trace::entry::pdf_extractor::pdf_extraction::pdfium_pages_to_numpy_0`
142
+
143
+ **Example:**
144
+ ```json
145
+ {
146
+ "trace": {
147
+ "trace::entry::pdf_extractor": 1000,
148
+ "trace::exit::pdf_extractor": 2150,
149
+ "trace::resident_time::pdf_extractor": 250
150
+ // ... only parent-level aggregations (clean, concise)
151
+ },
152
+ "metadata": {
153
+ "trace_segments": [
154
+ {
155
+ "chunk_index": 1,
156
+ "start_page": 1,
157
+ "end_page": 32,
158
+ "trace": {
159
+ "trace::entry::pdf_extractor": 1000,
160
+ "trace::exit::pdf_extractor": 1100
161
+ }
162
+ },
163
+ {
164
+ "chunk_index": 2,
165
+ "trace": {
166
+ "trace::entry::pdf_extractor": 2000,
167
+ "trace::exit::pdf_extractor": 2150
168
+ }
169
+ }
170
+ ]
171
+ }
172
+ }
173
+ ```
174
+
175
+ **Note:** `resident_time` represents total compute time (sum of chunk durations), while `exit - entry` shows wall-clock span.
176
+
177
+ **Detailed metadata** (V2-specific):
178
+ - `trace_segments`: **Chunk-level trace data** with page ranges for granular per-chunk analysis
179
+ - `annotation_segments`: Per-chunk annotation data with page ranges
180
+ - Clients can correlate chunk data by matching `job_id` or `chunk_index` across arrays
181
+ - Failed chunk entries remain in `failed_subjobs`; missing chunks indicate the sink did not emit telemetry
182
+ - **To access chunk traces:** Use `metadata.trace_segments[]` - each segment contains the full trace dict for that chunk
183
+
184
+ ### Advanced: Accessing Full Metadata
185
+
186
+ For advanced use cases requiring per-chunk trace breakdown or full metadata, use `include_parent_trace_ids`:
187
+
188
+ ```python
189
+ results, traces, parent_trace_ids = ingestor.ingest(
190
+ return_traces=True,
191
+ include_parent_trace_ids=True
192
+ )
193
+
194
+ # Fetch full parent job metadata (including trace_segments)
195
+ import requests
196
+ response = requests.get(f"http://localhost:7670/v2/fetch_job/{parent_trace_ids[0]}")
197
+ metadata = response.json()["metadata"]
198
+
199
+ # Access per-chunk traces
200
+ for segment in metadata["trace_segments"]:
201
+ print(f"Chunk {segment['chunk_index']}: pages {segment['start_page']}-{segment['end_page']}")
202
+ print(f" Traces: {len(segment['trace'])} entries")
203
+ ```