nv-ingest 2025.10.21.dev20251021__tar.gz → 2025.10.23.dev20251023__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (129) hide show
  1. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/PKG-INFO +1 -1
  2. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/v2/README.md +81 -8
  3. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/v2/ingest.py +164 -0
  4. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/pipeline/default_pipeline_impl.py +1 -0
  5. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest.egg-info/PKG-INFO +1 -1
  6. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/LICENSE +0 -0
  7. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/MANIFEST.in +0 -0
  8. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/__init__.py +0 -0
  9. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/__init__.py +0 -0
  10. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/main.py +0 -0
  11. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/tracing.py +0 -0
  12. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/v1/__init__.py +0 -0
  13. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/v1/health.py +0 -0
  14. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/v1/ingest.py +0 -0
  15. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/v1/metrics.py +0 -0
  16. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/api/v2/__init__.py +0 -0
  17. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/__init__.py +0 -0
  18. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/__init__.py +0 -0
  19. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
  20. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
  21. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/execution/options.py +0 -0
  22. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
  23. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/process/dependent_services.py +0 -0
  24. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/process/execution.py +0 -0
  25. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
  26. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/process/strategies.py +0 -0
  27. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/process/termination.py +0 -0
  28. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  29. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  30. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  31. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  32. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  33. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  34. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
  35. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  36. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  37. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  38. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  39. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  40. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
  41. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
  42. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
  43. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  44. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  45. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  46. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  47. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  48. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
  49. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  50. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  51. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  52. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  53. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  54. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  55. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
  56. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  57. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  58. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  59. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  60. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  61. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  62. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  63. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  64. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  65. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  66. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  67. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  68. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
  69. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  70. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
  71. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  72. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  73. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  74. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  75. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  76. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  77. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  78. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  79. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  80. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  81. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  82. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  83. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
  84. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  85. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
  86. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -0
  87. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
  88. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  89. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  90. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  91. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/__init__.py +0 -0
  92. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  93. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  94. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  95. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  96. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  97. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  98. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  99. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  100. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  101. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  102. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  103. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/__init__.py +0 -0
  104. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  105. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  106. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
  107. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/service/__init__.py +0 -0
  108. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  109. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  110. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  111. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  112. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  113. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  114. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  115. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  116. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/pipeline/__init__.py +0 -0
  117. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/pipeline/config/__init__.py +0 -0
  118. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/pipeline/config/loaders.py +0 -0
  119. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/pipeline/config/replica_resolver.py +0 -0
  120. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +0 -0
  121. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
  122. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/pipeline/pipeline_schema.py +0 -0
  123. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest/version.py +0 -0
  124. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest.egg-info/SOURCES.txt +0 -0
  125. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest.egg-info/dependency_links.txt +0 -0
  126. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest.egg-info/requires.txt +0 -0
  127. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/nv_ingest.egg-info/top_level.txt +0 -0
  128. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/pyproject.toml +0 -0
  129. {nv_ingest-2025.10.21.dev20251021 → nv_ingest-2025.10.23.dev20251023}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.10.21.dev20251021
3
+ Version: 2025.10.23.dev20251023
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -40,10 +40,24 @@ This behaviour matches the V1 tracing model and sets the foundation for adding W
40
40
 
41
41
  The fetch endpoint returns a JSON body shaped like the following:
42
42
 
43
- ```
43
+ ```json
44
44
  {
45
45
  "data": [...],
46
46
  "status": "success",
47
+ "trace": {
48
+ "trace::entry::pdf_extractor": 1000,
49
+ "trace::exit::pdf_extractor": 2150,
50
+ "trace::resident_time::pdf_extractor": 250,
51
+ "trace::entry::table_extractor": 1200,
52
+ "trace::exit::table_extractor": 2300,
53
+ "trace::resident_time::table_extractor": 300
54
+ // ... parent-level aggregated traces only (clean, V1-compatible)
55
+ },
56
+ "annotations": {
57
+ "annotation::uuid-1": {"task_id": "pdf_extractor", "task_result": "SUCCESS"},
58
+ "annotation::uuid-2": {"task_id": "table_extractor", "task_result": "SUCCESS"}
59
+ // ... all annotations from all chunks (annotations have unique UUIDs)
60
+ },
47
61
  "metadata": {
48
62
  "parent_job_id": "<uuid>",
49
63
  "total_pages": 320,
@@ -68,9 +82,9 @@ The fetch endpoint returns a JSON body shaped like the following:
68
82
  "chunk_index": 1,
69
83
  "start_page": 1,
70
84
  "end_page": 32,
71
- "trace": {"trace::sink_push": 1.7285796e+18, ...}
85
+ "trace": {"trace::entry::pdf_extractor": 1.7599e18, ...}
72
86
  }
73
- // ...
87
+ // ... per-chunk trace details
74
88
  ],
75
89
  "annotation_segments": [
76
90
  {
@@ -78,17 +92,76 @@ The fetch endpoint returns a JSON body shaped like the following:
78
92
  "chunk_index": 1,
79
93
  "start_page": 1,
80
94
  "end_page": 32,
81
- "annotations": {"annotation::stage": "sink", ...}
95
+ "annotations": {"annotation::uuid": {...}, ...}
96
+ }
97
+ // ... per-chunk annotation details
98
+ ]
99
+ }
100
+ }
101
+ ```
102
+
103
+ **Top-level trace and annotations** (V1 compatibility):
104
+ - `trace`: Contains **only parent-level aggregated traces** for clean V1 compatibility
105
+ - `trace::entry::<stage>` - Earliest entry time across all chunks
106
+ - `trace::exit::<stage>` - Latest exit time across all chunks
107
+ - `trace::resident_time::<stage>` - Sum of all chunk durations (total compute time)
108
+ - `annotations`: Merged annotations from all chunks (annotations have unique UUIDs so merge safely)
109
+ - These fields match V1 structure, allowing existing client code to work without modification
110
+
111
+ **Note:** Chunk-level trace details are available in `metadata.trace_segments[]` for granular analysis
112
+
113
+ **Parent-Level Trace Aggregation:**
114
+
115
+ For split PDFs, parent-level metrics are automatically computed for each stage (including nested stages):
116
+
117
+ - `trace::entry::<stage>` - Earliest entry time across all chunks (when first chunk entered stage)
118
+ - `trace::exit::<stage>` - Latest exit time across all chunks (when last chunk exited stage)
119
+ - `trace::resident_time::<stage>` - Sum of all chunk durations (total compute time in stage)
120
+
121
+ **Supports arbitrary nesting depth:**
122
+ - Simple: `trace::entry::pdf_extractor`
123
+ - Nested: `trace::entry::pdf_extractor::pdf_extraction::pdfium_pages_to_numpy_0`
124
+
125
+ **Example:**
126
+ ```json
127
+ {
128
+ "trace": {
129
+ "trace::entry::pdf_extractor": 1000,
130
+ "trace::exit::pdf_extractor": 2150,
131
+ "trace::resident_time::pdf_extractor": 250
132
+ // ... only parent-level aggregations (clean, concise)
133
+ },
134
+ "metadata": {
135
+ "trace_segments": [
136
+ {
137
+ "chunk_index": 1,
138
+ "start_page": 1,
139
+ "end_page": 32,
140
+ "trace": {
141
+ "trace::entry::pdf_extractor": 1000,
142
+ "trace::exit::pdf_extractor": 1100
143
+ }
144
+ },
145
+ {
146
+ "chunk_index": 2,
147
+ "trace": {
148
+ "trace::entry::pdf_extractor": 2000,
149
+ "trace::exit::pdf_extractor": 2150
150
+ }
82
151
  }
83
- // ...
84
152
  ]
85
153
  }
86
154
  }
87
155
  ```
88
156
 
89
- - `trace_segments` and `annotation_segments` appear only when the sink emits telemetry for a given chunk.
90
- - Clients can correlate chunk data by matching `job_id` or `chunk_index` across `chunks`, `trace_segments`, and `annotation_segments`.
91
- - Failed chunk entries remain in `failed_subjobs`; if a chunk is missing from the telemetry arrays, the sink did not emit trace/annotation payloads for that chunk.
157
+ **Note:** `resident_time` represents total compute time (sum of chunk durations), while `exit - entry` shows wall-clock span.
158
+
159
+ **Detailed metadata** (V2-specific):
160
+ - `trace_segments`: **Chunk-level trace data** with page ranges for granular per-chunk analysis
161
+ - `annotation_segments`: Per-chunk annotation data with page ranges
162
+ - Clients can correlate chunk data by matching `job_id` or `chunk_index` across arrays
163
+ - Failed chunk entries remain in `failed_subjobs`; missing chunks indicate the sink did not emit telemetry
164
+ - **To access chunk traces:** Use `metadata.trace_segments[]` - each segment contains the full trace dict for that chunk
92
165
 
93
166
  ## Testing
94
167
 
@@ -432,6 +432,88 @@ def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, A
432
432
  return trace_dict, annotations_dict
433
433
 
434
434
 
435
+ def _aggregate_parent_traces(chunk_traces: Dict[str, Any]) -> Dict[str, Any]:
436
+ """
437
+ Aggregate chunk-level traces into parent-level metrics.
438
+
439
+ For each stage found in chunk traces:
440
+ - trace::entry::<stage> = min(all chunk entries) - earliest start
441
+ - trace::exit::<stage> = max(all chunk exits) - latest finish
442
+ - trace::resident_time::<stage> = sum(chunk durations) - total compute
443
+
444
+ Parameters
445
+ ----------
446
+ chunk_traces : Dict[str, Any]
447
+ Trace dict with chunk-prefixed keys (chunk_N::trace::entry::stage_name)
448
+
449
+ Returns
450
+ -------
451
+ Dict[str, Any]
452
+ Parent-level aggregated traces (trace::entry::stage_name, etc.)
453
+ """
454
+ # Group by stage: {stage_name: {chunk_idx: {entry: float, exit: float}}}
455
+ stage_data: Dict[str, Dict[int, Dict[str, Any]]] = {}
456
+
457
+ for key, value in chunk_traces.items():
458
+ if not key.startswith("chunk_"):
459
+ continue
460
+
461
+ parts = key.split("::")
462
+ if len(parts) < 4: # Minimum: chunk_N::trace::entry/exit::stage_name
463
+ continue
464
+
465
+ if parts[1] != "trace": # Ensure it's a trace key
466
+ continue
467
+
468
+ chunk_idx_str = parts[0].split("_")[1] # "chunk_1" -> "1"
469
+ try:
470
+ chunk_idx = int(chunk_idx_str)
471
+ except ValueError:
472
+ continue
473
+
474
+ event_type = parts[2] # "entry" or "exit"
475
+
476
+ # Stage name is everything after trace::entry:: or trace::exit::
477
+ # Handles both simple (pdf_extractor) and nested (pdf_extractor::pdf_extraction::pdfium_0)
478
+ stage_name = "::".join(parts[3:]) # Join remaining parts
479
+
480
+ if event_type not in ("entry", "exit"):
481
+ continue
482
+
483
+ if stage_name not in stage_data:
484
+ stage_data[stage_name] = {}
485
+ if chunk_idx not in stage_data[stage_name]:
486
+ stage_data[stage_name][chunk_idx] = {}
487
+
488
+ stage_data[stage_name][chunk_idx][event_type] = value
489
+
490
+ # Compute aggregated metrics
491
+ parent_traces: Dict[str, Any] = {}
492
+
493
+ for stage_name, chunks in stage_data.items():
494
+ entries = []
495
+ exits = []
496
+ durations = []
497
+
498
+ for chunk_data in chunks.values():
499
+ entry = chunk_data.get("entry")
500
+ exit_time = chunk_data.get("exit")
501
+
502
+ # Both entry and exit must exist for valid pair
503
+ if entry is not None and exit_time is not None:
504
+ entries.append(entry)
505
+ exits.append(exit_time)
506
+ durations.append(exit_time - entry)
507
+
508
+ # Only add parent traces if we have valid data
509
+ if entries and exits:
510
+ parent_traces[f"trace::entry::{stage_name}"] = min(entries)
511
+ parent_traces[f"trace::exit::{stage_name}"] = max(exits)
512
+ parent_traces[f"trace::resident_time::{stage_name}"] = sum(durations)
513
+
514
+ return parent_traces
515
+
516
+
435
517
  def _build_aggregated_response(
436
518
  parent_job_id: str,
437
519
  subjob_results: List[Optional[Dict[str, Any]]],
@@ -469,6 +551,9 @@ def _build_aggregated_response(
469
551
  "description": (
470
552
  "One or more subjobs failed to complete" if any_failed else "Aggregated result composed from subjob outputs"
471
553
  ),
554
+ # Top-level trace/annotations for V1 compatibility
555
+ "trace": {},
556
+ "annotations": {},
472
557
  "metadata": {
473
558
  "parent_job_id": parent_job_id,
474
559
  "total_pages": metadata.get("total_pages", len(subjob_ids)),
@@ -498,6 +583,7 @@ def _build_aggregated_response(
498
583
  end_page = descriptor.get("end_page")
499
584
 
500
585
  if trace_data:
586
+ # Add to trace_segments (detailed, per-chunk view)
501
587
  aggregated_result["metadata"]["trace_segments"].append(
502
588
  {
503
589
  "job_id": descriptor.get("job_id"),
@@ -507,8 +593,10 @@ def _build_aggregated_response(
507
593
  "trace": trace_data,
508
594
  }
509
595
  )
596
+ # Chunk traces stay in metadata.trace_segments only (not in top-level)
510
597
 
511
598
  if annotation_data:
599
+ # Add to annotation_segments (detailed, per-chunk view)
512
600
  aggregated_result["metadata"]["annotation_segments"].append(
513
601
  {
514
602
  "job_id": descriptor.get("job_id"),
@@ -518,10 +606,28 @@ def _build_aggregated_response(
518
606
  "annotations": annotation_data,
519
607
  }
520
608
  )
609
+ # Merge into top-level annotations (annotations have unique UUIDs, safe to merge)
610
+ aggregated_result["annotations"].update(annotation_data)
521
611
  else:
522
612
  # Note failed page
523
613
  logger.warning(f"Page {page_num} failed or missing")
524
614
 
615
+ # Compute parent-level trace aggregations from trace_segments
616
+ trace_segments = aggregated_result["metadata"]["trace_segments"]
617
+ if trace_segments:
618
+ # Build a temporary chunk trace dict for aggregation
619
+ temp_chunk_traces = {}
620
+ for segment in trace_segments:
621
+ chunk_idx = segment.get("chunk_index")
622
+ chunk_trace = segment.get("trace", {})
623
+ for trace_key, trace_value in chunk_trace.items():
624
+ prefixed_key = f"chunk_{chunk_idx}::{trace_key}"
625
+ temp_chunk_traces[prefixed_key] = trace_value
626
+
627
+ # Aggregate and set as top-level trace (only parent traces, no chunk traces)
628
+ parent_level_traces = _aggregate_parent_traces(temp_chunk_traces)
629
+ aggregated_result["trace"] = parent_level_traces
630
+
525
631
  return aggregated_result
526
632
 
527
633
 
@@ -566,11 +672,15 @@ async def submit_job_v2(
566
672
  original_source_id = source_ids[0] if source_ids else "unknown_source.pdf"
567
673
  original_source_name = source_names[0] if source_names else "unknown_source.pdf"
568
674
 
675
+ # Track page count for all PDFs (used for both splitting logic and metadata)
676
+ pdf_page_count_cache = None
677
+
569
678
  # Check if this is a PDF that needs splitting
570
679
  if document_types and payloads and document_types[0].lower() == "pdf":
571
680
  # Decode the payload to check page count
572
681
  pdf_content = base64.b64decode(payloads[0])
573
682
  page_count = get_pdf_page_count(pdf_content)
683
+ pdf_page_count_cache = page_count # Cache for later use
574
684
  pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
575
685
 
576
686
  # Split if the document has more pages than our chunk size
@@ -656,6 +766,34 @@ async def submit_job_v2(
656
766
  await ingest_service.submit_job(updated_job_spec, parent_job_id)
657
767
  await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
658
768
 
769
+ # If this was a PDF (even if not split), store page count metadata for tracking
770
+ if pdf_page_count_cache is not None:
771
+ try:
772
+ # Use cached page count from earlier check to avoid re-decoding
773
+ # Store minimal metadata for non-split PDFs (consistent with split PDFs)
774
+ single_pdf_metadata: Dict[str, Any] = {
775
+ "total_pages": pdf_page_count_cache,
776
+ "pages_per_chunk": pdf_page_count_cache, # Single chunk = entire document
777
+ "original_source_id": original_source_id,
778
+ "original_source_name": original_source_name,
779
+ "document_type": document_types[0],
780
+ "subjob_order": [], # No subjobs for non-split PDFs
781
+ }
782
+
783
+ # Store as parent job metadata with empty subjob list for consistency
784
+ await ingest_service.set_parent_job_mapping(
785
+ parent_job_id,
786
+ [], # Empty subjob list
787
+ single_pdf_metadata,
788
+ subjob_descriptors=[],
789
+ )
790
+ logger.debug(
791
+ f"Stored page count metadata for non-split PDF {original_source_name}: {pdf_page_count_cache} pages"
792
+ )
793
+ except Exception as metadata_err:
794
+ # Don't fail the job if metadata storage fails
795
+ logger.warning(f"Failed to store page count metadata for {parent_job_id}: {metadata_err}")
796
+
659
797
  response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
660
798
  return parent_job_id
661
799
 
@@ -792,6 +930,32 @@ async def fetch_job_v2(job_id: str, ingest_service: INGEST_SERVICE_T):
792
930
 
793
931
  logger.debug(f"Parent job {job_id} has {len(subjob_ids)} subjobs")
794
932
 
933
+ # Special case: Non-split PDFs have metadata but no subjobs
934
+ # Fetch the result directly and augment with page count metadata
935
+ if len(subjob_ids) == 0:
936
+ logger.debug(f"Job {job_id} is a non-split PDF, fetching result directly")
937
+ try:
938
+ job_response = await ingest_service.fetch_job(job_id)
939
+
940
+ # Augment response with page count metadata
941
+ if isinstance(job_response, dict):
942
+ if "metadata" not in job_response:
943
+ job_response["metadata"] = {}
944
+ job_response["metadata"]["total_pages"] = metadata.get("total_pages")
945
+ job_response["metadata"]["original_source_id"] = metadata.get("original_source_id")
946
+ job_response["metadata"]["original_source_name"] = metadata.get("original_source_name")
947
+
948
+ # Update job state after successful fetch
949
+ await _update_job_state_after_fetch(job_id, ingest_service)
950
+
951
+ return _stream_json_response(job_response)
952
+ except (TimeoutError, RedisError, ConnectionError):
953
+ logger.debug(f"Job {job_id} (non-split PDF) not ready yet")
954
+ raise HTTPException(status_code=202, detail="Job is processing. Retry later.")
955
+ except Exception as e:
956
+ logger.exception(f"Error fetching non-split PDF job {job_id}: {e}")
957
+ raise HTTPException(status_code=500, detail="Internal server error during job fetch.")
958
+
795
959
  # Build ordered descriptors for subjobs
796
960
  stored_descriptors = subjob_info.get("subjob_descriptors") or []
797
961
  descriptor_lookup = {entry.get("job_id"): entry for entry in stored_descriptors if isinstance(entry, dict)}
@@ -318,6 +318,7 @@ stages:
318
318
  config:
319
319
  api_key: $NGC_API_KEY|$NVIDIA_API_KEY
320
320
  model_name: $VLM_CAPTION_MODEL_NAME|"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"
321
+ endpoint_url: $VLM_CAPTION_ENDPOINT|"http://vlm:8000/v1/chat/completions"
321
322
  prompt: "Caption the content of this image:"
322
323
  replicas:
323
324
  min_replicas: 0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.10.21.dev20251021
3
+ Version: 2025.10.23.dev20251023
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License