nv-ingest 2025.10.20.dev20251020__tar.gz → 2025.10.22.dev20251022__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/PKG-INFO +1 -1
  2. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/v2/README.md +81 -8
  3. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/v2/ingest.py +106 -0
  4. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest.egg-info/PKG-INFO +1 -1
  5. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/LICENSE +0 -0
  6. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/MANIFEST.in +0 -0
  7. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/__init__.py +0 -0
  8. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/__init__.py +0 -0
  9. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/main.py +0 -0
  10. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/tracing.py +0 -0
  11. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/v1/__init__.py +0 -0
  12. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/v1/health.py +0 -0
  13. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/v1/ingest.py +0 -0
  14. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/v1/metrics.py +0 -0
  15. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/api/v2/__init__.py +0 -0
  16. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/__init__.py +0 -0
  17. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/__init__.py +0 -0
  18. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
  19. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
  20. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/execution/options.py +0 -0
  21. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
  22. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/process/dependent_services.py +0 -0
  23. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/process/execution.py +0 -0
  24. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
  25. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/process/strategies.py +0 -0
  26. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/process/termination.py +0 -0
  27. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  28. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  29. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  30. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  31. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  32. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  33. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
  34. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  35. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  36. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  37. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  38. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  39. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
  40. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
  41. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
  42. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  43. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  44. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  45. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  46. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  47. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
  48. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  49. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  50. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  51. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  52. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  53. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  54. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
  55. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  56. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  57. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  58. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  59. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  60. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  61. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  62. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  63. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  64. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  65. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  66. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  67. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
  68. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  69. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
  70. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  71. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  72. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  73. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  74. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  75. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  76. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  77. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  78. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  79. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  80. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  81. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  82. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
  83. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  84. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
  85. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -0
  86. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
  87. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  88. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  89. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  90. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/__init__.py +0 -0
  91. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  92. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  93. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  94. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  95. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  96. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  97. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  98. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  99. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  100. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  101. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  102. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/__init__.py +0 -0
  103. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  104. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  105. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
  106. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/service/__init__.py +0 -0
  107. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  108. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  109. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  110. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  111. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  112. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  113. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  114. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  115. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/pipeline/__init__.py +0 -0
  116. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/pipeline/config/__init__.py +0 -0
  117. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/pipeline/config/loaders.py +0 -0
  118. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/pipeline/config/replica_resolver.py +0 -0
  119. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +0 -0
  120. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/pipeline/default_pipeline_impl.py +0 -0
  121. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
  122. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/pipeline/pipeline_schema.py +0 -0
  123. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest/version.py +0 -0
  124. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest.egg-info/SOURCES.txt +0 -0
  125. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest.egg-info/dependency_links.txt +0 -0
  126. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest.egg-info/requires.txt +0 -0
  127. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/nv_ingest.egg-info/top_level.txt +0 -0
  128. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/pyproject.toml +0 -0
  129. {nv_ingest-2025.10.20.dev20251020 → nv_ingest-2025.10.22.dev20251022}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.10.20.dev20251020
3
+ Version: 2025.10.22.dev20251022
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -40,10 +40,24 @@ This behaviour matches the V1 tracing model and sets the foundation for adding W
40
40
 
41
41
  The fetch endpoint returns a JSON body shaped like the following:
42
42
 
43
- ```
43
+ ```json
44
44
  {
45
45
  "data": [...],
46
46
  "status": "success",
47
+ "trace": {
48
+ "trace::entry::pdf_extractor": 1000,
49
+ "trace::exit::pdf_extractor": 2150,
50
+ "trace::resident_time::pdf_extractor": 250,
51
+ "trace::entry::table_extractor": 1200,
52
+ "trace::exit::table_extractor": 2300,
53
+ "trace::resident_time::table_extractor": 300
54
+ // ... parent-level aggregated traces only (clean, V1-compatible)
55
+ },
56
+ "annotations": {
57
+ "annotation::uuid-1": {"task_id": "pdf_extractor", "task_result": "SUCCESS"},
58
+ "annotation::uuid-2": {"task_id": "table_extractor", "task_result": "SUCCESS"}
59
+ // ... all annotations from all chunks (annotations have unique UUIDs)
60
+ },
47
61
  "metadata": {
48
62
  "parent_job_id": "<uuid>",
49
63
  "total_pages": 320,
@@ -68,9 +82,9 @@ The fetch endpoint returns a JSON body shaped like the following:
68
82
  "chunk_index": 1,
69
83
  "start_page": 1,
70
84
  "end_page": 32,
71
- "trace": {"trace::sink_push": 1.7285796e+18, ...}
85
+ "trace": {"trace::entry::pdf_extractor": 1.7599e18, ...}
72
86
  }
73
- // ...
87
+ // ... per-chunk trace details
74
88
  ],
75
89
  "annotation_segments": [
76
90
  {
@@ -78,17 +92,76 @@ The fetch endpoint returns a JSON body shaped like the following:
78
92
  "chunk_index": 1,
79
93
  "start_page": 1,
80
94
  "end_page": 32,
81
- "annotations": {"annotation::stage": "sink", ...}
95
+ "annotations": {"annotation::uuid": {...}, ...}
96
+ }
97
+ // ... per-chunk annotation details
98
+ ]
99
+ }
100
+ }
101
+ ```
102
+
103
+ **Top-level trace and annotations** (V1 compatibility):
104
+ - `trace`: Contains **only parent-level aggregated traces** for clean V1 compatibility
105
+ - `trace::entry::<stage>` - Earliest entry time across all chunks
106
+ - `trace::exit::<stage>` - Latest exit time across all chunks
107
+ - `trace::resident_time::<stage>` - Sum of all chunk durations (total compute time)
108
+ - `annotations`: Merged annotations from all chunks (annotations have unique UUIDs, so they merge safely)
109
+ - These fields match V1 structure, allowing existing client code to work without modification
110
+
111
+ **Note:** Chunk-level trace details are available in `metadata.trace_segments[]` for granular analysis
112
+
113
+ **Parent-Level Trace Aggregation:**
114
+
115
+ For split PDFs, parent-level metrics are automatically computed for each stage (including nested stages):
116
+
117
+ - `trace::entry::<stage>` - Earliest entry time across all chunks (when first chunk entered stage)
118
+ - `trace::exit::<stage>` - Latest exit time across all chunks (when last chunk exited stage)
119
+ - `trace::resident_time::<stage>` - Sum of all chunk durations (total compute time in stage)
120
+
121
+ **Supports arbitrary nesting depth:**
122
+ - Simple: `trace::entry::pdf_extractor`
123
+ - Nested: `trace::entry::pdf_extractor::pdf_extraction::pdfium_pages_to_numpy_0`
124
+
125
+ **Example:**
126
+ ```json
127
+ {
128
+ "trace": {
129
+ "trace::entry::pdf_extractor": 1000,
130
+ "trace::exit::pdf_extractor": 2150,
131
+ "trace::resident_time::pdf_extractor": 250
132
+ // ... only parent-level aggregations (clean, concise)
133
+ },
134
+ "metadata": {
135
+ "trace_segments": [
136
+ {
137
+ "chunk_index": 1,
138
+ "start_page": 1,
139
+ "end_page": 32,
140
+ "trace": {
141
+ "trace::entry::pdf_extractor": 1000,
142
+ "trace::exit::pdf_extractor": 1100
143
+ }
144
+ },
145
+ {
146
+ "chunk_index": 2,
147
+ "trace": {
148
+ "trace::entry::pdf_extractor": 2000,
149
+ "trace::exit::pdf_extractor": 2150
150
+ }
82
151
  }
83
- // ...
84
152
  ]
85
153
  }
86
154
  }
87
155
  ```
88
156
 
89
- - `trace_segments` and `annotation_segments` appear only when the sink emits telemetry for a given chunk.
90
- - Clients can correlate chunk data by matching `job_id` or `chunk_index` across `chunks`, `trace_segments`, and `annotation_segments`.
91
- - Failed chunk entries remain in `failed_subjobs`; if a chunk is missing from the telemetry arrays, the sink did not emit trace/annotation payloads for that chunk.
157
+ **Note:** `resident_time` represents total compute time (sum of chunk durations), while `exit - entry` shows wall-clock span.
158
+
159
+ **Detailed metadata** (V2-specific):
160
+ - `trace_segments`: **Chunk-level trace data** with page ranges for granular per-chunk analysis
161
+ - `annotation_segments`: Per-chunk annotation data with page ranges
162
+ - Clients can correlate chunk data by matching `job_id` or `chunk_index` across arrays
163
+ - Failed chunk entries remain in `failed_subjobs`; missing chunks indicate the sink did not emit telemetry
164
+ - **To access chunk traces:** Use `metadata.trace_segments[]` - each segment contains the full trace dict for that chunk
92
165
 
93
166
  ## Testing
94
167
 
@@ -432,6 +432,88 @@ def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, A
432
432
  return trace_dict, annotations_dict
433
433
 
434
434
 
435
+ def _aggregate_parent_traces(chunk_traces: Dict[str, Any]) -> Dict[str, Any]:
436
+ """
437
+ Aggregate chunk-level traces into parent-level metrics.
438
+
439
+ For each stage found in chunk traces:
440
+ - trace::entry::<stage> = min(all chunk entries) - earliest start
441
+ - trace::exit::<stage> = max(all chunk exits) - latest finish
442
+ - trace::resident_time::<stage> = sum(chunk durations) - total compute
443
+
444
+ Parameters
445
+ ----------
446
+ chunk_traces : Dict[str, Any]
447
+ Trace dict with chunk-prefixed keys (chunk_N::trace::entry::stage_name)
448
+
449
+ Returns
450
+ -------
451
+ Dict[str, Any]
452
+ Parent-level aggregated traces (trace::entry::stage_name, etc.)
453
+ """
454
+ # Group by stage: {stage_name: {chunk_idx: {entry: float, exit: float}}}
455
+ stage_data: Dict[str, Dict[int, Dict[str, Any]]] = {}
456
+
457
+ for key, value in chunk_traces.items():
458
+ if not key.startswith("chunk_"):
459
+ continue
460
+
461
+ parts = key.split("::")
462
+ if len(parts) < 4: # Minimum: chunk_N::trace::entry/exit::stage_name
463
+ continue
464
+
465
+ if parts[1] != "trace": # Ensure it's a trace key
466
+ continue
467
+
468
+ chunk_idx_str = parts[0].split("_")[1] # "chunk_1" -> "1"
469
+ try:
470
+ chunk_idx = int(chunk_idx_str)
471
+ except ValueError:
472
+ continue
473
+
474
+ event_type = parts[2] # "entry" or "exit"
475
+
476
+ # Stage name is everything after trace::entry:: or trace::exit::
477
+ # Handles both simple (pdf_extractor) and nested (pdf_extractor::pdf_extraction::pdfium_0)
478
+ stage_name = "::".join(parts[3:]) # Join remaining parts
479
+
480
+ if event_type not in ("entry", "exit"):
481
+ continue
482
+
483
+ if stage_name not in stage_data:
484
+ stage_data[stage_name] = {}
485
+ if chunk_idx not in stage_data[stage_name]:
486
+ stage_data[stage_name][chunk_idx] = {}
487
+
488
+ stage_data[stage_name][chunk_idx][event_type] = value
489
+
490
+ # Compute aggregated metrics
491
+ parent_traces: Dict[str, Any] = {}
492
+
493
+ for stage_name, chunks in stage_data.items():
494
+ entries = []
495
+ exits = []
496
+ durations = []
497
+
498
+ for chunk_data in chunks.values():
499
+ entry = chunk_data.get("entry")
500
+ exit_time = chunk_data.get("exit")
501
+
502
+ # Both entry and exit must exist for valid pair
503
+ if entry is not None and exit_time is not None:
504
+ entries.append(entry)
505
+ exits.append(exit_time)
506
+ durations.append(exit_time - entry)
507
+
508
+ # Only add parent traces if we have valid data
509
+ if entries and exits:
510
+ parent_traces[f"trace::entry::{stage_name}"] = min(entries)
511
+ parent_traces[f"trace::exit::{stage_name}"] = max(exits)
512
+ parent_traces[f"trace::resident_time::{stage_name}"] = sum(durations)
513
+
514
+ return parent_traces
515
+
516
+
435
517
  def _build_aggregated_response(
436
518
  parent_job_id: str,
437
519
  subjob_results: List[Optional[Dict[str, Any]]],
@@ -469,6 +551,9 @@ def _build_aggregated_response(
469
551
  "description": (
470
552
  "One or more subjobs failed to complete" if any_failed else "Aggregated result composed from subjob outputs"
471
553
  ),
554
+ # Top-level trace/annotations for V1 compatibility
555
+ "trace": {},
556
+ "annotations": {},
472
557
  "metadata": {
473
558
  "parent_job_id": parent_job_id,
474
559
  "total_pages": metadata.get("total_pages", len(subjob_ids)),
@@ -498,6 +583,7 @@ def _build_aggregated_response(
498
583
  end_page = descriptor.get("end_page")
499
584
 
500
585
  if trace_data:
586
+ # Add to trace_segments (detailed, per-chunk view)
501
587
  aggregated_result["metadata"]["trace_segments"].append(
502
588
  {
503
589
  "job_id": descriptor.get("job_id"),
@@ -507,8 +593,10 @@ def _build_aggregated_response(
507
593
  "trace": trace_data,
508
594
  }
509
595
  )
596
+ # Chunk traces stay in metadata.trace_segments only (not in top-level)
510
597
 
511
598
  if annotation_data:
599
+ # Add to annotation_segments (detailed, per-chunk view)
512
600
  aggregated_result["metadata"]["annotation_segments"].append(
513
601
  {
514
602
  "job_id": descriptor.get("job_id"),
@@ -518,10 +606,28 @@ def _build_aggregated_response(
518
606
  "annotations": annotation_data,
519
607
  }
520
608
  )
609
+ # Merge into top-level annotations (annotations have unique UUIDs, safe to merge)
610
+ aggregated_result["annotations"].update(annotation_data)
521
611
  else:
522
612
  # Note failed page
523
613
  logger.warning(f"Page {page_num} failed or missing")
524
614
 
615
+ # Compute parent-level trace aggregations from trace_segments
616
+ trace_segments = aggregated_result["metadata"]["trace_segments"]
617
+ if trace_segments:
618
+ # Build a temporary chunk trace dict for aggregation
619
+ temp_chunk_traces = {}
620
+ for segment in trace_segments:
621
+ chunk_idx = segment.get("chunk_index")
622
+ chunk_trace = segment.get("trace", {})
623
+ for trace_key, trace_value in chunk_trace.items():
624
+ prefixed_key = f"chunk_{chunk_idx}::{trace_key}"
625
+ temp_chunk_traces[prefixed_key] = trace_value
626
+
627
+ # Aggregate and set as top-level trace (only parent traces, no chunk traces)
628
+ parent_level_traces = _aggregate_parent_traces(temp_chunk_traces)
629
+ aggregated_result["trace"] = parent_level_traces
630
+
525
631
  return aggregated_result
526
632
 
527
633
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.10.20.dev20251020
3
+ Version: 2025.10.22.dev20251022
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License