nv-ingest 2025.10.27.dev20251027__tar.gz → 2025.10.29.dev20251029__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest might be problematic. Click here for more details.

Files changed (129) hide show
  1. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/PKG-INFO +1 -1
  2. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/v2/README.md +44 -18
  3. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/v2/ingest.py +72 -1
  4. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/process/dependent_services.py +17 -10
  5. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/process/strategies.py +6 -2
  6. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +4 -4
  7. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest.egg-info/PKG-INFO +1 -1
  8. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/LICENSE +0 -0
  9. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/MANIFEST.in +0 -0
  10. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/__init__.py +0 -0
  11. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/__init__.py +0 -0
  12. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/main.py +0 -0
  13. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/tracing.py +0 -0
  14. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/v1/__init__.py +0 -0
  15. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/v1/health.py +0 -0
  16. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/v1/ingest.py +0 -0
  17. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/v1/metrics.py +0 -0
  18. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/api/v2/__init__.py +0 -0
  19. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/__init__.py +0 -0
  20. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/__init__.py +0 -0
  21. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
  22. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
  23. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/execution/options.py +0 -0
  24. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
  25. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/process/execution.py +0 -0
  26. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
  27. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/process/termination.py +0 -0
  28. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  29. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  30. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  31. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  32. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  33. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  34. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +0 -0
  35. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  36. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  37. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  38. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  39. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  40. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
  41. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
  42. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
  43. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  44. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  45. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -0
  46. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  47. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  48. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
  49. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  50. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  51. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  52. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  53. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  54. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  55. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
  56. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  57. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  58. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  59. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  60. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  61. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  62. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  63. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  64. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  65. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  66. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  67. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  68. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
  69. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  70. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +0 -0
  71. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  72. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  73. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  74. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  75. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  76. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  77. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  78. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  79. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  80. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  81. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  82. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  83. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
  84. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  85. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
  86. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +0 -0
  87. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
  88. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  89. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  90. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  91. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/__init__.py +0 -0
  92. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  93. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  94. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  95. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  96. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  97. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  98. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  99. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  100. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  101. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  102. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  103. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/__init__.py +0 -0
  104. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  105. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  106. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
  107. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/service/__init__.py +0 -0
  108. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  109. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  110. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  111. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  112. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  113. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  114. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  115. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/pipeline/__init__.py +0 -0
  116. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/pipeline/config/__init__.py +0 -0
  117. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/pipeline/config/loaders.py +0 -0
  118. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/pipeline/config/replica_resolver.py +0 -0
  119. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +0 -0
  120. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/pipeline/default_pipeline_impl.py +0 -0
  121. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
  122. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/pipeline/pipeline_schema.py +0 -0
  123. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest/version.py +0 -0
  124. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest.egg-info/SOURCES.txt +0 -0
  125. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest.egg-info/dependency_links.txt +0 -0
  126. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest.egg-info/requires.txt +0 -0
  127. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/nv_ingest.egg-info/top_level.txt +0 -0
  128. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/pyproject.toml +0 -0
  129. {nv_ingest-2025.10.27.dev20251027 → nv_ingest-2025.10.29.dev20251029}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.10.27.dev20251027
3
+ Version: 2025.10.29.dev20251029
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -11,15 +11,6 @@ The V2 API introduces automatic PDF splitting at the REST layer to improve proce
11
11
  3. **Transparent Aggregation**: Results are automatically aggregated when fetching parent jobs
12
12
  4. **Backward Compatible**: PDFs with page counts ≤ `PDF_SPLIT_PAGE_COUNT` behave identically to V1
13
13
 
14
- ## Tracing & Aggregated Metadata
15
-
16
- - V2 endpoints open an OpenTelemetry span using the shared `traced_endpoint` decorator. The span name defaults to the function name, or can be overridden when applying the decorator.
17
- - `submit_job_v2` records the parent span's `trace_id` into each subjob's `tracing_options`, enabling downstream Ray stages (e.g., the message broker sink) to attach chunk-level telemetry consistently.
18
- - Response headers still return `x-trace-id` derived from the active span context, allowing clients to correlate downstream work.
19
- - When `/v2/fetch_job/{parent_id}` aggregates completed chunks, it captures any `trace` / `annotations` dictionaries emitted by the sink for each subjob and includes them in the response payload (see "Aggregated response" below).
20
-
21
- This behaviour matches the V1 tracing model and sets the foundation for adding W3C `traceparent` propagation in future changes.
22
-
23
14
  ## How It Works
24
15
 
25
16
  1. **Submit**: When a PDF with pages exceeding `PDF_SPLIT_PAGE_COUNT` is submitted to `/v2/submit_job`:
@@ -36,6 +27,33 @@ This behaviour matches the V1 tracing model and sets the foundation for adding W
36
27
  - Pending work returns 202 (processing)
37
28
  - Failed chunks are noted without failing the entire job; metadata records which chunks failed
38
29
 
30
+
31
+ ## Client Library Features
32
+
33
+ ### Accessing Trace Metrics
34
+
35
+ The Python client library provides convenient access to trace metrics via the `return_traces` parameter:
36
+
37
+ ```python
38
+ from nv_ingest_client.client import Ingestor
39
+
40
+ ingestor = Ingestor(
41
+ message_client_hostname="localhost",
42
+ message_client_port=7670,
43
+ message_client_kwargs={"api_version": "v2"}
44
+ ).files("/path/to/pdfs").extract().embed()
45
+
46
+ # Get results with trace metrics
47
+ results, traces = ingestor.ingest(return_traces=True)
48
+
49
+ # Access timing for first document
50
+ pdf_time = traces[0]["trace::resident_time::pdf_extractor"] / 1e9
51
+ table_time = traces[0]["trace::resident_time::table_extractor"] / 1e9
52
+ print(f"PDF: {pdf_time:.2f}s, Tables: {table_time:.2f}s")
53
+ ```
54
+
55
+ **Note:** For split PDFs, `resident_time` represents aggregated compute time across all chunks. For non-split PDFs, it is computed client-side from entry/exit pairs.
56
+
39
57
  ### Aggregated response
40
58
 
41
59
  The fetch endpoint returns a JSON body shaped like the following:
@@ -163,15 +181,23 @@ For split PDFs, parent-level metrics are automatically computed for each stage (
163
181
  - Failed chunk entries remain in `failed_subjobs`; missing chunks indicate the sink did not emit telemetry
164
182
  - **To access chunk traces:** Use `metadata.trace_segments[]` - each segment contains the full trace dict for that chunk
165
183
 
166
- ## Testing
184
+ ### Advanced: Accessing Full Metadata
167
185
 
168
- Use the V2 test script with environment variable:
169
- ```bash
170
- # Run with V2 endpoints
171
- DATASET_DIR=/data/splits python scripts/tests/cases/dc20_v2_e2e.py
172
- ```
186
+ For advanced use cases requiring a per-chunk trace breakdown or the full metadata, use `include_parent_trace_ids`:
187
+
188
+ ```python
189
+ results, traces, parent_trace_ids = ingestor.ingest(
190
+ return_traces=True,
191
+ include_parent_trace_ids=True
192
+ )
193
+
194
+ # Fetch full parent job metadata (including trace_segments)
195
+ import requests
196
+ response = requests.get(f"http://localhost:7670/v2/fetch_job/{parent_trace_ids[0]}")
197
+ metadata = response.json()["metadata"]
173
198
 
174
- Or set the API version for any existing code:
175
- ```bash
176
- export NV_INGEST_API_VERSION=v2
199
+ # Access per-chunk traces
200
+ for segment in metadata["trace_segments"]:
201
+ print(f"Chunk {segment['chunk_index']}: pages {segment['start_page']}-{segment['end_page']}")
202
+ print(f" Traces: {len(segment['trace'])} entries")
177
203
  ```
@@ -432,6 +432,76 @@ def _extract_ray_telemetry(result: Dict[str, Any]) -> Tuple[Optional[Dict[str, A
432
432
  return trace_dict, annotations_dict
433
433
 
434
434
 
435
+ def _normalize_chunk_records(
436
+ records: Optional[List[Any]],
437
+ descriptor: Dict[str, Any],
438
+ parent_metadata: Dict[str, Any],
439
+ ) -> List[Any]:
440
+ """Re-map chunk-local metadata to document-level context for aggregation."""
441
+
442
+ if not isinstance(records, list):
443
+ return []
444
+
445
+ total_pages = parent_metadata.get("total_pages")
446
+ original_source_id = parent_metadata.get("original_source_id")
447
+ original_source_name = parent_metadata.get("original_source_name")
448
+
449
+ start_page = descriptor.get("start_page")
450
+ page_offset = start_page - 1 if isinstance(start_page, int) and start_page > 0 else 0
451
+
452
+ normalized_entries: List[Any] = []
453
+
454
+ for entry in records:
455
+ if not isinstance(entry, dict):
456
+ normalized_entries.append(entry)
457
+ continue
458
+
459
+ normalized_entry = entry.copy()
460
+ original_metadata = entry.get("metadata")
461
+
462
+ if isinstance(original_metadata, dict):
463
+ normalized_metadata = original_metadata.copy()
464
+ normalized_entry["metadata"] = normalized_metadata
465
+
466
+ original_source_meta = original_metadata.get("source_metadata")
467
+ if isinstance(original_source_meta, dict):
468
+ normalized_source_meta = original_source_meta.copy()
469
+ normalized_metadata["source_metadata"] = normalized_source_meta
470
+
471
+ if original_source_id:
472
+ normalized_source_meta["source_id"] = original_source_id
473
+ if original_source_name:
474
+ normalized_source_meta["source_name"] = original_source_name
475
+
476
+ original_content_meta = original_metadata.get("content_metadata")
477
+ if isinstance(original_content_meta, dict):
478
+ normalized_content_meta = original_content_meta.copy()
479
+ normalized_metadata["content_metadata"] = normalized_content_meta
480
+
481
+ page_number = normalized_content_meta.get("page_number")
482
+ if isinstance(page_number, int) and page_number >= 0:
483
+ normalized_content_meta["page_number"] = page_number + page_offset
484
+
485
+ if isinstance(total_pages, int) and isinstance(normalized_content_meta.get("page_count"), int):
486
+ # Ensure optional per-record page count reflects the full document
487
+ normalized_content_meta["page_count"] = total_pages
488
+
489
+ original_hierarchy = original_content_meta.get("hierarchy")
490
+ if isinstance(original_hierarchy, dict):
491
+ normalized_hierarchy = original_hierarchy.copy()
492
+ normalized_content_meta["hierarchy"] = normalized_hierarchy
493
+
494
+ hierarchy_page = normalized_hierarchy.get("page")
495
+ if isinstance(hierarchy_page, int) and hierarchy_page >= 0:
496
+ normalized_hierarchy["page"] = hierarchy_page + page_offset
497
+ if isinstance(total_pages, int):
498
+ normalized_hierarchy["page_count"] = total_pages
499
+
500
+ normalized_entries.append(normalized_entry)
501
+
502
+ return normalized_entries
503
+
504
+
435
505
  def _aggregate_parent_traces(chunk_traces: Dict[str, Any]) -> Dict[str, Any]:
436
506
  """
437
507
  Aggregate chunk-level traces into parent-level metrics.
@@ -574,7 +644,8 @@ def _build_aggregated_response(
574
644
  if result is not None:
575
645
  # Add page data to aggregated result
576
646
  if "data" in result:
577
- aggregated_result["data"].extend(result["data"])
647
+ normalized_records = _normalize_chunk_records(result.get("data"), descriptor, metadata)
648
+ aggregated_result["data"].extend(normalized_records)
578
649
  chunk_entry = dict(descriptor)
579
650
  aggregated_result["metadata"]["chunks"].append(chunk_entry)
580
651
 
@@ -18,6 +18,18 @@ from nv_ingest_api.util.message_brokers.simple_message_broker.broker import Simp
18
18
  logger = logging.getLogger(__name__)
19
19
 
20
20
 
21
+ def _broker_server_target(host, port, max_queue_size):
22
+ """
23
+ Target function to be run in a separate process for the SimpleMessageBroker.
24
+ """
25
+ server = SimpleMessageBroker(host, port, max_queue_size)
26
+ try:
27
+ server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
28
+ except Exception:
29
+ pass
30
+ server.serve_forever()
31
+
32
+
21
33
  def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
22
34
  """
23
35
  Starts a SimpleMessageBroker server in a separate process.
@@ -58,16 +70,11 @@ def start_simple_message_broker(broker_client: dict) -> multiprocessing.Process:
58
70
  f"continuing to spawn a broker process (tests expect a Process to be returned)"
59
71
  )
60
72
 
61
- def broker_server():
62
- # Optionally, set socket options here for reuse (note: binding occurs in server __init__).
63
- server = SimpleMessageBroker(server_host, server_port, max_queue_size)
64
- try:
65
- server.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
66
- except Exception:
67
- pass
68
- server.serve_forever()
69
-
70
- p = multiprocessing.Process(target=broker_server)
73
+ p = multiprocessing.Process(
74
+ target=_broker_server_target,
75
+ args=(server_host, server_port, max_queue_size),
76
+ daemon=True,
77
+ )
71
78
  # If we're launching from inside the pipeline subprocess, mark daemon so the
72
79
  # broker dies automatically when the subprocess exits.
73
80
  p.daemon = os.environ.get("NV_INGEST_BROKER_IN_SUBPROCESS") == "1"
@@ -11,9 +11,10 @@ Strategy pattern for clean separation of execution concerns.
11
11
  """
12
12
 
13
13
  import atexit
14
- import os
15
14
  import logging
16
15
  import multiprocessing
16
+ import os
17
+ import sys
17
18
  import time
18
19
  from abc import ABC, abstractmethod
19
20
 
@@ -132,7 +133,10 @@ class SubprocessStrategy(ProcessExecutionStrategy):
132
133
  logger.info("Launching pipeline in Python subprocess using multiprocessing.")
133
134
 
134
135
  # Create subprocess using the fork context (spawn on macOS)
135
- ctx = multiprocessing.get_context("fork")
136
+ start_method = "fork"
137
+ if sys.platform.lower() == "darwin":
138
+ start_method = "spawn"
139
+ ctx = multiprocessing.get_context(start_method)
136
140
  process = ctx.Process(
137
141
  target=run_pipeline_process,
138
142
  args=(
@@ -501,21 +501,21 @@ class RedisIngestService(IngestServiceMeta):
501
501
  metadata_key = f"parent:{parent_job_id}:metadata"
502
502
 
503
503
  try:
504
- # Check if this is a parent job
504
+ # Check if this is a parent job (check metadata_key since non-split PDFs may not have parent_key)
505
505
  exists = await self._run_bounded_to_thread(
506
506
  self._ingest_client.get_client().exists,
507
- parent_key,
507
+ metadata_key, # Check metadata instead of parent_key for non-split PDF support
508
508
  )
509
509
 
510
510
  if not exists:
511
511
  return None
512
512
 
513
- # Get subjob IDs
513
+ # Get subjob IDs (may be empty for non-split PDFs)
514
514
  subjob_ids_bytes = await self._run_bounded_to_thread(
515
515
  self._ingest_client.get_client().smembers,
516
516
  parent_key,
517
517
  )
518
- subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes}
518
+ subjob_id_set = {id.decode("utf-8") for id in subjob_ids_bytes} if subjob_ids_bytes else set()
519
519
 
520
520
  # Get metadata
521
521
  metadata_dict = await self._run_bounded_to_thread(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest
3
- Version: 2025.10.27.dev20251027
3
+ Version: 2025.10.29.dev20251029
4
4
  Summary: Python module for multimodal document ingestion
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License