nv-ingest 2025.11.9.dev20251109.tar.gz → 2025.12.15.dev20251215.tar.gz

This diff compares the contents of two publicly released versions of this package, as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the registry.
Files changed (131)
  1. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/PKG-INFO +3 -2
  2. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/main.py +1 -1
  3. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/v2/ingest.py +171 -61
  4. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/process/execution.py +6 -0
  5. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py +9 -9
  6. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py +0 -2
  7. nv_ingest-2025.12.15.dev20251215/nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py +71 -0
  8. nv_ingest-2025.12.15.dev20251215/nv_ingest/framework/orchestration/ray/stages/meta/udf_parallel_helper.py +64 -0
  9. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/storage/image_storage.py +72 -6
  10. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/pipeline/pipeline_runners.py +40 -0
  11. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/pipeline/config/replica_resolver.py +12 -2
  12. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/pipeline/default_libmode_pipeline_impl.py +25 -10
  13. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/pipeline/default_pipeline_impl.py +50 -8
  14. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest.egg-info/PKG-INFO +3 -2
  15. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest.egg-info/SOURCES.txt +2 -0
  16. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest.egg-info/requires.txt +2 -1
  17. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/pyproject.toml +2 -1
  18. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/LICENSE +0 -0
  19. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/MANIFEST.in +0 -0
  20. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/__init__.py +0 -0
  21. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/__init__.py +0 -0
  22. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/tracing.py +0 -0
  23. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/v1/__init__.py +0 -0
  24. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/v1/health.py +0 -0
  25. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/v1/ingest.py +0 -0
  26. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/v1/metrics.py +0 -0
  27. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/v2/README.md +0 -0
  28. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/api/v2/__init__.py +0 -0
  29. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/__init__.py +0 -0
  30. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/__init__.py +0 -0
  31. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/execution/__init__.py +0 -0
  32. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/execution/helpers.py +0 -0
  33. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/execution/options.py +0 -0
  34. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/process/__init__.py +0 -0
  35. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/process/dependent_services.py +0 -0
  36. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/process/lifecycle.py +0 -0
  37. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/process/strategies.py +0 -0
  38. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/process/termination.py +0 -0
  39. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/__init__.py +0 -0
  40. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/edges/__init__.py +0 -0
  41. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/edges/async_queue_edge.py +0 -0
  42. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/edges/ray_queue_edge.py +0 -0
  43. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/edges/threaded_queue_edge.py +0 -0
  44. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/examples/__init__.py +0 -0
  45. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/examples/task_source_harness.py +0 -0
  46. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/examples/task_source_sink_harness.py +0 -0
  47. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/primitives/__init__.py +0 -0
  48. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/primitives/dataclasses.py +0 -0
  49. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/primitives/pipeline_monitor.py +0 -0
  50. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/primitives/pipeline_topology.py +0 -0
  51. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/primitives/ray_pipeline.py +0 -0
  52. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/primitives/ray_stat_collector.py +0 -0
  53. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/__init__.py +0 -0
  54. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/__init__.py +0 -0
  55. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/chart_extractor.py +0 -0
  56. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/docx_extractor.py +0 -0
  57. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/html_extractor.py +0 -0
  58. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/image_extractor.py +0 -0
  59. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/infographic_extractor.py +0 -0
  60. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/pdf_extractor.py +0 -0
  61. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/pptx_extractor.py +0 -0
  62. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/extractors/table_extractor.py +0 -0
  63. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/injectors/__init__.py +0 -0
  64. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/injectors/metadata_injector.py +0 -0
  65. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/meta/__init__.py +0 -0
  66. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_edge_base.py +0 -0
  67. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_sink_stage_base.py +0 -0
  68. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_source_stage_base.py +0 -0
  69. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/meta/ray_actor_stage_base.py +0 -0
  70. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/mutate/__init__.py +0 -0
  71. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/mutate/image_dedup.py +0 -0
  72. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/mutate/image_filter.py +0 -0
  73. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/sinks/__init__.py +0 -0
  74. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/sinks/default_drain.py +0 -0
  75. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/sinks/message_broker_task_sink.py +0 -0
  76. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/sources/__init__.py +0 -0
  77. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/sources/message_broker_task_source.py +0 -0
  78. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/storage/__init__.py +0 -0
  79. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/storage/store_embeddings.py +0 -0
  80. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/telemetry/__init__.py +0 -0
  81. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/telemetry/job_counter.py +0 -0
  82. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_meter.py +0 -0
  83. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/telemetry/otel_tracer.py +0 -0
  84. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/transforms/__init__.py +0 -0
  85. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/transforms/image_caption.py +0 -0
  86. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/transforms/text_embed.py +0 -0
  87. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/transforms/text_splitter.py +0 -0
  88. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/utility/__init__.py +0 -0
  89. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/stages/utility/throughput_monitor.py +0 -0
  90. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/__init__.py +0 -0
  91. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/env_config.py +0 -0
  92. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/pipeline/__init__.py +0 -0
  93. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/pipeline/pid_controller.py +0 -0
  94. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/pipeline/tools.py +0 -0
  95. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/system_tools/__init__.py +0 -0
  96. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/system_tools/memory.py +0 -0
  97. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/orchestration/ray/util/system_tools/visualizers.py +0 -0
  98. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/__init__.py +0 -0
  99. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_ingest_config_schema.py +0 -0
  100. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_job_counter_schema.py +0 -0
  101. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_message_broker_sink_schema.py +0 -0
  102. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_message_broker_source_schema.py +0 -0
  103. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_message_wrapper_schema.py +0 -0
  104. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_metadata_injector_schema.py +0 -0
  105. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_otel_meter_schema.py +0 -0
  106. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_otel_tracer_schema.py +0 -0
  107. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_processing_job_schema.py +0 -0
  108. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_task_injection_schema.py +0 -0
  109. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/schemas/framework_vdb_task_sink_schema.py +0 -0
  110. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/__init__.py +0 -0
  111. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/flow_control/__init__.py +0 -0
  112. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/flow_control/filter_by_task.py +0 -0
  113. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/flow_control/udf_intercept.py +0 -0
  114. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/service/__init__.py +0 -0
  115. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/service/impl/__init__.py +0 -0
  116. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/service/impl/ingest/__init__.py +0 -0
  117. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/service/impl/ingest/redis_ingest_service.py +0 -0
  118. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/service/meta/__init__.py +0 -0
  119. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/service/meta/ingest/__init__.py +0 -0
  120. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/service/meta/ingest/ingest_service_meta.py +0 -0
  121. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/telemetry/__init__.py +0 -0
  122. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/framework/util/telemetry/global_stats.py +0 -0
  123. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/pipeline/__init__.py +0 -0
  124. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/pipeline/config/__init__.py +0 -0
  125. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/pipeline/config/loaders.py +0 -0
  126. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/pipeline/ingest_pipeline.py +0 -0
  127. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/pipeline/pipeline_schema.py +0 -0
  128. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest/version.py +0 -0
  129. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest.egg-info/dependency_links.txt +0 -0
  130. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/nv_ingest.egg-info/top_level.txt +0 -0
  131. {nv_ingest-2025.11.9.dev20251109 → nv_ingest-2025.12.15.dev20251215}/setup.cfg +0 -0
--- nv_ingest-2025.11.9.dev20251109/PKG-INFO
+++ nv_ingest-2025.12.15.dev20251215/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: nv-ingest
-Version: 2025.11.9.dev20251109
+Version: 2025.12.15.dev20251215
 Summary: Python module for multimodal document ingestion
 Author-email: Jeremy Dyer <jdyer@nvidia.com>
 License: Apache License
@@ -219,6 +219,8 @@ Requires-Dist: diskcache>=5.6.3
 Requires-Dist: fastapi>=0.115.6
 Requires-Dist: fastparquet>=2024.11.0
 Requires-Dist: fsspec>=2024.10.0
+Requires-Dist: universal_pathlib>=0.2.6
+Requires-Dist: s3fs>=2024.10.0
 Requires-Dist: gunicorn
 Requires-Dist: h11>=0.16.0
 Requires-Dist: httpx>=0.28.1
@@ -226,7 +228,6 @@ Requires-Dist: isodate>=0.7.2
 Requires-Dist: langdetect>=1.0.9
 Requires-Dist: minio>=7.2.12
 Requires-Dist: librosa>=0.10.2
-Requires-Dist: openai>=1.82.0
 Requires-Dist: opentelemetry-api>=1.27.0
 Requires-Dist: opentelemetry-exporter-otlp>=1.27.0
 Requires-Dist: opentelemetry-sdk>=1.27.0
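
The dependency changes track the new remote-path handling in nv_ingest/api/v2/ingest.py further down in this diff: universal_pathlib and s3fs extend fsspec so the same path code can address local files and object storage, while the direct openai dependency is dropped. A minimal sketch of what the new pair enables; the bucket and key here are hypothetical, not taken from the package:

    import fsspec
    from upath import UPath  # provided by universal_pathlib

    # UPath layers pathlib semantics over fsspec filesystems; with s3fs
    # installed, s3:// URLs resolve much like local paths.
    doc = UPath("s3://example-bucket/incoming/report.pdf")
    with fsspec.open(str(doc), "rb") as f:
        pdf_bytes = f.read()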
--- nv_ingest-2025.11.9.dev20251109/nv_ingest/api/main.py
+++ nv_ingest-2025.12.15.dev20251215/nv_ingest/api/main.py
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
 app = FastAPI(
     title="NV-Ingest Microservice",
     description="Service for ingesting heterogenous datatypes",
-    version="25.4.2",
+    version="26.1.0",
     contact={
         "name": "NVIDIA Corporation",
         "url": "https://nvidia.com",
--- nv_ingest-2025.11.9.dev20251109/nv_ingest/api/v2/ingest.py
+++ nv_ingest-2025.12.15.dev20251215/nv_ingest/api/v2/ingest.py
@@ -13,6 +13,8 @@ import os
 import time
 import uuid
 import random
+from pathlib import Path
+import fsspec
 
 from fastapi import APIRouter, Request, Response
 from fastapi import HTTPException
@@ -21,6 +23,8 @@ from redis import RedisError
 
 from nv_ingest.framework.schemas.framework_message_wrapper_schema import MessageWrapper
 from nv_ingest_api.util.service_clients.client_base import FetchMode
+from nv_ingest_api.util.dataloader.dataloader import DataLoader
+from nv_ingest_api.internal.schemas.meta.ingest_job_schema import DocumentTypeEnum
 
 # For PDF splitting
 import pypdfium2 as pdfium
@@ -188,28 +192,42 @@ def get_pdf_page_count(pdf_content: bytes) -> int:
         return 1  # Assume single page on error
 
 
-def _prepare_chunk_submission(
+def _create_subjob_dict(
+    job_id: str,
+    job_payload: Dict[str, Any],
     job_spec_template: Dict[str, Any],
-    chunk: Dict[str, Any],
-    *,
-    parent_uuid: uuid.UUID,
-    parent_job_id: str,
     current_trace_id: int,
-    original_source_id: str,
-    original_source_name: str,
-) -> Tuple[str, MessageWrapper]:
-    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
-
-    chunk_number = chunk["chunk_index"] + 1
-    start_page = chunk["start_page"]
-    end_page = chunk["end_page"]
-
-    subjob_spec = {
+    parent_job_id: str,
+    start_key: Dict[str, Any],
+) -> Dict[str, Any]:
+    job_spec = {
         key: value
         for key, value in job_spec_template.items()
         if key not in {"job_payload", "job_id", "tracing_options"}
     }
+    job_spec["job_payload"] = job_payload
+    job_spec["job_id"] = job_id
 
+    base_tracing_options = job_spec_template.get("tracing_options") or {}
+    tracing_options = dict(base_tracing_options)
+    tracing_options.setdefault("trace", True)
+    tracing_options["trace_id"] = str(current_trace_id)
+    tracing_options["ts_send"] = int(time.time() * 1000)
+    tracing_options["parent_job_id"] = parent_job_id
+    for key, value in start_key.items():
+        tracing_options[key] = value
+
+    job_spec["tracing_options"] = tracing_options
+    return job_spec
+
+
+def _create_payload_dict(
+    job_spec_template: Dict[str, Any],
+    content: str,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Dict[str, Any]:
     subjob_payload_template = job_spec_template.get("job_payload", {})
     subjob_payload = {
         key: value
@@ -217,27 +235,40 @@ def _prepare_chunk_submission(
         if key not in {"content", "source_id", "source_name"}
     }
 
-    chunk_bytes = chunk["bytes"]
-    subjob_payload["content"] = [base64.b64encode(chunk_bytes).decode("utf-8")]
+    subjob_payload["content"] = [content]
 
-    page_suffix = f"page_{start_page}" if start_page == end_page else f"pages_{start_page}-{end_page}"
-    subjob_payload["source_id"] = [f"{original_source_id}#{page_suffix}"]
-    subjob_payload["source_name"] = [f"{original_source_name}#{page_suffix}"]
+    subjob_payload["source_id"] = [source_id]
+    subjob_payload["source_name"] = [source_name]
+    subjob_payload["document_type"] = [document_type]
+    return subjob_payload
+
+
+def _prepare_chunk_submission(
+    job_spec_template: Dict[str, Any],
+    chunk: Dict[str, Any],
+    *,
+    parent_uuid: uuid.UUID,
+    parent_job_id: str,
+    current_trace_id: int,
+    source_id: str,
+    source_name: str,
+    document_type: str,
+) -> Tuple[str, MessageWrapper]:
+    """Create a subjob MessageWrapper for a PDF chunk and return its identifier."""
+
+    chunk_number = chunk["chunk_index"] + 1
 
     subjob_uuid = uuid.uuid5(parent_uuid, f"chunk-{chunk_number}")
     subjob_id = str(subjob_uuid)
-    subjob_spec["job_payload"] = subjob_payload
-    subjob_spec["job_id"] = subjob_id
 
-    base_tracing_options = job_spec_template.get("tracing_options") or {}
-    tracing_options = dict(base_tracing_options)
-    tracing_options.setdefault("trace", True)
-    tracing_options["trace_id"] = str(current_trace_id)
-    tracing_options["ts_send"] = int(time.time() * 1000)
-    tracing_options["parent_job_id"] = parent_job_id
-    tracing_options["page_num"] = start_page
+    subjob_payload_template = job_spec_template.get("job_payload", {})
+    chunk_bytes = base64.b64encode(chunk["bytes"]).decode("utf-8")
+    subjob_payload = _create_payload_dict(subjob_payload_template, chunk_bytes, source_id, source_name, document_type)
+    start = chunk["start_page"] if "start_page" in chunk else chunk["start"]
 
-    subjob_spec["tracing_options"] = tracing_options
+    subjob_spec = _create_subjob_dict(
+        subjob_id, subjob_payload, job_spec_template, current_trace_id, parent_job_id, {"page_num": start}
+    )
 
     return subjob_id, MessageWrapper(payload=json.dumps(subjob_spec))
 
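The refactor above splits the old monolithic _prepare_chunk_submission into _create_payload_dict (payload fields) and _create_subjob_dict (job spec plus tracing options, with the page or time offset passed as a generic start_key), so the PDF path and the new audio/video path below can share both helpers. Subjob IDs remain deterministic via uuid.uuid5, which keeps chunk identities stable for a given parent job. A minimal illustration of that property:

    import uuid

    parent_uuid = uuid.uuid4()
    # uuid5 is a pure function of (namespace, name): the same parent job and
    # chunk number always derive the same subjob ID.
    assert uuid.uuid5(parent_uuid, "chunk-1") == uuid.uuid5(parent_uuid, "chunk-1")
    assert uuid.uuid5(parent_uuid, "chunk-1") != uuid.uuid5(parent_uuid, "chunk-2")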
@@ -801,6 +832,8 @@ async def submit_job_v2(
     request: Request, response: Response, job_spec: MessageWrapper, ingest_service: INGEST_SERVICE_T
 ):
     span = trace.get_current_span()
+    source_id = None
+    document_type = None
     try:
         span.add_event("Submitting file for processing (V2)")
 
@@ -827,7 +860,19 @@
 
         # Track page count for all PDFs (used for both splitting logic and metadata)
         pdf_page_count_cache = None
-
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        subjob_ids: List[str] = []
+        subjob_descriptors: List[Dict[str, Any]] = []
+        parent_metadata: Dict[str, Any] = {}
+        submission_items: List[Tuple[str, MessageWrapper]] = []
+        try:
+            parent_uuid = uuid.UUID(parent_job_id)
+        except ValueError:
+            logger.warning(
+                "Parent job id %s is not a valid UUID; generating fallback namespace for subjobs",
+                parent_job_id,
+            )
+            parent_uuid = uuid.uuid4()
         # Check if this is a PDF that needs splitting
         if document_types and payloads and document_types[0].lower() == "pdf":
             # Decode the payload to check page count
@@ -836,6 +881,7 @@
             pdf_page_count_cache = page_count  # Cache for later use
             qos_tier = get_qos_tier_for_page_count(page_count)
             pages_per_chunk = get_pdf_split_page_count(client_override=client_split_page_count)
+            document_type = DocumentTypeEnum.PDF
 
             # Split if the document has more pages than our chunk size
             if page_count > pages_per_chunk:
@@ -846,13 +892,11 @@
                     page_count,
                     qos_tier,
                 )
-
                 chunks = split_pdf_to_chunks(pdf_content, pages_per_chunk)
 
                 subjob_ids: List[str] = []
                 subjob_descriptors: List[Dict[str, Any]] = []
                 submission_items: List[Tuple[str, MessageWrapper]] = []
-
                 try:
                     parent_uuid = uuid.UUID(parent_job_id)
                 except ValueError:
@@ -863,14 +907,20 @@
                     parent_uuid = uuid.uuid4()
 
                 for chunk in chunks:
+                    start = chunk["start_page"]
+                    end = chunk["end_page"]
+                    page_suffix = f"page_{start}" if start == end else f"pages_{start}-{end}"
+                    source_id = f"{original_source_id}#{page_suffix}"
+                    source_name = f"{original_source_name}#{page_suffix}"
                     subjob_id, subjob_wrapper = _prepare_chunk_submission(
                         job_spec_dict,
                         chunk,
+                        document_type=DocumentTypeEnum.PDF,
                         parent_uuid=parent_uuid,
                         parent_job_id=parent_job_id,
                         current_trace_id=current_trace_id,
-                        original_source_id=original_source_id,
-                        original_source_name=original_source_name,
+                        source_id=source_id,
+                        source_name=source_name,
                     )
 
                     # Inject QoS routing hint into subjob routing_options (keeps API and service loosely coupled)
@@ -895,38 +945,98 @@
                             "page_count": chunk.get("page_count"),
                         }
                     )
+                parent_metadata.update(
+                    {
+                        "total_pages": page_count,
+                        "pages_per_chunk": pages_per_chunk,
+                        "original_source_id": original_source_id,
+                        "original_source_name": original_source_name,
+                        "document_type": document_types[0] if document_types else "pdf",
+                        "subjob_order": subjob_ids,
+                    }
+                )
+        elif document_types and payloads and document_types[0].lower() in ["mp4", "mov", "avi", "mp3", "wav"]:
+            document_type = document_types[0]
+            upload_path = f"./{Path(original_source_id).name}"
+            # dump the payload to a file, just came from client
+            with fsspec.open(upload_path, "wb") as f:
+                f.write(base64.b64decode(payloads[0]))
+            dataloader = DataLoader(
+                path=upload_path, output_dir="./audio_chunks/", audio_only=True, split_interval=50000000
+            )
+            document_type = DocumentTypeEnum.MP3
+
+            parent_uuid = uuid.UUID(parent_job_id)
+            for task in job_spec_dict["tasks"]:
+                if "task_properties" in task and "document_type" in task["task_properties"]:
+                    task["task_properties"]["document_type"] = document_type
+            end = 0
+            for idx, (file_path, duration) in enumerate(dataloader.files_completed):
+                start = end
+                end = int(start + duration)
+                chunk = {
+                    "bytes": file_path.encode("utf-8"),
+                    "chunk_index": idx,
+                    "start": start,
+                    "end": end,
+                }
 
-            if submission_items:
-                burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
-                await _submit_subjobs_in_bursts(
-                    submission_items,
-                    ingest_service,
-                    burst_size=burst_size,
-                    pause_ms=pause_ms,
-                    jitter_ms=jitter_ms,
-                )
+                subjob_id, subjob_wrapper = _prepare_chunk_submission(
+                    job_spec_dict,
+                    chunk,
+                    parent_uuid=parent_uuid,
+                    parent_job_id=parent_job_id,
+                    current_trace_id=current_trace_id,
+                    source_id=file_path,
+                    source_name=upload_path,
+                    document_type=document_type,
+                )
 
-            parent_metadata: Dict[str, Any] = {
-                "total_pages": page_count,
-                "pages_per_chunk": pages_per_chunk,
+                submission_items.append((subjob_id, subjob_wrapper))
+                subjob_ids.append(subjob_id)
+                subjob_descriptors.append(
+                    {
+                        "job_id": subjob_id,
+                        "chunk_index": idx + 1,
+                        "start_page": chunk.get("start"),
+                        "end_page": chunk.get("end"),
+                        "page_count": chunk.get("page_count", 0),
+                    }
+                )
+            logger.debug(f"Removing uploaded file {upload_path}")
+            os.remove(upload_path)
+
+        if submission_items:
+            burst_size, pause_ms, jitter_ms = _get_submit_burst_params()
+            await _submit_subjobs_in_bursts(
+                submission_items,
+                ingest_service,
+                burst_size=burst_size,
+                pause_ms=pause_ms,
+                jitter_ms=jitter_ms,
+            )
+
+            parent_metadata.update(
+                {
                     "original_source_id": original_source_id,
                     "original_source_name": original_source_name,
-                "document_type": document_types[0] if document_types else "pdf",
+                    "document_type": document_type,
                     "subjob_order": subjob_ids,
                 }
+            )
+            # raise ValueError(f"Setting parent job mapping for {parent_job_id} with {len(subjob_ids)} subjobs")
+            await ingest_service.set_parent_job_mapping(
+                parent_job_id,
+                subjob_ids,
+                parent_metadata,
+                subjob_descriptors=subjob_descriptors,
+            )
 
-            await ingest_service.set_parent_job_mapping(
-                parent_job_id,
-                subjob_ids,
-                parent_metadata,
-                subjob_descriptors=subjob_descriptors,
-            )
-
-            await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
+            await ingest_service.set_job_state(parent_job_id, STATE_SUBMITTED)
 
-            span.add_event(f"Split into {len(subjob_ids)} subjobs")
-            response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
-            return parent_job_id
+            span.add_event(f"Split into {len(subjob_ids)} subjobs")
+            response.headers["x-trace-id"] = trace.format_trace_id(current_trace_id)
+            return parent_job_id
 
         # For non-PDFs or cases where splitting is not required, submit as normal
         if "tracing_options" not in job_spec_dict:
@@ -982,8 +1092,8 @@
         return parent_job_id
 
     except Exception as ex:
-        logger.exception(f"Error submitting job: {str(ex)}")
-        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}")
+        logger.exception(f"Error submitting job: {str(ex)}, {source_id}")
+        raise HTTPException(status_code=500, detail=f"Nv-Ingest Internal Server Error: {str(ex)}, for: \n{source_id}")
 
 
 # GET /v2/fetch_job
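
The new audio/video branch above reuses the chunk-submission machinery but replaces page ranges with cumulative time offsets: each chunk's start is the previous chunk's end, with DataLoader.files_completed yielding (file_path, duration) pairs. A small sketch of the offset arithmetic, with hypothetical durations:

    durations = [12.5, 30.0, 7.25]  # per-chunk durations; illustrative only
    end = 0
    for idx, duration in enumerate(durations):
        start = end
        end = int(start + duration)  # same truncation as the branch above
        print(f"chunk {idx}: start={start} end={end}")
    # chunk 0: start=0 end=12; chunk 1: start=12 end=42; chunk 2: start=42 end=49

Note that the int() truncation drops fractional seconds at every boundary, so offsets can drift slightly from the true timeline over many chunks.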
--- nv_ingest-2025.11.9.dev20251109/nv_ingest/framework/orchestration/process/execution.py
+++ nv_ingest-2025.12.15.dev20251215/nv_ingest/framework/orchestration/process/execution.py
@@ -162,6 +162,11 @@ def build_logging_config_from_env() -> LoggingConfig:
         if key not in os.environ:
             os.environ[key] = default_value
 
+    # For PRODUCTION mode, also suppress nv-ingest module INFO logs
+    if preset_level == "PRODUCTION":
+        logging.getLogger("nv_ingest").setLevel(logging.WARNING)
+        logging.getLogger("nv_ingest_api").setLevel(logging.WARNING)
+
     logger.info(f"Applied Ray logging preset: {preset_level}")
 
     # Get log level from environment, default to INFO
@@ -324,6 +329,7 @@ def launch_pipeline(
    pipeline_config = resolve_static_replicas(pipeline_config)
 
    # Pretty print the final pipeline configuration (after replica resolution)
+    # INFO level so it shows in docker/helm deployments; quiet mode suppresses in library mode
    pretty_output = pretty_print_pipeline_config(pipeline_config, config_path=None)
    logger.info("\n" + pretty_output)
 
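Setting the level on the "nv_ingest" and "nv_ingest_api" root loggers quiets every child logger underneath them. A small sketch of the resulting behavior (logger names are the real module roots; the messages are illustrative):

    import logging

    logging.getLogger("nv_ingest").setLevel(logging.WARNING)

    # Child loggers inherit the effective level from the "nv_ingest" root:
    logging.getLogger("nv_ingest.pipeline").info("suppressed in PRODUCTION")  # dropped
    logging.getLogger("nv_ingest.pipeline").warning("still visible")         # emitted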
--- nv_ingest-2025.11.9.dev20251109/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py
+++ nv_ingest-2025.12.15.dev20251215/nv_ingest/framework/orchestration/ray/examples/pipeline_test_harness.py
@@ -150,7 +150,7 @@ if __name__ == "__main__":
     os.environ["OCR_GRPC_ENDPOINT"] = "localhost:8010"
     os.environ["OCR_INFER_PROTOCOL"] = "grpc"
     os.environ["OCR_MODEL_NAME"] = "paddle"
-    os.environ["NEMORETRIEVER_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
+    os.environ["NEMOTRON_PARSE_HTTP_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
     os.environ["VLM_CAPTION_ENDPOINT"] = "https://integrate.api.nvidia.com/v1/chat/completions"
     os.environ["VLM_CAPTION_MODEL_NAME"] = "nvidia/nemotron-nano-12b-v2-vl"
     logger.info("Environment variables set.")
@@ -170,23 +170,23 @@ if __name__ == "__main__":
         yolox_graphic_elements_auth,
         yolox_graphic_elements_protocol,
     ) = get_nim_service("yolox_graphic_elements")
-    nemoretriever_parse_grpc, nemoretriever_parse_http, nemoretriever_parse_auth, nemoretriever_parse_protocol = (
-        get_nim_service("nemoretriever_parse")
+    nemotron_parse_grpc, nemotron_parse_http, nemotron_parse_auth, nemotron_parse_protocol = get_nim_service(
+        "nemotron_parse"
     )
     ocr_grpc, ocr_http, ocr_auth, ocr_protocol = get_nim_service("ocr")
 
-    model_name = os.environ.get("NEMORETRIEVER_PARSE_MODEL_NAME", "nvidia/nemoretriever-parse")
+    model_name = os.environ.get("NEMOTRON_PARSE_MODEL_NAME", "nvidia/nemotron-parse")
     pdf_extractor_config = {
         "pdfium_config": {
             "auth_token": yolox_auth,  # All auth tokens are the same for the moment
             "yolox_endpoints": (yolox_grpc, yolox_http),
             "yolox_infer_protocol": yolox_protocol,
         },
-        "nemoretriever_parse_config": {
-            "auth_token": nemoretriever_parse_auth,
-            "nemoretriever_parse_endpoints": (nemoretriever_parse_grpc, nemoretriever_parse_http),
-            "nemoretriever_parse_infer_protocol": nemoretriever_parse_protocol,
-            "nemoretriever_parse_model_name": model_name,
+        "nemotron_parse_config": {
+            "auth_token": nemotron_parse_auth,
+            "nemotron_parse_endpoints": (nemotron_parse_grpc, nemotron_parse_http),
+            "nemotron_parse_infer_protocol": nemotron_parse_protocol,
+            "nemotron_parse_model_name": model_name,
             "yolox_endpoints": (yolox_grpc, yolox_http),
             "yolox_infer_protocol": yolox_protocol,
         },
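
For deployments pinning the old names, the rename is mechanical: every NEMORETRIEVER_PARSE_* environment variable and nemoretriever_parse_* config key becomes NEMOTRON_PARSE_* / nemotron_parse_*, and the default model identifier moves from nvidia/nemoretriever-parse to nvidia/nemotron-parse. A hedged sketch of a launch-script shim; the shim itself is not part of this package:

    import os

    # Forward legacy env var values to the renamed variables.
    for old, new in [
        ("NEMORETRIEVER_PARSE_HTTP_ENDPOINT", "NEMOTRON_PARSE_HTTP_ENDPOINT"),
        ("NEMORETRIEVER_PARSE_MODEL_NAME", "NEMOTRON_PARSE_MODEL_NAME"),
    ]:
        if old in os.environ and new not in os.environ:
            os.environ[new] = os.environ[old]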
--- nv_ingest-2025.11.9.dev20251109/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py
+++ nv_ingest-2025.12.15.dev20251215/nv_ingest/framework/orchestration/ray/stages/extractors/audio_extractor.py
@@ -5,7 +5,6 @@
 
 import logging
 from typing import Optional
-
 import ray
 
 from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
@@ -67,7 +66,6 @@ class AudioExtractorStage(RayActorStage):
         # Extract the DataFrame payload.
         df_ledger = control_message.payload()
         self._logger.debug("Extracted payload with %d rows.", len(df_ledger))
-
         # Remove the "audio_data_extract" task from the message to obtain task-specific configuration.
         task_config = remove_task_by_type(control_message, "extract")
         self._logger.debug("Extracted task config: %s", sanitize_for_logging(task_config))
--- /dev/null
+++ nv_ingest-2025.12.15.dev20251215/nv_ingest/framework/orchestration/ray/stages/extractors/ocr_extractor.py
@@ -0,0 +1,71 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+import logging
+import ray
+
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+from nv_ingest.framework.util.flow_control import filter_by_task
+from nv_ingest_api.internal.extract.image.ocr_extractor import extract_text_data_from_image_internal
+from nv_ingest_api.internal.primitives.ingest_control_message import IngestControlMessage, remove_task_by_type
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable, set_trace_timestamps_with_parent_context
+from nv_ingest_api.internal.schemas.extract.extract_ocr_schema import OCRExtractorSchema
+from nv_ingest_api.util.exception_handlers.decorators import nv_ingest_node_failure_try_except
+from typing import Optional
+
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class OCRExtractorStage(RayActorStage):
+    """
+    A Ray actor stage that extracts text data from image content.
+
+    It expects an IngestControlMessage containing a DataFrame with image data. It then:
+      1. Removes the "text_data_extract" task from the message.
+      2. Calls the text extraction logic using a validated configuration.
+      3. Updates the message payload with the extracted text DataFrame.
+    """
+
+    def __init__(self, config: OCRExtractorSchema, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, log_to_stdout=False, stage_name=stage_name)
+        try:
+            self.validated_config = config
+            self._logger.info("OCRExtractorStage configuration validated successfully.")
+        except Exception as e:
+            self._logger.exception(f"Error validating Text extractor config: {e}")
+            raise
+
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
+    @filter_by_task(required_tasks=["ocr_data_extract"])
+    def on_data(self, control_message: IngestControlMessage) -> IngestControlMessage:
+        # Extract DataFrame payload
+        df_ledger = control_message.payload()
+        if df_ledger.empty:
+            return control_message
+
+        # Remove the "text_data_extract" task from the message
+        task_config = remove_task_by_type(control_message, "ocr_data_extract")
+
+        execution_trace_log = {}
+        new_df, extraction_info = extract_text_data_from_image_internal(
+            df_extraction_ledger=df_ledger,
+            task_config=task_config,
+            extraction_config=self.validated_config,
+            execution_trace_log=execution_trace_log,
+        )
+
+        control_message.payload(new_df)
+        control_message.set_metadata("ocr_extraction_info", extraction_info)
+
+        do_trace_tagging = control_message.get_metadata("config::add_trace_tagging") is True
+        if do_trace_tagging and execution_trace_log:
+            parent_name = self.stage_name if self.stage_name else "ocr_extractor"
+            set_trace_timestamps_with_parent_context(control_message, execution_trace_log, parent_name, logger)
+
+        return control_message
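
The decorator stack means this stage only fires for messages carrying an "ocr_data_extract" task; filter_by_task lets everything else pass through untouched. A hedged sketch of the task entry a job spec would need to route work here, following the task dict shape visible in the ingest.py hunk above (the "type" key and the property value are assumptions, not confirmed by this diff):

    ocr_task = {
        "type": "ocr_data_extract",  # matches required_tasks on the stage
        "task_properties": {
            "document_type": "png",  # hypothetical property
        },
    }
    job_spec["tasks"].append(ocr_task)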
--- /dev/null
+++ nv_ingest-2025.12.15.dev20251215/nv_ingest/framework/orchestration/ray/stages/meta/udf_parallel_helper.py
@@ -0,0 +1,64 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-25, NVIDIA CORPORATION & AFFILIATES.
+# All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Added this no-op UDF ray stage to the pipeline to help speed up the LLM api calls
+
+"""
+UDF Parallel Stage - A high-concurrency no-op stage for parallel UDF execution.
+
+This stage does nothing except pass messages through, but with high replica count
+it provides a parallel execution pool for UDFs to achieve N-way concurrency.
+"""
+
+import logging
+from typing import Any, Optional
+from pydantic import BaseModel
+import ray
+
+from nv_ingest.framework.orchestration.ray.stages.meta.ray_actor_stage_base import RayActorStage
+from nv_ingest.framework.util.flow_control.udf_intercept import udf_intercept_hook
+from nv_ingest_api.internal.primitives.tracing.tagging import traceable
+from nv_ingest_api.util.exception_handlers.decorators import (
+    nv_ingest_node_failure_try_except,
+)
+
+logger = logging.getLogger(__name__)
+
+
+@ray.remote
+class UDFParallelStage(RayActorStage):
+    """
+    A no-op pass-through stage designed for parallel UDF execution.
+
+    This stage simply returns the input message unchanged, but when configured
+    with multiple replicas, it provides a high-concurrency pool for UDFs to
+    achieve parallel execution without blocking.
+    """
+
+    def __init__(self, config: BaseModel, stage_name: Optional[str] = None) -> None:
+        super().__init__(config, stage_name=stage_name)
+        logger.info(f"UDFParallelStage initialized: {stage_name}")
+
+    @nv_ingest_node_failure_try_except()
+    @traceable()
+    @udf_intercept_hook()
+    def on_data(self, message: Any) -> Any:
+        """
+        Pass-through processing that simply returns the message unchanged.
+
+        The @udf_intercept_hook decorator allows UDFs to target this stage,
+        and multiple replicas provide parallel execution capacity.
+
+        Parameters
+        ----------
+        message : Any
+            The incoming control message.
+
+        Returns
+        -------
+        Any
+            The unmodified control message.
+        """
+        # No-op: just return the message
+        return message
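
Because the stage body is a pure pass-through, all useful work happens in whatever UDF the udf_intercept hook attaches here; scaling the stage's replica count scales UDF concurrency without touching any extractor. A hedged sketch of a UDF body that could run in this pool (the payload accessor follows the pattern in the other stages above; the column transformation is hypothetical):

    def my_udf(control_message):
        # Executes inside one of N UDFParallelStage replicas, so N such
        # calls (e.g., batched LLM requests) can proceed concurrently.
        df = control_message.payload()
        df["caption"] = df["caption"].str.strip()  # hypothetical column
        control_message.payload(df)
        return control_message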