nv-ingest-api 2025.10.8.dev20251008__tar.gz → 2025.10.10.dev20251010__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (181)
  1. {nv_ingest_api-2025.10.8.dev20251008/src/nv_ingest_api.egg-info → nv_ingest_api-2025.10.10.dev20251010}/PKG-INFO +1 -1
  2. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +7 -3
  3. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +7 -3
  4. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/table_extractor.py +7 -3
  5. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +6 -4
  6. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +9 -2
  7. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +44 -11
  8. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +5 -1
  9. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +9 -2
  10. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
  11. nv_ingest_api-2025.10.10.dev20251010/src/udfs/llm_summarizer_udf.py +204 -0
  12. nv_ingest_api-2025.10.8.dev20251008/src/udfs/llm_summarizer_udf.py +0 -210
  13. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/LICENSE +0 -0
  14. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/MANIFEST.in +0 -0
  15. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/README.md +0 -0
  16. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/pyproject.toml +0 -0
  17. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/setup.cfg +0 -0
  18. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/__init__.py +0 -0
  19. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/__init__.py +0 -0
  20. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/extract.py +0 -0
  21. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/mutate.py +0 -0
  22. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/store.py +0 -0
  23. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/transform.py +0 -0
  24. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/interface/utility.py +0 -0
  25. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/__init__.py +0 -0
  26. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  27. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/enums/common.py +0 -0
  28. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  29. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  30. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
  31. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  32. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
  33. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  34. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  35. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  36. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
  37. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
  38. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
  39. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
  40. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
  41. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
  42. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
  43. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
  44. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  45. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  46. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  47. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
  48. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  49. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  50. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  51. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  52. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  53. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  54. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
  55. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
  56. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/meta/__init__.py +0 -0
  57. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/meta/udf.py +0 -0
  58. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
  59. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  60. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  61. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  62. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  63. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
  64. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  65. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  66. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
  67. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  68. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  69. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  70. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  71. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  72. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
  73. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
  74. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  75. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
  76. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  77. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  78. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  79. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  80. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
  81. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
  82. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
  83. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
  84. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
  85. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
  86. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
  87. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
  88. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
  89. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +0 -0
  90. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
  91. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
  92. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  93. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  94. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  95. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  96. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
  97. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  98. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
  99. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/meta/udf.py +0 -0
  100. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
  101. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  102. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
  103. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  104. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  105. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
  106. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
  107. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  108. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
  109. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  110. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/store/__init__.py +0 -0
  111. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  112. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  113. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
  114. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
  115. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/transform/embed_text.py +0 -0
  116. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
  117. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/__init__.py +0 -0
  118. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  119. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  120. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  121. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  122. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/containers.py +0 -0
  123. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  124. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  125. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/formats.py +0 -0
  126. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  127. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/dataloader/__init__.py +0 -0
  128. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/dataloader/dataloader.py +0 -0
  129. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  130. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/detectors/language.py +0 -0
  131. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  132. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
  133. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
  134. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  135. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  136. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  137. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  138. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  139. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
  140. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  141. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  142. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/imports/__init__.py +0 -0
  143. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/imports/callable_signatures.py +0 -0
  144. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +0 -0
  145. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/introspection/__init__.py +0 -0
  146. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/introspection/class_inspect.py +0 -0
  147. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/introspection/function_inspect.py +0 -0
  148. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  149. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/logging/configuration.py +0 -0
  150. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/logging/sanitize.py +0 -0
  151. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
  152. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  153. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  154. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  155. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  156. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  157. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  158. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  159. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  160. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  161. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  162. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
  163. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/schema/__init__.py +0 -0
  164. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  165. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
  166. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  167. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  168. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  169. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
  170. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  171. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  172. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/string_processing/configuration.py +0 -0
  173. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/string_processing/yaml.py +0 -0
  174. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/system/__init__.py +0 -0
  175. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
  176. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api.egg-info/SOURCES.txt +0 -0
  177. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  178. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api.egg-info/requires.txt +0 -0
  179. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  180. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/udfs/__init__.py +0 -0
  181. {nv_ingest_api-2025.10.8.dev20251008 → nv_ingest_api-2025.10.10.dev20251010}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.10.8.dev20251008
3
+ Version: 2025.10.10.dev20251010
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -97,7 +97,7 @@ def _run_chart_inference(
97
97
  model_name="paddle",
98
98
  max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
99
99
  )
100
- elif ocr_model_name == "scene_text_ensemble":
100
+ elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}:
101
101
  future_ocr_kwargs.update(
102
102
  model_name=ocr_model_name,
103
103
  input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
@@ -237,7 +237,9 @@ def _create_ocr_client(
237
237
  auth_token: str,
238
238
  ) -> NimClient:
239
239
  ocr_model_interface = (
240
- NemoRetrieverOCRModelInterface() if ocr_model_name == "scene_text_ensemble" else PaddleOCRModelInterface()
240
+ NemoRetrieverOCRModelInterface()
241
+ if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}
242
+ else PaddleOCRModelInterface()
241
243
  )
242
244
 
243
245
  ocr_client = create_inference_client(
@@ -245,7 +247,9 @@ def _create_ocr_client(
245
247
  model_interface=ocr_model_interface,
246
248
  auth_token=auth_token,
247
249
  infer_protocol=ocr_protocol,
248
- enable_dynamic_batching=(True if ocr_model_name == "scene_text_ensemble" else False),
250
+ enable_dynamic_batching=(
251
+ True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False
252
+ ),
249
253
  dynamic_batch_memory_budget_mb=32,
250
254
  )
251
255
 
@@ -107,7 +107,7 @@ def _update_infographic_metadata(
107
107
  model_name="paddle",
108
108
  max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
109
109
  )
110
- elif ocr_model_name == "scene_text_ensemble":
110
+ elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}:
111
111
  infer_kwargs.update(
112
112
  model_name=ocr_model_name,
113
113
  input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
@@ -152,7 +152,9 @@ def _create_ocr_client(
152
152
  auth_token: str,
153
153
  ) -> NimClient:
154
154
  ocr_model_interface = (
155
- NemoRetrieverOCRModelInterface() if ocr_model_name == "scene_text_ensemble" else PaddleOCRModelInterface()
155
+ NemoRetrieverOCRModelInterface()
156
+ if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}
157
+ else PaddleOCRModelInterface()
156
158
  )
157
159
 
158
160
  ocr_client = create_inference_client(
@@ -160,7 +162,9 @@ def _create_ocr_client(
160
162
  model_interface=ocr_model_interface,
161
163
  auth_token=auth_token,
162
164
  infer_protocol=ocr_protocol,
163
- enable_dynamic_batching=(True if ocr_model_name == "scene_text_ensemble" else False),
165
+ enable_dynamic_batching=(
166
+ True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False
167
+ ),
164
168
  dynamic_batch_memory_budget_mb=32,
165
169
  )
166
170
 
@@ -99,7 +99,7 @@ def _run_inference(
99
99
  model_name="paddle",
100
100
  max_batch_size=1 if ocr_client.protocol == "grpc" else 2,
101
101
  )
102
- elif ocr_model_name == "scene_text_ensemble":
102
+ elif ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}:
103
103
  future_ocr_kwargs.update(
104
104
  model_name=ocr_model_name,
105
105
  input_names=["INPUT_IMAGE_URLS", "MERGE_LEVELS"],
@@ -246,7 +246,9 @@ def _create_ocr_client(
246
246
  auth_token: str,
247
247
  ) -> NimClient:
248
248
  ocr_model_interface = (
249
- NemoRetrieverOCRModelInterface() if ocr_model_name == "scene_text_ensemble" else PaddleOCRModelInterface()
249
+ NemoRetrieverOCRModelInterface()
250
+ if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"}
251
+ else PaddleOCRModelInterface()
250
252
  )
251
253
 
252
254
  ocr_client = create_inference_client(
@@ -254,7 +256,9 @@ def _create_ocr_client(
254
256
  model_interface=ocr_model_interface,
255
257
  auth_token=auth_token,
256
258
  infer_protocol=ocr_protocol,
257
- enable_dynamic_batching=(True if ocr_model_name == "scene_text_ensemble" else False),
259
+ enable_dynamic_batching=(
260
+ True if ocr_model_name in {"scene_text_ensemble", "scene_text_wrapper", "scene_text_python"} else False
261
+ ),
258
262
  dynamic_batch_memory_budget_mb=32,
259
263
  )
260
264
 
@@ -332,6 +332,7 @@ def _extract_page_elements(
332
332
 
333
333
  # Process each extracted element based on extraction flags
334
334
  for page_idx, page_element in page_element_results:
335
+ page_reading_index = page_idx + 1
335
336
  # Skip elements that shouldn't be extracted based on flags
336
337
  if (not extract_tables) and (page_element.type_string == "table"):
337
338
  continue
@@ -347,7 +348,7 @@ def _extract_page_elements(
347
348
  # Construct metadata for the page element
348
349
  page_element_meta = construct_page_element_metadata(
349
350
  page_element,
350
- page_idx,
351
+ page_reading_index,
351
352
  page_count,
352
353
  source_metadata,
353
354
  base_unified_metadata,
@@ -473,6 +474,7 @@ def pdfium_extractor(
473
474
  for page_idx in range(page_count):
474
475
  page = doc.get_page(page_idx)
475
476
  page_width, page_height = page.get_size()
477
+ page_reading_index = page_idx + 1
476
478
 
477
479
  # Text extraction
478
480
  if extract_text:
@@ -481,7 +483,7 @@ def pdfium_extractor(
481
483
  text_meta = construct_text_metadata(
482
484
  [page_text],
483
485
  pdf_metadata.keywords,
484
- page_idx,
486
+ page_reading_index,
485
487
  -1,
486
488
  -1,
487
489
  -1,
@@ -499,7 +501,7 @@ def pdfium_extractor(
499
501
  image_data = _extract_page_images(
500
502
  extract_images_method,
501
503
  page,
502
- page_idx,
504
+ page_reading_index,
503
505
  page_width,
504
506
  page_height,
505
507
  page_count,
@@ -518,7 +520,7 @@ def pdfium_extractor(
518
520
  base64_image, _ = scale_image_to_encoding_size(base64_image, max_base64_size=2**24 - 1)
519
521
  image_meta = construct_image_metadata_from_base64(
520
522
  base64_image,
521
- page_idx,
523
+ page_reading_index,
522
524
  page_count,
523
525
  source_metadata,
524
526
  base_unified_metadata,
@@ -21,7 +21,10 @@ from nv_ingest_api.internal.primitives.nim.model_interface.helpers import prepro
21
21
  from nv_ingest_api.util.image_processing.transforms import base64_to_numpy
22
22
 
23
23
  DEFAULT_OCR_MODEL_NAME = "paddle"
24
- NEMORETRIEVER_OCR_MODEL_NAME = "scene_text_ensemble"
24
+ NEMORETRIEVER_OCR_MODEL_NAME = "scene_text_wrapper"
25
+ NEMORETRIEVER_OCR_ENSEMBLE_MODEL_NAME = "scene_text_ensemble"
26
+ NEMORETRIEVER_OCR_BLS_MODEL_NAME = "scene_text_python"
27
+
25
28
 
26
29
  logger = logging.getLogger(__name__)
27
30
 
@@ -231,7 +234,11 @@ class OCRModelInterfaceBase(ModelInterface):
231
234
  if not isinstance(response, np.ndarray):
232
235
  raise ValueError("Unexpected response format: response is not a NumPy array.")
233
236
 
234
- if model_name == NEMORETRIEVER_OCR_MODEL_NAME:
237
+ if model_name in [
238
+ NEMORETRIEVER_OCR_MODEL_NAME,
239
+ NEMORETRIEVER_OCR_ENSEMBLE_MODEL_NAME,
240
+ NEMORETRIEVER_OCR_BLS_MODEL_NAME,
241
+ ]:
235
242
  response = response.transpose((1, 0))
236
243
 
237
244
  # If we have shape (3,), convert to (3, 1)
@@ -121,9 +121,6 @@ class NimClient:
121
121
  if model_name == "yolox_ensemble":
122
122
  model_name = "yolox"
123
123
 
124
- if model_name == "scene_text_ensemble":
125
- model_name = "scene_text_pre"
126
-
127
124
  if model_name in self._max_batch_sizes:
128
125
  return self._max_batch_sizes[model_name]
129
126
 
@@ -326,16 +323,52 @@ class NimClient:
326
323
 
327
324
  outputs = [grpcclient.InferRequestedOutput(output_name) for output_name in output_names]
328
325
 
329
- response = self.client.infer(
330
- model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
331
- )
326
+ base_delay = 0.5
327
+ attempt = 0
328
+ retries_429 = 0
329
+ max_grpc_retries = self.max_429_retries
330
+
331
+ while attempt < self.max_retries:
332
+ try:
333
+ response = self.client.infer(
334
+ model_name=model_name, parameters=parameters, inputs=input_tensors, outputs=outputs
335
+ )
332
336
 
333
- logger.debug(f"gRPC inference response: {response}")
337
+ logger.debug(f"gRPC inference response: {response}")
334
338
 
335
- if len(outputs) == 1:
336
- return response.as_numpy(outputs[0].name())
337
- else:
338
- return [response.as_numpy(output.name()) for output in outputs]
339
+ if len(outputs) == 1:
340
+ return response.as_numpy(outputs[0].name())
341
+ else:
342
+ return [response.as_numpy(output.name()) for output in outputs]
343
+
344
+ except grpcclient.InferenceServerException as e:
345
+ status = e.status()
346
+ if status == "StatusCode.UNAVAILABLE" and "Exceeds maximum queue size".lower() in e.message().lower():
347
+ retries_429 += 1
348
+ logger.warning(
349
+ f"Received gRPC {status} for model '{model_name}'. "
350
+ f"Attempt {retries_429} of {max_grpc_retries}."
351
+ )
352
+ if retries_429 >= max_grpc_retries:
353
+ logger.error(f"Max retries for gRPC {status} exceeded for model '{model_name}'.")
354
+ raise
355
+
356
+ backoff_time = base_delay * (2**retries_429)
357
+ time.sleep(backoff_time)
358
+ continue
359
+
360
+ else:
361
+ # For other server-side errors (e.g., INVALID_ARGUMENT, NOT_FOUND),
362
+ # retrying will not help. We should fail fast.
363
+ logger.error(
364
+ f"Received non-retryable gRPC error from Triton for model '{model_name}': {e.message()}"
365
+ )
366
+ raise
367
+
368
+ except Exception as e:
369
+ # Catch any other unexpected exceptions (e.g., network issues not caught by Triton client)
370
+ logger.error(f"An unexpected error occurred during gRPC inference for model '{model_name}': {e}")
371
+ raise
339
372
 
340
373
  def _http_infer(self, formatted_input: dict) -> dict:
341
374
  """
@@ -24,8 +24,12 @@ logger = logging.getLogger(__name__)
24
24
  # Tracing Options Schema
25
25
  class TracingOptionsSchema(BaseModelNoExt):
26
26
  trace: bool = False
27
- ts_send: int
27
+ ts_send: Optional[int] = None
28
28
  trace_id: Optional[str] = None
29
+ # V2 PDF splitting support
30
+ parent_job_id: Optional[str] = None
31
+ page_num: Optional[int] = None
32
+ total_pages: Optional[int] = None
29
33
 
30
34
 
31
35
  # Ingest Task Schemas
@@ -3,6 +3,7 @@
3
3
  # SPDX-License-Identifier: Apache-2.0
4
4
 
5
5
  import logging
6
+ import os
6
7
  import re
7
8
  import time
8
9
  from typing import Any, Union, Tuple, Optional, Dict, Callable
@@ -137,13 +138,19 @@ class RestClient(MessageBrokerClientBase):
137
138
  )
138
139
  self._client = requests.Session()
139
140
 
140
- self._submit_endpoint: str = "/v1/submit_job"
141
- self._fetch_endpoint: str = "/v1/fetch_job"
141
+ # Allow API version override via environment variable or kwargs
142
+ api_version = kwargs.get("api_version") or os.getenv("NV_INGEST_API_VERSION", "v1")
143
+ self._api_version = api_version
144
+ self._submit_endpoint: str = f"/{api_version}/submit_job"
145
+ self._fetch_endpoint: str = f"/{api_version}/fetch_job"
142
146
  self._base_url: str = kwargs.get("base_url") or self._generate_url(self._host, self._port)
143
147
  self._headers = kwargs.get("headers", {})
144
148
  self._auth = kwargs.get("auth", None)
145
149
 
146
150
  logger.debug(f"RestClient base URL set to: {self._base_url}")
151
+ logger.info(
152
+ f"RestClient using API version: {api_version} (endpoints: {self._submit_endpoint}, {self._fetch_endpoint})"
153
+ )
147
154
 
148
155
  @staticmethod
149
156
  def _generate_url(host: str, port: int) -> str:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.10.8.dev20251008
3
+ Version: 2025.10.10.dev20251010
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -0,0 +1,204 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ LLM Content Summarizer UDF for NV-Ingest Pipeline
4
+
5
+ This UDF uses an LLM to generate concise summaries of text content chunks. These summaries are added to the metadata
6
+ for enhanced downstream processing and search capabilities.
7
+
8
+ These variables can be set in the environment before running the pipeline. These can be treated as kwargs.
9
+ - NVIDIA_API_KEY: API key for NVIDIA NIM endpoints (required)
10
+ - LLM_SUMMARIZATION_MODEL: Model to use (default: nvidia/llama-3.1-nemotron-70b-instruct)
11
+ - LLM_SUMMARIZATION_BASE_URL: base URL (default: https://integrate.api.nvidia.com/v1)
12
+ - LLM_SUMMARIZATION_TIMEOUT: API timeout in seconds (default: 60)
13
+ - LLM_MIN_CONTENT_LENGTH: Minimum content length to summarize (default: 50)
14
+ - LLM_MAX_CONTENT_LENGTH: Maximum content length to send to API (default: 12000)
15
+ TODO: Implement this
16
+ - NUM_CHUNKS: (Optional) Number of first and last pages to summarize. default=1
17
+ """
18
+
19
+ import logging
20
+ import os
21
+ import time
22
+
23
+ # REMOVE BEFORE MERGING
24
+ # import yaml
25
+ # from pathlib import Path
26
+
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+ PROMPT = """
31
+ Here are the contents from the first and last page of a document. Focus on the main purpose, key topics,
32
+ and important details. Just return the summary as a paragraph. Do not add special characters for formatting.
33
+ This summary will be used for document search and understanding.
34
+
35
+ [CONTENT]
36
+ {content}
37
+ [END CONTENT]
38
+ """
39
+
40
+
41
def content_summarizer(control_message: "IngestControlMessage") -> "IngestControlMessage":  # noqa: F821
    """
    UDF that attaches an LLM-generated document summary to the payload.

    The text of the first and last rows ("chunks") of the DataFrame payload is
    combined and sent to an LLM. The resulting summary is stored under
    ``metadata["custom_content"]["llm_summarizer_udf"]`` of the first row.
    Every row of the original payload is kept; only the first row's metadata
    is modified.

    Configuration is read from the environment: ``NVIDIA_API_KEY`` (required),
    ``LLM_SUMMARIZATION_MODEL``, ``LLM_SUMMARIZATION_BASE_URL``,
    ``LLM_MIN_CONTENT_LENGTH``, ``LLM_MAX_CONTENT_LENGTH`` and
    ``LLM_SUMMARIZATION_TIMEOUT``.

    Parameters
    ----------
    control_message : IngestControlMessage
        The control message containing the DataFrame payload with text content

    Returns
    -------
    IngestControlMessage
        The same control message. It is returned unchanged when the API key is
        missing, the payload is empty, no usable content was found, or the LLM
        call failed.
    """
    logger.info("UDF: Starting LLM content summarization")

    api_key = os.getenv("NVIDIA_API_KEY")
    model_name = os.getenv("LLM_SUMMARIZATION_MODEL", "nvidia/llama-3.1-nemotron-70b-instruct")
    base_url = os.getenv("LLM_SUMMARIZATION_BASE_URL", "https://integrate.api.nvidia.com/v1")
    min_content_length = int(os.getenv("LLM_MIN_CONTENT_LENGTH", 50))
    max_content_length = int(os.getenv("LLM_MAX_CONTENT_LENGTH", 12000))
    timeout = int(os.getenv("LLM_SUMMARIZATION_TIMEOUT", 60))

    stats = {
        "skipped": False,
        "failed": False,
        "tokens": 0,
        "duration": 0.0,
    }

    if not api_key:
        logger.error("NVIDIA_API_KEY not set. Skipping...")
        return control_message

    df = control_message.payload()

    if df is None or df.empty:
        logger.warning("No payload found. Nothing to summarize.")
        return control_message

    # Select first and last chunk for summarization.
    # Rows are chunks of extracted data, not necessarily pages; selecting pages
    # would require parsing the payload to map chunks to pages.
    # The selection is kept in a separate variable so that `df` (the full
    # payload) is what gets written back to the control message.
    if len(df) > 1:
        # TODO: add feature to select N first and last chunks
        selected = df.iloc[[0, -1]]
    else:
        logger.info("Document has only one chunk")
        selected = df

    # Combine the selected content into a single string
    content_list = selected.apply(
        _extract_content,
        axis=1,
        min_content_length=min_content_length,
        max_content_length=max_content_length,
        stats=stats,
    )
    content = " ".join(content_list)

    # Every selected chunk was skipped: do not spend an API call on an empty prompt.
    if not content.strip():
        logger.warning("No content to summarize. Skipping...")
        return control_message

    # Nicely ask LLM to summarize content
    summary, stats["duration"] = _generate_llm_summary(content, model_name, base_url, api_key, timeout)

    stats["failed"] = summary is None
    if stats["failed"]:
        logger.warning("%s failed to summarize content", model_name)
        return control_message

    stats["tokens"] = _estimate_tokens(content)
    logger.info("Summarized %d tokens in %f seconds using %s", stats["tokens"], stats["duration"], model_name)

    # Row 0 of the selection is row 0 of the full payload, so the summary is
    # stored directly on the full DataFrame.
    _store_summary(df, summary, model_name)

    # Update the control message with the full (not truncated) DataFrame
    control_message.payload(df)

    return control_message
119
+
120
+
121
def _extract_content(row, stats: dict, min_content_length: int = 50, max_content_length: int = 12000) -> str:
    """
    Extract the text content of a single payload row.

    Parameters
    ----------
    row
        Mapping-like row (anything with ``.get``) whose ``"metadata"`` entry is
        expected to be a dict holding the text under ``"content"``.
    stats : dict
        Mutable statistics dict; ``stats["skipped"]`` is set to True whenever
        the row contributes no content.
    min_content_length : int
        Stripped content shorter than this is skipped.
    max_content_length : int
        Stripped content longer than this is truncated to this many characters.

    Returns
    -------
    str
        The stripped (and possibly truncated) content, or ``""`` when the row
        is skipped. A string is always returned so callers can join results.
    """
    metadata = row.get("metadata")
    if not isinstance(metadata, dict):
        stats["skipped"] = True
        logger.warning("No metadata found. Skipping...")
        return ""

    content = metadata.get("content")
    # Missing or non-string content has no text to summarize; skipping here
    # also avoids calling .strip() on a value that does not support it.
    if not isinstance(content, str):
        stats["skipped"] = True
        return ""

    content = content.strip()
    if len(content) < min_content_length:
        stats["skipped"] = True
        logger.warning(f"Content less than min={min_content_length}. Skipping...")
        return ""

    if len(content) > max_content_length:
        logger.warning(f"Truncating content to {max_content_length} characters")
        return content[:max_content_length]

    return content
146
+
147
+
148
def _generate_llm_summary(
    content: str,
    model_name: str,
    base_url: str,
    api_key: str,
    timeout: int,
) -> tuple[str | None, float]:
    """
    Ask an LLM to summarize content extracted from a document.

    Parameters
    ----------
    content : str
        Text inserted into the module-level ``PROMPT`` template.
    model_name : str
        Model identifier passed to the chat completions endpoint.
    base_url : str
        Base URL of the OpenAI-compatible API.
    api_key : str
        API key used to authenticate.
    timeout : int
        Client timeout passed to ``OpenAI(...)``.

    Returns
    -------
    tuple[str | None, float]
        ``(summary, duration_seconds)``. ``summary`` is None when the call
        fails, no choice is returned, or the returned text is empty. The
        duration is reported in every case, including failures.
    """
    # Started before the try block so the except branch can always report a duration.
    start_time = time.perf_counter()
    try:
        # Imported lazily so the module can be loaded without the openai package installed.
        from openai import OpenAI

        client = OpenAI(base_url=base_url, api_key=api_key, timeout=timeout)
        # Restart the clock so a successful duration covers only the API call.
        start_time = time.perf_counter()
        completion = client.chat.completions.create(
            model=model_name,
            messages=[{"role": "user", "content": PROMPT.format(content=content)}],
            max_tokens=400,  # Increased for more comprehensive summaries
            temperature=0.7,
        )
        duration = time.perf_counter() - start_time

        if not completion.choices:
            logger.warning("LLM returned no choices")
            return None, duration

        # The message content may be None; treat that and blank text as
        # "no summary" instead of failing on .strip() or storing an empty summary.
        text = completion.choices[0].message.content
        summary = text.strip() if text else ""
        if not summary:
            logger.warning("LLM returned an empty summary")
            return None, duration
        return summary, duration

    except Exception as e:
        # Best-effort boundary: any failure (missing package, network, API
        # error) is logged and reported as "no summary" rather than raised.
        logger.error(f"API call failed: {e}")
        return None, time.perf_counter() - start_time
182
+
183
+
184
def _store_summary(df, summary: str, model_name: str):
    """
    Store the summary in the metadata of the first row of ``df``.

    The summary is written to
    ``metadata["custom_content"]["llm_summarizer_udf"]`` as
    ``{"summary": summary, "model": model_name}``. Existing ``custom_content``
    entries are preserved.

    Parameters
    ----------
    df
        DataFrame payload; row 0 is expected to carry a ``"metadata"`` dict.
    summary : str
        Summary text to store.
    model_name : str
        Name of the model that produced the summary.
    """
    # Hardcoded heuristic: everything is stored on chunk 0's metadata.
    # The value is a reference to the dict held by the DataFrame, so mutating
    # it updates the payload in place.
    metadata = df.iloc[0].get("metadata")

    # Rows without a metadata dict are tolerated when extracting content, so
    # tolerate them here too instead of raising AttributeError.
    if not isinstance(metadata, dict):
        logger.warning("No metadata found on first chunk. Summary not stored.")
        return

    custom_content = metadata.get("custom_content")
    if custom_content is None:
        custom_content = {}
        metadata["custom_content"] = custom_content
    custom_content["llm_summarizer_udf"] = {"summary": summary, "model": model_name}
196
+
197
+
198
def _estimate_tokens(text: str, chars_per_token: int = 4) -> int:
    """
    Return a rough token count for ``text``.

    The estimate is ``len(text) // chars_per_token``; the default of 4
    characters per token keeps the original heuristic.

    Raises
    ------
    ValueError
        If ``chars_per_token`` is not positive.
    """
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be a positive integer")
    return len(text) // chars_per_token
201
+
202
+
203
def _safe_model_name(name: str) -> str:
    """Return ``name`` with every "/" replaced by "__" and every "-" replaced by "_"."""
    # Neither replacement produces a "/" or "-", so one translation pass is
    # equivalent to applying the two substitutions in sequence.
    substitutions = str.maketrans({"/": "__", "-": "_"})
    return name.translate(substitutions)