nv-ingest-api 2025.10.26.dev20251026__tar.gz → 2025.10.28.dev20251028__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (181)
  1. {nv_ingest_api-2025.10.26.dev20251026/src/nv_ingest_api.egg-info → nv_ingest_api-2025.10.28.dev20251028}/PKG-INFO +2 -1
  2. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/pyproject.toml +1 -0
  3. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +4 -0
  4. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +4 -2
  5. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +10 -1
  6. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +4 -2
  7. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +4 -2
  8. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +10 -1
  9. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +6 -4
  10. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +4 -2
  11. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +9 -1
  12. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +2 -0
  13. nv_ingest_api-2025.10.28.dev20251028/src/nv_ingest_api/internal/schemas/mixins.py +39 -0
  14. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +4 -0
  15. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/transform/embed_text.py +82 -0
  16. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/dataloader/dataloader.py +20 -9
  17. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028/src/nv_ingest_api.egg-info}/PKG-INFO +2 -1
  18. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api.egg-info/SOURCES.txt +1 -0
  19. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api.egg-info/requires.txt +1 -0
  20. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/LICENSE +0 -0
  21. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/MANIFEST.in +0 -0
  22. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/README.md +0 -0
  23. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/setup.cfg +0 -0
  24. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/__init__.py +0 -0
  25. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/interface/__init__.py +0 -0
  26. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/interface/extract.py +0 -0
  27. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/interface/mutate.py +0 -0
  28. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/interface/store.py +0 -0
  29. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/interface/transform.py +0 -0
  30. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/interface/utility.py +0 -0
  31. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/__init__.py +0 -0
  32. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  33. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/enums/common.py +0 -0
  34. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  35. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  36. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
  37. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  38. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +0 -0
  39. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  40. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  41. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  42. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
  43. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/html/__init__.py +0 -0
  44. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/html/html_extractor.py +0 -0
  45. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
  46. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +0 -0
  47. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/image/image_extractor.py +0 -0
  48. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
  49. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +0 -0
  50. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +0 -0
  51. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/image/table_extractor.py +0 -0
  52. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
  53. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  54. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  55. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  56. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +0 -0
  57. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  58. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +0 -0
  59. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  60. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  61. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  62. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  63. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  64. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +0 -0
  65. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +0 -0
  66. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/meta/__init__.py +0 -0
  67. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/meta/udf.py +0 -0
  68. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
  69. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  70. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  71. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  72. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  73. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
  74. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  75. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  76. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
  77. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  78. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  79. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  80. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  81. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  82. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/ocr.py +0 -0
  83. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -0
  84. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  85. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +0 -0
  86. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +0 -0
  87. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  88. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  89. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  90. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  91. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +0 -0
  92. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
  93. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
  94. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/extract/extract_html_schema.py +0 -0
  95. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  96. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  97. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  98. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  99. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
  100. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  101. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
  102. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/meta/udf.py +0 -0
  103. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
  104. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  105. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
  106. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  107. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  108. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
  109. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +0 -0
  110. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  111. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  112. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/store/__init__.py +0 -0
  113. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  114. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  115. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
  116. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/transform/caption_image.py +0 -0
  117. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
  118. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/__init__.py +0 -0
  119. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  120. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  121. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  122. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  123. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/converters/containers.py +0 -0
  124. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  125. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  126. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/converters/formats.py +0 -0
  127. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  128. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/dataloader/__init__.py +0 -0
  129. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  130. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/detectors/language.py +0 -0
  131. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  132. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/exception_handlers/converters.py +0 -0
  133. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -0
  134. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  135. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  136. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  137. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  138. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  139. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/image_processing/processing.py +0 -0
  140. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  141. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  142. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/imports/__init__.py +0 -0
  143. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/imports/callable_signatures.py +0 -0
  144. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/imports/dynamic_resolvers.py +0 -0
  145. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/introspection/__init__.py +0 -0
  146. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/introspection/class_inspect.py +0 -0
  147. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/introspection/function_inspect.py +0 -0
  148. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  149. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/logging/configuration.py +0 -0
  150. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/logging/sanitize.py +0 -0
  151. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
  152. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  153. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  154. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  155. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  156. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  157. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  158. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  159. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  160. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  161. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  162. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/pdf/pdfium.py +0 -0
  163. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/schema/__init__.py +0 -0
  164. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  165. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
  166. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  167. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  168. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  169. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +0 -0
  170. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  171. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +0 -0
  172. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  173. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/string_processing/configuration.py +0 -0
  174. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/string_processing/yaml.py +0 -0
  175. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/system/__init__.py +0 -0
  176. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api/util/system/hardware_info.py +0 -0
  177. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  178. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  179. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/udfs/__init__.py +0 -0
  180. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/udfs/llm_summarizer_udf.py +0 -0
  181. {nv_ingest_api-2025.10.26.dev20251026 → nv_ingest_api-2025.10.28.dev20251028}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.10.26.dev20251026
3
+ Version: 2025.10.28.dev20251028
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -222,6 +222,7 @@ Requires-Dist: fsspec>=2025.5.1
222
222
  Requires-Dist: universal_pathlib>=0.2.6
223
223
  Requires-Dist: ffmpeg-python==0.2.0
224
224
  Requires-Dist: tritonclient
225
+ Requires-Dist: glom
225
226
  Dynamic: license-file
226
227
 
227
228
  # nv-ingest-api
@@ -29,6 +29,7 @@ dependencies = [
29
29
  "universal_pathlib>=0.2.6",
30
30
  "ffmpeg-python==0.2.0",
31
31
  "tritonclient",
32
+ "glom",
32
33
  ]
33
34
 
34
35
  [project.urls]
@@ -355,6 +355,10 @@ def create_audio_inference_client(
355
355
  if (infer_protocol is None) and (grpc_endpoint and grpc_endpoint.strip()):
356
356
  infer_protocol = "grpc"
357
357
 
358
+ # Normalize protocol to lowercase for case-insensitive comparison
359
+ if infer_protocol:
360
+ infer_protocol = infer_protocol.lower()
361
+
358
362
  if infer_protocol == "http":
359
363
  raise ValueError("`http` endpoints are not supported for audio. Use `grpc`.")
360
364
 
@@ -10,10 +10,12 @@ from typing import Tuple
10
10
  from pydantic import BaseModel, Field
11
11
  from pydantic import root_validator
12
12
 
13
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
14
+
13
15
  logger = logging.getLogger(__name__)
14
16
 
15
17
 
16
- class AudioConfigSchema(BaseModel):
18
+ class AudioConfigSchema(LowercaseProtocolMixin):
17
19
  """
18
20
  Configuration schema for audio extraction endpoints and options.
19
21
 
@@ -87,13 +89,13 @@ class AudioConfigSchema(BaseModel):
87
89
 
88
90
  values[endpoint_name] = (grpc_service, http_service)
89
91
 
92
+ # Auto-infer protocol from endpoints if not specified
90
93
  protocol_name = "audio_infer_protocol"
91
94
  protocol_value = values.get(protocol_name)
92
95
 
93
96
  if not protocol_value:
94
97
  protocol_value = "http" if http_service else "grpc" if grpc_service else ""
95
98
 
96
- protocol_value = protocol_value.lower()
97
99
  values[protocol_name] = protocol_value
98
100
 
99
101
  return values
@@ -8,10 +8,12 @@ from typing import Tuple
8
8
 
9
9
  from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
10
10
 
11
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
12
+
11
13
  logger = logging.getLogger(__name__)
12
14
 
13
15
 
14
- class ChartExtractorConfigSchema(BaseModel):
16
+ class ChartExtractorConfigSchema(LowercaseProtocolMixin):
15
17
  """
16
18
  Configuration schema for chart extraction service endpoints and options.
17
19
 
@@ -96,6 +98,13 @@ class ChartExtractorConfigSchema(BaseModel):
96
98
 
97
99
  values[endpoint_name] = (grpc_service, http_service)
98
100
 
101
+ # Auto-infer protocol from endpoints if not specified
102
+ protocol_name = endpoint_name.replace("_endpoints", "_infer_protocol")
103
+ protocol_value = values.get(protocol_name)
104
+ if not protocol_value:
105
+ protocol_value = "http" if http_service else "grpc" if grpc_service else ""
106
+ values[protocol_name] = protocol_value
107
+
99
108
  return values
100
109
 
101
110
  model_config = ConfigDict(extra="forbid")
@@ -9,10 +9,12 @@ from typing import Tuple
9
9
 
10
10
  from pydantic import model_validator, ConfigDict, BaseModel, Field
11
11
 
12
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
 
15
- class DocxConfigSchema(BaseModel):
17
+ class DocxConfigSchema(LowercaseProtocolMixin):
16
18
  """
17
19
  Configuration schema for docx extraction endpoints and options.
18
20
 
@@ -85,11 +87,11 @@ class DocxConfigSchema(BaseModel):
85
87
 
86
88
  values[endpoint_name] = (grpc_service, http_service)
87
89
 
90
+ # Auto-infer protocol from endpoints if not specified
88
91
  protocol_name = f"{model_name}_infer_protocol"
89
92
  protocol_value = values.get(protocol_name)
90
93
  if not protocol_value:
91
94
  protocol_value = "http" if http_service else "grpc" if grpc_service else ""
92
- protocol_value = protocol_value.lower()
93
95
  values[protocol_name] = protocol_value
94
96
 
95
97
  return values
@@ -9,10 +9,12 @@ from typing import Tuple
9
9
 
10
10
  from pydantic import model_validator, ConfigDict, BaseModel, Field
11
11
 
12
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
 
15
- class ImageConfigSchema(BaseModel):
17
+ class ImageConfigSchema(LowercaseProtocolMixin):
16
18
  """
17
19
  Configuration schema for image extraction endpoints and options.
18
20
 
@@ -85,11 +87,11 @@ class ImageConfigSchema(BaseModel):
85
87
 
86
88
  values[endpoint_name] = (grpc_service, http_service)
87
89
 
90
+ # Auto-infer protocol from endpoints if not specified
88
91
  protocol_name = f"{model_name}_infer_protocol"
89
92
  protocol_value = values.get(protocol_name)
90
93
  if not protocol_value:
91
94
  protocol_value = "http" if http_service else "grpc" if grpc_service else ""
92
- protocol_value = protocol_value.lower()
93
95
  values[protocol_name] = protocol_value
94
96
 
95
97
  return values
@@ -8,10 +8,12 @@ from typing import Tuple
8
8
 
9
9
  from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
10
10
 
11
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
12
+
11
13
  logger = logging.getLogger(__name__)
12
14
 
13
15
 
14
- class InfographicExtractorConfigSchema(BaseModel):
16
+ class InfographicExtractorConfigSchema(LowercaseProtocolMixin):
15
17
  """
16
18
  Configuration schema for infographic extraction service endpoints and options.
17
19
 
@@ -89,6 +91,13 @@ class InfographicExtractorConfigSchema(BaseModel):
89
91
 
90
92
  values[endpoint_name] = (grpc_service, http_service)
91
93
 
94
+ # Auto-infer protocol from endpoints if not specified
95
+ protocol_name = endpoint_name.replace("_endpoints", "_infer_protocol")
96
+ protocol_value = values.get(protocol_name)
97
+ if not protocol_value:
98
+ protocol_value = "http" if http_service else "grpc" if grpc_service else ""
99
+ values[protocol_name] = protocol_value
100
+
92
101
  return values
93
102
 
94
103
  model_config = ConfigDict(extra="forbid")
@@ -9,10 +9,12 @@ from typing import Tuple
9
9
 
10
10
  from pydantic import model_validator, ConfigDict, BaseModel, Field
11
11
 
12
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
 
15
- class PDFiumConfigSchema(BaseModel):
17
+ class PDFiumConfigSchema(LowercaseProtocolMixin):
16
18
  """
17
19
  Configuration schema for PDFium endpoints and options.
18
20
 
@@ -82,11 +84,11 @@ class PDFiumConfigSchema(BaseModel):
82
84
 
83
85
  values[endpoint_name] = (grpc_service, http_service)
84
86
 
87
+ # Auto-infer protocol from endpoints if not specified
85
88
  protocol_name = f"{model_name}_infer_protocol"
86
89
  protocol_value = values.get(protocol_name)
87
90
  if not protocol_value:
88
91
  protocol_value = "http" if http_service else "grpc" if grpc_service else ""
89
- protocol_value = protocol_value.lower()
90
92
  values[protocol_name] = protocol_value
91
93
 
92
94
  return values
@@ -94,7 +96,7 @@ class PDFiumConfigSchema(BaseModel):
94
96
  model_config = ConfigDict(extra="forbid")
95
97
 
96
98
 
97
- class NemoRetrieverParseConfigSchema(BaseModel):
99
+ class NemoRetrieverParseConfigSchema(LowercaseProtocolMixin):
98
100
  """
99
101
  Configuration schema for NemoRetrieverParse endpoints and options.
100
102
 
@@ -170,11 +172,11 @@ class NemoRetrieverParseConfigSchema(BaseModel):
170
172
 
171
173
  values[endpoint_name] = (grpc_service, http_service)
172
174
 
175
+ # Auto-infer protocol from endpoints if not specified
173
176
  protocol_name = f"{model_name}_infer_protocol"
174
177
  protocol_value = values.get(protocol_name)
175
178
  if not protocol_value:
176
179
  protocol_value = "http" if http_service else "grpc" if grpc_service else ""
177
- protocol_value = protocol_value.lower()
178
180
  values[protocol_name] = protocol_value
179
181
 
180
182
  return values
@@ -9,10 +9,12 @@ from typing import Tuple
9
9
 
10
10
  from pydantic import model_validator, ConfigDict, BaseModel, Field
11
11
 
12
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
13
+
12
14
  logger = logging.getLogger(__name__)
13
15
 
14
16
 
15
- class PPTXConfigSchema(BaseModel):
17
+ class PPTXConfigSchema(LowercaseProtocolMixin):
16
18
  """
17
19
  Configuration schema for pptx extraction endpoints and options.
18
20
 
@@ -85,11 +87,11 @@ class PPTXConfigSchema(BaseModel):
85
87
 
86
88
  values[endpoint_name] = (grpc_service, http_service)
87
89
 
90
+ # Auto-infer protocol from endpoints if not specified
88
91
  protocol_name = f"{model_name}_infer_protocol"
89
92
  protocol_value = values.get(protocol_name)
90
93
  if not protocol_value:
91
94
  protocol_value = "http" if http_service else "grpc" if grpc_service else ""
92
- protocol_value = protocol_value.lower()
93
95
  values[protocol_name] = protocol_value
94
96
 
95
97
  return values
@@ -9,11 +9,12 @@ from typing import Tuple
9
9
 
10
10
  from pydantic import field_validator, model_validator, ConfigDict, BaseModel, Field
11
11
 
12
+ from nv_ingest_api.internal.schemas.mixins import LowercaseProtocolMixin
12
13
 
13
14
  logger = logging.getLogger(__name__)
14
15
 
15
16
 
16
- class TableExtractorConfigSchema(BaseModel):
17
+ class TableExtractorConfigSchema(LowercaseProtocolMixin):
17
18
  """
18
19
  Configuration schema for the table extraction stage settings.
19
20
 
@@ -91,6 +92,13 @@ class TableExtractorConfigSchema(BaseModel):
91
92
 
92
93
  values[endpoint_name] = (grpc_service, http_service)
93
94
 
95
+ # Auto-infer protocol from endpoints if not specified
96
+ protocol_name = endpoint_name.replace("_endpoints", "_infer_protocol")
97
+ protocol_value = values.get(protocol_name)
98
+ if not protocol_value:
99
+ protocol_value = "http" if http_service else "grpc" if grpc_service else ""
100
+ values[protocol_name] = protocol_value
101
+
94
102
  return values
95
103
 
96
104
  model_config = ConfigDict(extra="forbid")
@@ -126,6 +126,8 @@ class IngestTaskEmbedSchema(BaseModelNoExt):
126
126
  image_elements_modality: Optional[str] = None
127
127
  structured_elements_modality: Optional[str] = None
128
128
  audio_elements_modality: Optional[str] = None
129
+ custom_content_field: Optional[str] = None
130
+ result_target_field: Optional[str] = None
129
131
 
130
132
 
131
133
  class IngestTaskVdbUploadSchema(BaseModelNoExt):
@@ -0,0 +1,39 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: Apache-2.0
4
+
5
+ """
6
+ Shared mixins for Pydantic schemas.
7
+ """
8
+
9
+ from typing import Any
10
+ from pydantic import BaseModel, field_validator
11
+
12
+
13
class LowercaseProtocolMixin(BaseModel):
    """
    Base model that normalizes every field whose name ends in '_infer_protocol'.

    A wildcard "before" validator converts each non-None protocol value to a
    string, strips surrounding whitespace, and lowercases it, so user input
    such as "HTTP" or " gRPC " is stored as "http" / "grpc". Fields with any
    other name, and protocol fields set to None, are passed through untouched.

    Examples
    --------
    >>> class MyConfigSchema(LowercaseProtocolMixin):
    ...     yolox_infer_protocol: str = ""
    ...     ocr_infer_protocol: str = ""
    >>>
    >>> config = MyConfigSchema(yolox_infer_protocol="GRPC", ocr_infer_protocol="HTTP")
    >>> config.yolox_infer_protocol
    'grpc'
    >>> config.ocr_infer_protocol
    'http'
    """

    @field_validator("*", mode="before")
    @classmethod
    def _lowercase_protocol_fields(cls, v: Any, info):
        """Normalize '*_infer_protocol' values; return all other values as-is."""
        is_protocol_field = info.field_name.endswith("_infer_protocol")
        if v is None or not is_protocol_field:
            return v
        # str() first so non-string inputs are normalized the same way as strings.
        return str(v).strip().lower()
@@ -7,6 +7,8 @@ import logging
7
7
 
8
8
  from pydantic import ConfigDict, BaseModel, Field, model_validator, field_validator
9
9
 
10
+ from typing import Optional
11
+
10
12
  from nv_ingest_api.util.logging.configuration import LogLevel
11
13
 
12
14
  logger = logging.getLogger(__name__)
@@ -26,6 +28,8 @@ class TextEmbeddingSchema(BaseModel):
26
28
  image_elements_modality: str = Field(default="text")
27
29
  structured_elements_modality: str = Field(default="text")
28
30
  audio_elements_modality: str = Field(default="text")
31
+ custom_content_field: Optional[str] = None
32
+ result_target_field: Optional[str] = None
29
33
 
30
34
  model_config = ConfigDict(extra="forbid")
31
35
 
@@ -7,6 +7,7 @@ from concurrent.futures import ThreadPoolExecutor
7
7
  from functools import partial
8
8
  from typing import Any, Dict, Tuple, Optional, Iterable, List
9
9
 
10
+ import glom
10
11
  import pandas as pd
11
12
  from openai import OpenAI
12
13
 
@@ -282,6 +283,33 @@ def _add_embeddings(row, embeddings, info_msgs):
282
283
  return row
283
284
 
284
285
 
286
def _add_custom_embeddings(row, embeddings, result_target_field):
    """
    Store a custom-content embedding inside a DataFrame row's metadata.

    The embedding is looked up by the row's index label (``row.name``). When
    one exists it is written to ``metadata["custom_content"][<result_target_field>]``;
    ``result_target_field`` is appended to the glom path, so a dotted value
    addresses a nested location. Missing intermediate containers are created
    as dicts. Rows without an embedding are returned unchanged.

    Parameters
    ----------
    row : pandas.Series
        A row of the DataFrame; must contain a "metadata" entry.
    embeddings : dict
        Dictionary mapping row indices to embeddings.
    result_target_field : str
        The field in custom_content to output the embedding to.

    Returns
    -------
    pandas.Series
        The updated row.
    """
    vector = embeddings.get(row.name)
    if vector is None:
        return row

    target_path = "custom_content." + result_target_field
    row["metadata"] = glom.assign(row["metadata"], target_path, vector, missing=dict)
    return row
311
+
312
+
285
313
  def _format_image_input_string(image_b64: Optional[str]) -> str:
286
314
  if not image_b64:
287
315
  return
@@ -381,6 +409,20 @@ def _get_pandas_audio_content(row, modality="text"):
381
409
  return row.get("audio_metadata", {}).get("audio_transcript")
382
410
 
383
411
 
412
def _get_pandas_custom_content(row, custom_content_field):
    """
    Extract a user-selected custom content value from a mapping as a string.

    Parameters
    ----------
    row : dict
        Mapping that may hold a "custom_content" entry (the caller visible in
        this module applies this function to the "metadata" column).
    custom_content_field : str
        glom path, relative to "custom_content", of the value to extract.

    Returns
    -------
    str or None
        ``str()`` of the located value, or None (with a warning logged) when
        the path resolves to nothing or the value cannot be converted.
    """
    value = glom.glom(row.get("custom_content", {}), custom_content_field, default=None)

    if value is not None:
        try:
            return str(value)
        except (TypeError, ValueError):
            logger.warning(f"Cannot convert custom content field: {custom_content_field} to string")
            return None

    logger.warning(f"Custom content field: {custom_content_field} not found")
    return None
424
+
425
+
384
426
  # ------------------------------------------------------------------------------
385
427
  # Batch Processing Utilities
386
428
  # ------------------------------------------------------------------------------
@@ -519,6 +561,7 @@ def transform_create_text_embeddings_internal(
519
561
  api_key = task_config.get("api_key") or transform_config.api_key
520
562
  endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
521
563
  model_name = task_config.get("model_name") or transform_config.embedding_model
564
+ custom_content_field = task_config.get("custom_content_field") or transform_config.custom_content_field
522
565
 
523
566
  if execution_trace_log is None:
524
567
  execution_trace_log = {}
@@ -612,4 +655,43 @@ def transform_create_text_embeddings_internal(
612
655
  content_masks.append(content_mask)
613
656
 
614
657
  combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
658
+
659
+ # Embed custom content
660
+ if custom_content_field is not None:
661
+ result_target_field = task_config.get("result_target_field") or custom_content_field + "_embedding"
662
+
663
+ extracted_custom_content = (
664
+ combined_df["metadata"]
665
+ .apply(partial(_get_pandas_custom_content, custom_content_field=custom_content_field))
666
+ .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
667
+ )
668
+
669
+ valid_custom_content_mask = extracted_custom_content.notna()
670
+ if valid_custom_content_mask.any():
671
+ custom_content_list = extracted_custom_content[valid_custom_content_mask].to_list()
672
+ custom_content_batches = _generate_batches(custom_content_list, batch_size=transform_config.batch_size)
673
+
674
+ custom_content_embeddings = _async_runner(
675
+ custom_content_batches,
676
+ api_key,
677
+ endpoint_url,
678
+ model_name,
679
+ transform_config.encoding_format,
680
+ transform_config.input_type,
681
+ transform_config.truncate,
682
+ False,
683
+ )
684
+ custom_embeddings_dict = dict(
685
+ zip(
686
+ extracted_custom_content.loc[valid_custom_content_mask].index,
687
+ custom_content_embeddings.get("embeddings", []),
688
+ )
689
+ )
690
+ else:
691
+ custom_embeddings_dict = {}
692
+
693
+ combined_df = combined_df.apply(
694
+ _add_custom_embeddings, embeddings=custom_embeddings_dict, result_target_field=result_target_field, axis=1
695
+ )
696
+
615
697
  return combined_df, {"trace_info": execution_trace_log}
@@ -254,22 +254,29 @@ else:
254
254
  file = None
255
255
  try:
256
256
  for file in paths:
257
+ if thread_stop.is_set():
258
+ return
257
259
  if isinstance(file, tuple):
258
260
  video_file, audio_file = file
261
+ if thread_stop.is_set():
262
+ return
259
263
  with open(video_file, "rb") as f:
260
264
  video = f.read()
265
+ if thread_stop.is_set():
266
+ return
261
267
  with open(audio_file, "rb") as f:
262
268
  audio = f.read()
263
269
  queue.put((video, audio))
264
270
  else:
265
- if thread_stop:
271
+ if thread_stop.is_set():
266
272
  return
267
273
  with open(file, "rb") as f:
268
274
  queue.put(f.read())
269
275
  except Exception as e:
270
276
  logging.error(f"Error processing file {file}: {e}")
271
277
  queue.put(RuntimeError(f"Error processing file {file}: {e}"))
272
- queue.put(StopIteration)
278
+ finally:
279
+ queue.put(StopIteration)
273
280
 
274
281
  class DataLoader:
275
282
  """
@@ -290,7 +297,7 @@ else:
290
297
  ):
291
298
  interface = interface if interface else MediaInterface()
292
299
  self.thread = None
293
- self.thread_stop = False
300
+ self.thread_stop = threading.Event()
294
301
  self.queue = queue.Queue(size)
295
302
  self.path = Path(path)
296
303
  self.output_dir = output_dir
@@ -323,16 +330,20 @@ else:
323
330
  Reset iterator by stopping the thread and clearing the queue.
324
331
  """
325
332
  if self.thread:
326
- self.thread_stop = True
333
+ self.thread_stop.set()
327
334
  self.thread.join()
328
- self.thread_stop = False
329
- while self.queue.qsize() != 0:
330
- with self.queue.mutex:
331
- self.queue.queue.clear()
335
+ self.thread = None
336
+ try:
337
+ while True:
338
+ self.queue.get_nowait()
339
+ except Exception:
340
+ pass
341
+ finally:
342
+ self.thread_stop.clear()
332
343
 
333
344
  def __iter__(self):
334
345
  self.stop()
335
- self.thread_stop = False
346
+ self.thread_stop.clear()
336
347
  self.thread = threading.Thread(
337
348
  target=load_data,
338
349
  args=(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.10.26.dev20251026
3
+ Version: 2025.10.28.dev20251028
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -222,6 +222,7 @@ Requires-Dist: fsspec>=2025.5.1
222
222
  Requires-Dist: universal_pathlib>=0.2.6
223
223
  Requires-Dist: ffmpeg-python==0.2.0
224
224
  Requires-Dist: tritonclient
225
+ Requires-Dist: glom
225
226
  Dynamic: license-file
226
227
 
227
228
  # nv-ingest-api
@@ -78,6 +78,7 @@ src/nv_ingest_api/internal/primitives/tracing/latency.py
78
78
  src/nv_ingest_api/internal/primitives/tracing/logging.py
79
79
  src/nv_ingest_api/internal/primitives/tracing/tagging.py
80
80
  src/nv_ingest_api/internal/schemas/__init__.py
81
+ src/nv_ingest_api/internal/schemas/mixins.py
81
82
  src/nv_ingest_api/internal/schemas/extract/__init__.py
82
83
  src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py
83
84
  src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py
@@ -7,3 +7,4 @@ fsspec>=2025.5.1
7
7
  universal_pathlib>=0.2.6
8
8
  ffmpeg-python==0.2.0
9
9
  tritonclient
10
+ glom