nv-ingest-api 2025.5.12.dev20250512__tar.gz → 2025.5.14.dev20250514__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of nv-ingest-api might be problematic. Click here for more details.

Files changed (163) hide show
  1. {nv_ingest_api-2025.5.12.dev20250512/src/nv_ingest_api.egg-info → nv_ingest_api-2025.5.14.dev20250514}/PKG-INFO +1 -1
  2. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/interface/transform.py +1 -1
  3. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/docx/docx_extractor.py +3 -3
  4. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/chart_extractor.py +3 -3
  5. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/image_extractor.py +5 -5
  6. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/image_helpers/common.py +1 -1
  7. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/infographic_extractor.py +1 -1
  8. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/table_extractor.py +2 -2
  9. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/nemoretriever.py +2 -2
  10. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/pdfium.py +1 -1
  11. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pptx/engines/pptx_helper.py +44 -17
  12. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pptx/pptx_extractor.py +1 -1
  13. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/text_embedding.py +0 -1
  14. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/yolox.py +7 -1
  15. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/nim_client.py +1 -1
  16. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/tracing/tagging.py +20 -16
  17. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_pdf_schema.py +1 -1
  18. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/meta/ingest_job_schema.py +2 -2
  19. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/transform/transform_image_caption_schema.py +1 -1
  20. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/transform/caption_image.py +1 -1
  21. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/transform/embed_text.py +75 -56
  22. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/exception_handlers/converters.py +1 -1
  23. nv_ingest_api-2025.5.14.dev20250514/src/nv_ingest_api/util/exception_handlers/decorators.py +481 -0
  24. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/image_processing/processing.py +1 -1
  25. nv_ingest_api-2025.5.14.dev20250514/src/nv_ingest_api/util/logging/configuration.py +38 -0
  26. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/pdf/pdfium.py +2 -2
  27. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/service_clients/redis/redis_client.py +1 -1
  28. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/service_clients/rest/rest_client.py +1 -1
  29. nv_ingest_api-2025.5.14.dev20250514/src/nv_ingest_api/util/system/__init__.py +0 -0
  30. nv_ingest_api-2025.5.14.dev20250514/src/nv_ingest_api/util/system/hardware_info.py +426 -0
  31. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514/src/nv_ingest_api.egg-info}/PKG-INFO +1 -1
  32. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api.egg-info/SOURCES.txt +3 -1
  33. nv_ingest_api-2025.5.12.dev20250512/src/nv_ingest_api/util/exception_handlers/decorators.py +0 -223
  34. nv_ingest_api-2025.5.12.dev20250512/src/nv_ingest_api/util/logging/configuration.py +0 -31
  35. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/LICENSE +0 -0
  36. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/MANIFEST.in +0 -0
  37. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/README.md +0 -0
  38. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/pyproject.toml +0 -0
  39. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/setup.cfg +0 -0
  40. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/__init__.py +0 -0
  41. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/interface/__init__.py +0 -0
  42. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/interface/extract.py +0 -0
  43. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/interface/mutate.py +0 -0
  44. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/interface/store.py +0 -0
  45. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/interface/utility.py +0 -0
  46. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/__init__.py +0 -0
  47. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/enums/__init__.py +0 -0
  48. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/enums/common.py +0 -0
  49. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/__init__.py +0 -0
  50. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/audio/__init__.py +0 -0
  51. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/audio/audio_extraction.py +0 -0
  52. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/docx/__init__.py +0 -0
  53. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/docx/engines/__init__.py +0 -0
  54. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/__init__.py +0 -0
  55. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docx_helper.py +0 -0
  56. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/docx/engines/docxreader_helpers/docxreader.py +0 -0
  57. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/__init__.py +0 -0
  58. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/image/image_helpers/__init__.py +0 -0
  59. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/__init__.py +0 -0
  60. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/__init__.py +0 -0
  61. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/adobe.py +0 -0
  62. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/llama.py +0 -0
  63. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/pdf_helpers/__init__.py +0 -0
  64. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/tika.py +0 -0
  65. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/engines/unstructured_io.py +0 -0
  66. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pdf/pdf_extractor.py +0 -0
  67. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pptx/__init__.py +0 -0
  68. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/extract/pptx/engines/__init__.py +0 -0
  69. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/mutate/__init__.py +0 -0
  70. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/mutate/deduplicate.py +0 -0
  71. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/mutate/filter.py +0 -0
  72. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/__init__.py +0 -0
  73. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/control_message_task.py +0 -0
  74. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/ingest_control_message.py +0 -0
  75. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/__init__.py +0 -0
  76. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/default_values.py +0 -0
  77. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/__init__.py +0 -0
  78. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/cached.py +0 -0
  79. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/decorators.py +0 -0
  80. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/deplot.py +0 -0
  81. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/helpers.py +0 -0
  82. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/nemoretriever_parse.py +0 -0
  83. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/paddle.py +0 -0
  84. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/parakeet.py +0 -0
  85. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/model_interface/vlm.py +0 -0
  86. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/nim/nim_model_interface.py +0 -0
  87. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/tracing/__init__.py +0 -0
  88. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/tracing/latency.py +0 -0
  89. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/primitives/tracing/logging.py +0 -0
  90. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/__init__.py +0 -0
  91. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/__init__.py +0 -0
  92. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_audio_schema.py +0 -0
  93. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_chart_schema.py +0 -0
  94. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_docx_schema.py +0 -0
  95. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_image_schema.py +0 -0
  96. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_infographic_schema.py +0 -0
  97. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_pptx_schema.py +0 -0
  98. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/extract/extract_table_schema.py +0 -0
  99. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/message_brokers/__init__.py +0 -0
  100. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/message_brokers/message_broker_client_schema.py +0 -0
  101. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/message_brokers/request_schema.py +0 -0
  102. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/message_brokers/response_schema.py +0 -0
  103. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/meta/__init__.py +0 -0
  104. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/meta/base_model_noext.py +0 -0
  105. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/meta/metadata_schema.py +0 -0
  106. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/mutate/__init__.py +0 -0
  107. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/mutate/mutate_image_dedup_schema.py +0 -0
  108. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/store/__init__.py +0 -0
  109. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/store/store_embedding_schema.py +0 -0
  110. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/store/store_image_schema.py +0 -0
  111. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/transform/__init__.py +0 -0
  112. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/transform/transform_image_filter_schema.py +0 -0
  113. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/transform/transform_text_embedding_schema.py +0 -0
  114. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/schemas/transform/transform_text_splitter_schema.py +0 -0
  115. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/store/__init__.py +0 -0
  116. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/store/embed_text_upload.py +0 -0
  117. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/store/image_upload.py +0 -0
  118. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/transform/__init__.py +0 -0
  119. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/internal/transform/split_text.py +0 -0
  120. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/__init__.py +0 -0
  121. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/control_message/__init__.py +0 -0
  122. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/control_message/validators.py +0 -0
  123. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/converters/__init__.py +0 -0
  124. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/converters/bytetools.py +0 -0
  125. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/converters/containers.py +0 -0
  126. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/converters/datetools.py +0 -0
  127. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/converters/dftools.py +0 -0
  128. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/converters/formats.py +0 -0
  129. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/converters/type_mappings.py +0 -0
  130. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/detectors/__init__.py +0 -0
  131. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/detectors/language.py +0 -0
  132. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/exception_handlers/__init__.py +0 -0
  133. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/exception_handlers/detectors.py +0 -0
  134. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/exception_handlers/pdf.py +0 -0
  135. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/exception_handlers/schemas.py +0 -0
  136. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/image_processing/__init__.py +0 -0
  137. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/image_processing/clustering.py +0 -0
  138. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/image_processing/table_and_chart.py +0 -0
  139. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/image_processing/transforms.py +0 -0
  140. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/logging/__init__.py +0 -0
  141. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/message_brokers/__init__.py +0 -0
  142. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/message_brokers/simple_message_broker/__init__.py +0 -0
  143. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/message_brokers/simple_message_broker/broker.py +0 -0
  144. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/message_brokers/simple_message_broker/ordered_message_queue.py +0 -0
  145. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/message_brokers/simple_message_broker/simple_client.py +0 -0
  146. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/metadata/__init__.py +0 -0
  147. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/metadata/aggregators.py +0 -0
  148. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/multi_processing/__init__.py +0 -0
  149. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/multi_processing/mp_pool_singleton.py +0 -0
  150. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/nim/__init__.py +0 -0
  151. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/pdf/__init__.py +0 -0
  152. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/schema/__init__.py +0 -0
  153. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/schema/schema_validator.py +0 -0
  154. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/service_clients/__init__.py +0 -0
  155. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/service_clients/client_base.py +0 -0
  156. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/service_clients/kafka/__init__.py +0 -0
  157. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/service_clients/redis/__init__.py +0 -0
  158. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/service_clients/rest/__init__.py +0 -0
  159. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api/util/string_processing/__init__.py +0 -0
  160. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api.egg-info/dependency_links.txt +0 -0
  161. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api.egg-info/requires.txt +0 -0
  162. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/nv_ingest_api.egg-info/top_level.txt +0 -0
  163. {nv_ingest_api-2025.5.12.dev20250512 → nv_ingest_api-2025.5.14.dev20250514}/src/version.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: nv-ingest-api
3
- Version: 2025.5.12.dev20250512
3
+ Version: 2025.5.14.dev20250514
4
4
  Summary: Python module with core document ingestion functions.
5
5
  Author-email: Jeremy Dyer <jdyer@nvidia.com>
6
6
  License: Apache License
@@ -207,7 +207,7 @@ def transform_image_create_vlm_caption(
207
207
  "api_key": api_key,
208
208
  "prompt": prompt,
209
209
  "endpoint_url": endpoint_url,
210
- "model_name": model_name,
210
+ "image_caption_model_name": model_name,
211
211
  }
212
212
  filtered_task_config: Dict[str, str] = {k: v for k, v in task_config.items() if v is not None}
213
213
 
@@ -7,7 +7,7 @@ import base64
7
7
  import functools
8
8
  import io
9
9
  import logging
10
- from typing import Optional, Dict, Any, Union
10
+ from typing import Optional, Dict, Any, Union, Tuple
11
11
 
12
12
  import pandas as pd
13
13
  from pydantic import BaseModel
@@ -146,7 +146,7 @@ def extract_primitives_from_docx_internal(
146
146
  task_config: Union[Dict[str, Any], BaseModel],
147
147
  extraction_config: DocxExtractorSchema,
148
148
  execution_trace_log: Optional[Dict[str, Any]] = None,
149
- ) -> pd.DataFrame:
149
+ ) -> Tuple[pd.DataFrame, Union[Dict, None]]:
150
150
  """
151
151
  Processes a pandas DataFrame containing DOCX files encoded in base64, extracting text from
152
152
  each document and replacing the original content with the extracted text.
@@ -202,4 +202,4 @@ def extract_primitives_from_docx_internal(
202
202
  else:
203
203
  extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
204
204
 
205
- return extracted_df
205
+ return extracted_df, {}
@@ -27,7 +27,7 @@ from nv_ingest_api.util.nim import create_inference_client
27
27
  PADDLE_MIN_WIDTH = 32
28
28
  PADDLE_MIN_HEIGHT = 32
29
29
 
30
- logger = logging.getLogger(f"morpheus.{__name__}")
30
+ logger = logging.getLogger(f"ray.{__name__}")
31
31
 
32
32
 
33
33
  def _filter_valid_chart_images(
@@ -80,7 +80,7 @@ def _run_chart_inference(
80
80
  yolox_client.infer,
81
81
  data=data_yolox,
82
82
  model_name="yolox",
83
- stage_name="chart_data_extraction",
83
+ stage_name="chart_extraction",
84
84
  max_batch_size=8,
85
85
  trace_info=trace_info,
86
86
  )
@@ -88,7 +88,7 @@ def _run_chart_inference(
88
88
  paddle_client.infer,
89
89
  data=data_paddle,
90
90
  model_name="paddle",
91
- stage_name="chart_data_extraction",
91
+ stage_name="chart_extraction",
92
92
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
93
93
  trace_info=trace_info,
94
94
  )
@@ -16,7 +16,7 @@ import pandas as pd
16
16
  from pydantic import BaseModel
17
17
 
18
18
  from nv_ingest_api.internal.extract.image.image_helpers.common import unstructured_image_extractor
19
- from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageExtractorSchema
19
+ from nv_ingest_api.internal.schemas.extract.extract_image_schema import ImageConfigSchema
20
20
  from nv_ingest_api.util.exception_handlers.decorators import unified_exception_handler
21
21
 
22
22
  logger = logging.getLogger(__name__)
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
26
26
  def _decode_and_extract_from_image(
27
27
  base64_row: pd.Series,
28
28
  task_config: Dict[str, Any],
29
- validated_extraction_config: ImageExtractorSchema,
29
+ validated_extraction_config: ImageConfigSchema,
30
30
  execution_trace_log: Optional[List[Any]] = None,
31
31
  ) -> Any:
32
32
  """
@@ -106,10 +106,10 @@ def _decode_and_extract_from_image(
106
106
 
107
107
  logger.debug(
108
108
  f"decode_and_extract: Extracting image content using image_extraction_config: "
109
- f"{validated_extraction_config.image_extraction_config}"
109
+ f"{validated_extraction_config}"
110
110
  )
111
- if validated_extraction_config.image_extraction_config is not None:
112
- extract_params["image_extraction_config"] = validated_extraction_config.image_extraction_config
111
+ if validated_extraction_config is not None:
112
+ extract_params["image_extraction_config"] = validated_extraction_config
113
113
 
114
114
  if execution_trace_log is not None:
115
115
  extract_params["trace_info"] = execution_trace_log
@@ -223,7 +223,7 @@ def extract_page_elements_from_images(
223
223
  model_name="yolox",
224
224
  max_batch_size=YOLOX_MAX_BATCH_SIZE,
225
225
  trace_info=trace_info,
226
- stage_name="pdf_content_extractor",
226
+ stage_name="pdf_extraction",
227
227
  )
228
228
 
229
229
  # Process each result along with its corresponding image.
@@ -100,7 +100,7 @@ def _update_infographic_metadata(
100
100
  paddle_results = paddle_client.infer(
101
101
  data=data_paddle,
102
102
  model_name="paddle",
103
- stage_name="infographic_data_extraction",
103
+ stage_name="infographic_extraction",
104
104
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
105
105
  trace_info=trace_info,
106
106
  )
@@ -81,7 +81,7 @@ def _run_inference(
81
81
  yolox_client.infer,
82
82
  data=data_yolox,
83
83
  model_name="yolox",
84
- stage_name="table_data_extraction",
84
+ stage_name="table_extraction",
85
85
  max_batch_size=8,
86
86
  trace_info=trace_info,
87
87
  )
@@ -89,7 +89,7 @@ def _run_inference(
89
89
  paddle_client.infer,
90
90
  data=data_paddle,
91
91
  model_name="paddle",
92
- stage_name="table_data_extraction",
92
+ stage_name="table_extraction",
93
93
  max_batch_size=1 if paddle_client.protocol == "grpc" else 2,
94
94
  trace_info=trace_info,
95
95
  )
@@ -466,7 +466,7 @@ def _extract_text_and_bounding_boxes(
466
466
  inference_results = nemoretriever_parse_client.infer(
467
467
  data=data,
468
468
  model_name="nemoretriever_parse",
469
- stage_name="pdf_content_extractor",
469
+ stage_name="pdf_extraction",
470
470
  max_batch_size=NEMORETRIEVER_PARSE_MAX_BATCH_SIZE,
471
471
  execution_trace_log=execution_trace_log,
472
472
  )
@@ -476,7 +476,7 @@ def _extract_text_and_bounding_boxes(
476
476
 
477
477
  def _create_clients(nemoretriever_parse_config):
478
478
  model_interface = nemoretriever_parse_utils.NemoRetrieverParseModelInterface(
479
- model_name=nemoretriever_parse_config.model_name,
479
+ model_name=nemoretriever_parse_config.nemoretriever_parse_model_name,
480
480
  )
481
481
  nemoretriever_parse_client = create_inference_client(
482
482
  nemoretriever_parse_config.nemoretriever_parse_endpoints,
@@ -105,7 +105,7 @@ def _extract_page_elements_using_image_ensemble(
105
105
  model_name="yolox",
106
106
  max_batch_size=YOLOX_MAX_BATCH_SIZE,
107
107
  trace_info=execution_trace_log,
108
- stage_name="pdf_content_extractor",
108
+ stage_name="pdf_extraction",
109
109
  )
110
110
 
111
111
  # Process results: iterate over each image's inference output.
@@ -17,7 +17,6 @@
17
17
 
18
18
  import logging
19
19
  import io
20
- import operator
21
20
  import re
22
21
  import uuid
23
22
  from collections import defaultdict
@@ -155,6 +154,12 @@ def _finalize_images(
155
154
  extracted_data.append(image_entry)
156
155
 
157
156
 
157
+ def _safe_position(shape):
158
+ top = shape.top if shape.top is not None else float("inf")
159
+ left = shape.left if shape.left is not None else float("inf")
160
+ return (top, left)
161
+
162
+
158
163
  # -----------------------------------------------------------------------------
159
164
  # Helper Function: Recursive Image Extraction
160
165
  # -----------------------------------------------------------------------------
@@ -283,7 +288,7 @@ def python_pptx(
283
288
 
284
289
  for slide_idx, slide in enumerate(presentation.slides):
285
290
  # Obtain a flat list of shapes (ungrouped) sorted by top then left.
286
- shapes = sorted(ungroup_shapes(slide.shapes), key=operator.attrgetter("top", "left"))
291
+ shapes = sorted(ungroup_shapes(slide.shapes), key=_safe_position)
287
292
 
288
293
  page_nearby_blocks = {
289
294
  "text": {"content": [], "bbox": []},
@@ -656,21 +661,43 @@ def get_bbox(
656
661
  shape_object: Optional[Slide] = None,
657
662
  text_depth: Optional[TextTypeEnum] = None,
658
663
  ):
659
- bbox = (-1, -1, -1, -1)
660
- if text_depth == TextTypeEnum.DOCUMENT:
661
- bbox = (-1, -1, -1, -1)
662
- elif text_depth == TextTypeEnum.PAGE:
663
- top = left = 0
664
- width = presentation_object.slide_width
665
- height = presentation_object.slide_height
666
- bbox = (top, left, top + height, left + width)
667
- elif shape_object:
668
- top = shape_object.top
669
- left = shape_object.left
670
- width = shape_object.width
671
- height = shape_object.height
672
- bbox = (top, left, top + height, left + width)
673
- return bbox
664
+ """
665
+ Safely computes bounding box for a slide, shape, or document.
666
+ Ensures that missing or None values are gracefully handled.
667
+
668
+ Returns
669
+ -------
670
+ Tuple[int, int, int, int]
671
+ Bounding box as (top, left, bottom, right).
672
+ Defaults to (-1, -1, -1, -1) if invalid or unsupported.
673
+ """
674
+ try:
675
+ if text_depth == TextTypeEnum.DOCUMENT:
676
+ return (-1, -1, -1, -1)
677
+
678
+ elif text_depth == TextTypeEnum.PAGE and presentation_object:
679
+ top = left = 0
680
+ width = presentation_object.slide_width
681
+ height = presentation_object.slide_height
682
+ return (top, left, top + height, left + width)
683
+
684
+ elif shape_object:
685
+ top = shape_object.top if shape_object.top is not None else -1
686
+ left = shape_object.left if shape_object.left is not None else -1
687
+ width = shape_object.width if shape_object.width is not None else -1
688
+ height = shape_object.height if shape_object.height is not None else -1
689
+
690
+ # If all are valid, return normally, else return placeholder
691
+ if -1 in [top, left, width, height]:
692
+ return (-1, -1, -1, -1)
693
+
694
+ return (top, left, top + height, left + width)
695
+
696
+ except Exception as e:
697
+ logger.warning(f"get_bbox: Failed to compute bbox due to {e}")
698
+ return (-1, -1, -1, -1)
699
+
700
+ return (-1, -1, -1, -1)
674
701
 
675
702
 
676
703
  def ungroup_shapes(shapes):
@@ -184,4 +184,4 @@ def extract_primitives_from_pptx_internal(
184
184
  else:
185
185
  extracted_df = pd.DataFrame({"document_type": [], "metadata": [], "uuid": []})
186
186
 
187
- return extracted_df
187
+ return extracted_df, {}
@@ -8,7 +8,6 @@ from nv_ingest_api.internal.primitives.nim import ModelInterface
8
8
  import numpy as np
9
9
 
10
10
 
11
- # Assume ModelInterface is defined elsewhere in the project.
12
11
  class EmbeddingModelInterface(ModelInterface):
13
12
  """
14
13
  An interface for handling inference with an embedding model endpoint.
@@ -709,7 +709,13 @@ def postprocess_results(
709
709
  raise ValueError(f"Error in postprocessing {result.shape} and {original_image_shape}: {e}")
710
710
 
711
711
  for box, score, label in zip(bboxes, scores, labels):
712
- class_name = class_labels[int(label)]
712
+ # TODO(Devin): Sometimes we get back unexpected class labels?
713
+ if (label < 0) or (label >= len(class_labels)):
714
+ logger.warning(f"Invalid class label {label} found in postprocessing")
715
+ continue
716
+ else:
717
+ class_name = class_labels[int(label)]
718
+
713
719
  annotation_dict[class_name].append([round(float(x), 4) for x in np.concatenate((box, [score]))])
714
720
 
715
721
  out.append(annotation_dict)
@@ -251,7 +251,7 @@ class NimClient:
251
251
  model_name=model_name, parameters=parameters, inputs=[input_tensors], outputs=outputs
252
252
  )
253
253
  logger.debug(f"gRPC inference response: {response}")
254
- # TODO(self.client.has_error(response)) => raise error
254
+
255
255
  if len(outputs) == 1:
256
256
  return response.as_numpy(outputs[0].name())
257
257
  else:
@@ -31,13 +31,15 @@ def traceable(trace_name=None):
31
31
 
32
32
  Notes
33
33
  -----
34
- The decorated function must accept a IngestControlMessage object as its first argument. The
35
- IngestControlMessage object must implement `has_metadata`, `get_metadata`, and `set_metadata`
36
- methods used by the decorator to check for the trace tagging flag and to add trace metadata.
34
+ The decorated function must accept a IngestControlMessage object as one of its arguments.
35
+ For a regular function, this is expected to be the first argument; for a class method,
36
+ this is expected to be the second argument (after 'self'). The IngestControlMessage object
37
+ must implement `has_metadata`, `get_metadata`, and `set_metadata` methods used by the decorator
38
+ to check for the trace tagging flag and to add trace metadata.
37
39
 
38
40
  The trace metadata added by the decorator includes two entries:
39
- - 'trace::entry::<trace_name>': The monotonic timestamp marking the function's entry.
40
- - 'trace::exit::<trace_name>': The monotonic timestamp marking the function's exit.
41
+ - 'trace::entry::<trace_name>': The timestamp marking the function's entry.
42
+ - 'trace::exit::<trace_name>': The timestamp marking the function's exit.
41
43
 
42
44
  Example
43
45
  -------
@@ -47,23 +49,25 @@ def traceable(trace_name=None):
47
49
  ... def process_message(message):
48
50
  ... pass
49
51
 
50
- Applying the decorator with a custom trace name:
51
-
52
- >>> @traceable(custom_trace_name="CustomTraceName")
53
- ... def process_message(message):
54
- ... pass
55
-
56
- In both examples, `process_message` will have entry and exit timestamps added to the
57
- IngestControlMessage's metadata if 'config::add_trace_tagging' is True.
52
+ Applying the decorator with a custom trace name on a class method:
58
53
 
54
+ >>> class Processor:
55
+ ... @traceable(trace_name="CustomTrace")
56
+ ... def process(self, message):
57
+ ... pass
59
58
  """
60
59
 
61
60
  def decorator_trace_tagging(func):
62
61
  @functools.wraps(func)
63
62
  def wrapper_trace_tagging(*args, **kwargs):
64
- # Assuming the first argument is always the message
65
63
  ts_fetched = datetime.now()
66
- message = args[0]
64
+ # Determine which argument is the message.
65
+ if hasattr(args[0], "has_metadata"):
66
+ message = args[0]
67
+ elif len(args) > 1 and hasattr(args[1], "has_metadata"):
68
+ message = args[1]
69
+ else:
70
+ raise ValueError("traceable decorator could not find a message argument with 'has_metadata()'")
67
71
 
68
72
  do_trace_tagging = (message.has_metadata("config::add_trace_tagging") is True) and (
69
73
  message.get_metadata("config::add_trace_tagging") is True
@@ -79,7 +83,7 @@ def traceable(trace_name=None):
79
83
  message.set_timestamp(f"trace::entry::{trace_prefix}_channel_in", ts_send)
80
84
  message.set_timestamp(f"trace::exit::{trace_prefix}_channel_in", ts_fetched)
81
85
 
82
- # Call the decorated function
86
+ # Call the decorated function.
83
87
  result = func(*args, **kwargs)
84
88
 
85
89
  if do_trace_tagging:
@@ -131,7 +131,7 @@ class NemoRetrieverParseConfigSchema(BaseModel):
131
131
  nemoretriever_parse_endpoints: Tuple[Optional[str], Optional[str]] = (None, None)
132
132
  nemoretriever_parse_infer_protocol: str = ""
133
133
 
134
- model_name: str = "nvidia/nemoretriever-parse"
134
+ nemoretriever_parse_model_name: str = "nvidia/nemoretriever-parse"
135
135
 
136
136
  timeout: float = 300.0
137
137
 
@@ -76,7 +76,7 @@ class IngestTaskCaptionSchema(BaseModelNoExt):
76
76
  api_key: Optional[str] = None
77
77
  endpoint_url: Optional[str] = None
78
78
  prompt: Optional[str] = None
79
- model_name: Optional[str] = None
79
+ caption_model_name: Optional[str] = None
80
80
 
81
81
 
82
82
  class IngestTaskFilterParamsSchema(BaseModelNoExt):
@@ -104,7 +104,7 @@ class IngestTaskDedupSchema(BaseModelNoExt):
104
104
 
105
105
  class IngestTaskEmbedSchema(BaseModelNoExt):
106
106
  endpoint_url: Optional[str] = None
107
- model_name: Optional[str] = None
107
+ embedding_model_name: Optional[str] = None
108
108
  api_key: Optional[str] = None
109
109
  filter_errors: bool = False
110
110
 
@@ -10,6 +10,6 @@ class ImageCaptionExtractionSchema(BaseModel):
10
10
  api_key: str = "api_key"
11
11
  endpoint_url: str = "https://ai.api.nvidia.com/v1/gr/meta/llama-3.2-11b-vision-instruct/chat/completions"
12
12
  prompt: str = "Caption the content of this image:"
13
- model_name: str = "meta/llama-3.2-11b-vision-instruct"
13
+ image_caption_model_name: str = "meta/llama-3.2-11b-vision-instruct"
14
14
  raise_on_failure: bool = False
15
15
  model_config = ConfigDict(extra="forbid")
@@ -173,7 +173,7 @@ def transform_image_create_vlm_caption_internal(
173
173
  api_key: str = task_config.get("api_key") or transform_config.api_key
174
174
  prompt: str = task_config.get("prompt") or transform_config.prompt
175
175
  endpoint_url: str = task_config.get("endpoint_url") or transform_config.endpoint_url
176
- model_name: str = task_config.get("model_name") or transform_config.model_name
176
+ model_name: str = task_config.get("image_caption_model_name") or transform_config.image_caption_model_name
177
177
 
178
178
  # Create a mask for rows where the content type is "image".
179
179
  df_mask: pd.Series = df_transform_ledger["metadata"].apply(
@@ -230,28 +230,35 @@ def _async_runner(
230
230
  def _add_embeddings(row, embeddings, info_msgs):
231
231
  """
232
232
  Updates a DataFrame row with embedding data and associated error info.
233
+ Ensures the 'embedding' field is always present, even if None.
233
234
 
234
235
  Parameters
235
236
  ----------
236
237
  row : pandas.Series
237
238
  A row of the DataFrame.
238
- embeddings : list
239
- List of embeddings corresponding to DataFrame rows.
240
- info_msgs : list
241
- List of info message dictionaries corresponding to DataFrame rows.
239
+ embeddings : dict
240
+ Dictionary mapping row indices to embeddings.
241
+ info_msgs : dict
242
+ Dictionary mapping row indices to info message dicts.
242
243
 
243
244
  Returns
244
245
  -------
245
246
  pandas.Series
246
- The updated row with embedding and info message metadata added.
247
+ The updated row with 'embedding', 'info_message_metadata', and
248
+ '_contains_embeddings' appropriately set.
247
249
  """
248
- row["metadata"]["embedding"] = embeddings[row.name]
249
- if info_msgs[row.name] is not None:
250
- row["metadata"]["info_message_metadata"] = info_msgs[row.name]
250
+ embedding = embeddings.get(row.name, None)
251
+ info_msg = info_msgs.get(row.name, None)
252
+
253
+ # Always set embedding, even if None
254
+ row["metadata"]["embedding"] = embedding
255
+
256
+ if info_msg:
257
+ row["metadata"]["info_message_metadata"] = info_msg
251
258
  row["document_type"] = ContentTypeEnum.INFO_MSG
252
259
  row["_contains_embeddings"] = False
253
260
  else:
254
- row["_contains_embeddings"] = True
261
+ row["_contains_embeddings"] = embedding is not None
255
262
 
256
263
  return row
257
264
 
@@ -287,7 +294,7 @@ def _get_pandas_table_content(row):
287
294
  str
288
295
  The table/chart content from the row.
289
296
  """
290
- return row["table_metadata"]["table_content"]
297
+ return row.get("table_metadata", {}).get("table_content")
291
298
 
292
299
 
293
300
  def _get_pandas_image_content(row):
@@ -304,7 +311,14 @@ def _get_pandas_image_content(row):
304
311
  str
305
312
  The image caption from the row.
306
313
  """
307
- return row["image_metadata"]["caption"]
314
+ return row.get("image_metadata", {}).get("caption")
315
+
316
+
317
+ def _get_pandas_audio_content(row):
318
+ """
319
+ A pandas UDF used to select extracted audio transcription to be used to create embeddings.
320
+ """
321
+ return row.get("audio_metadata", {}).get("audio_transcript")
308
322
 
309
323
 
310
324
  # ------------------------------------------------------------------------------
@@ -352,13 +366,6 @@ def _generate_batches(prompts: List[str], batch_size: int = 100) -> List[str]:
352
366
  return [batch for batch in _batch_generator(prompts, batch_size)]
353
367
 
354
368
 
355
- def _get_pandas_audio_content(row):
356
- """
357
- A pandas UDF used to select extracted audio transcription to be used to create embeddings.
358
- """
359
- return row["audio_metadata"]["audio_transcript"]
360
-
361
-
362
369
  # ------------------------------------------------------------------------------
363
370
  # DataFrame Concatenation Utility
364
371
  # ------------------------------------------------------------------------------
@@ -408,17 +415,20 @@ def transform_create_text_embeddings_internal(
408
415
  execution_trace_log: Optional[Dict] = None,
409
416
  ) -> Tuple[pd.DataFrame, Dict]:
410
417
  """
411
- Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE)
418
+ Generates text embeddings for supported content types (TEXT, STRUCTURED, IMAGE, AUDIO)
412
419
  from a pandas DataFrame using asynchronous requests.
413
420
 
421
+ This function ensures that even if the extracted content is empty or None,
422
+ the embedding field is explicitly created and set to None.
423
+
414
424
  Parameters
415
425
  ----------
416
426
  df_transform_ledger : pd.DataFrame
417
427
  The DataFrame containing content for embedding extraction.
418
428
  task_config : Dict[str, Any]
419
429
  Dictionary containing task properties (e.g., filter error flag).
420
- transform_config : Any
421
- Validated configuration for text embedding extraction (EmbedExtractionsSchema).
430
+ transform_config : TextEmbeddingSchema, optional
431
+ Validated configuration for text embedding extraction.
422
432
  execution_trace_log : Optional[Dict], optional
423
433
  Optional trace information for debugging or logging (default is None).
424
434
 
@@ -429,24 +439,20 @@ def transform_create_text_embeddings_internal(
429
439
  - The updated DataFrame with embeddings applied.
430
440
  - A dictionary with trace information.
431
441
  """
432
-
433
- # Retrieve configuration values with fallback to transform_config defaults.
434
- api_key: str = task_config.get("api_key") or transform_config.api_key
435
- endpoint_url: str = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
436
- model_name: str = task_config.get("model_name") or transform_config.embedding_model
442
+ api_key = task_config.get("api_key") or transform_config.api_key
443
+ endpoint_url = task_config.get("endpoint_url") or transform_config.embedding_nim_endpoint
444
+ model_name = task_config.get("model_name") or transform_config.embedding_model
437
445
 
438
446
  if execution_trace_log is None:
439
447
  execution_trace_log = {}
440
448
  logger.debug("No trace_info provided. Initialized empty trace_info dictionary.")
441
449
 
442
- # TODO(Devin)
443
450
  if df_transform_ledger.empty:
444
451
  return df_transform_ledger, {"trace_info": execution_trace_log}
445
452
 
446
453
  embedding_dataframes = []
447
- content_masks = [] # List of pandas boolean Series
454
+ content_masks = []
448
455
 
449
- # Define pandas content extractors for supported content types.
450
456
  pandas_content_extractor = {
451
457
  ContentTypeEnum.TEXT: _get_pandas_text_content,
452
458
  ContentTypeEnum.STRUCTURED: _get_pandas_table_content,
@@ -455,49 +461,62 @@ def transform_create_text_embeddings_internal(
455
461
  ContentTypeEnum.VIDEO: lambda x: None, # Not supported yet.
456
462
  }
457
463
 
458
- logger.debug("Generating text embeddings for supported content types: TEXT, STRUCTURED, IMAGE.")
459
-
460
464
  def _content_type_getter(row):
461
465
  return row["content_metadata"]["type"]
462
466
 
463
- # Process each supported content type.
464
467
  for content_type, content_getter in pandas_content_extractor.items():
465
468
  if not content_getter:
466
469
  logger.debug(f"Skipping unsupported content type: {content_type}")
467
470
  continue
468
471
 
472
+ # Get rows matching the content type
469
473
  content_mask = df_transform_ledger["metadata"].apply(_content_type_getter) == content_type.value
470
474
  if not content_mask.any():
471
475
  continue
472
476
 
473
- # Extract content from metadata and filter out rows with empty content.
474
- extracted_content = df_transform_ledger.loc[content_mask, "metadata"].apply(content_getter)
475
- non_empty_mask = extracted_content.notna() & (extracted_content.str.strip() != "")
476
- final_mask = content_mask & non_empty_mask
477
- if not final_mask.any():
478
- continue
477
+ # Always include all content_mask rows and prepare them
478
+ df_content = df_transform_ledger.loc[content_mask].copy().reset_index(drop=True)
479
479
 
480
- df_content = df_transform_ledger.loc[final_mask].copy().reset_index(drop=True)
481
- filtered_content = df_content["metadata"].apply(content_getter)
482
- filtered_content_batches = _generate_batches(filtered_content.tolist(), batch_size=transform_config.batch_size)
483
- content_embeddings = _async_runner(
484
- filtered_content_batches,
485
- api_key,
486
- endpoint_url,
487
- model_name,
488
- transform_config.encoding_format,
489
- transform_config.input_type,
490
- transform_config.truncate,
491
- False,
480
+ # Extract content and normalize empty or non-str to None
481
+ extracted_content = (
482
+ df_content["metadata"]
483
+ .apply(content_getter)
484
+ .apply(lambda x: x.strip() if isinstance(x, str) and x.strip() else None)
492
485
  )
493
- # Apply the embeddings (and any error info) to each row.
494
- df_content[["metadata", "document_type", "_contains_embeddings"]] = df_content.apply(
495
- _add_embeddings, **content_embeddings, axis=1
496
- )[["metadata", "document_type", "_contains_embeddings"]]
497
- df_content["_content"] = filtered_content
486
+ df_content["_content"] = extracted_content
487
+
488
+ # Prepare batches for only valid (non-None) content
489
+ valid_content_mask = df_content["_content"].notna()
490
+ if valid_content_mask.any():
491
+ filtered_content_batches = _generate_batches(
492
+ df_content.loc[valid_content_mask, "_content"].tolist(), batch_size=transform_config.batch_size
493
+ )
494
+ content_embeddings = _async_runner(
495
+ filtered_content_batches,
496
+ api_key,
497
+ endpoint_url,
498
+ model_name,
499
+ transform_config.encoding_format,
500
+ transform_config.input_type,
501
+ transform_config.truncate,
502
+ False,
503
+ )
504
+ # Build a simple row index -> embedding map
505
+ embeddings_dict = dict(
506
+ zip(df_content.loc[valid_content_mask].index, content_embeddings.get("embeddings", []))
507
+ )
508
+ info_msgs_dict = dict(
509
+ zip(df_content.loc[valid_content_mask].index, content_embeddings.get("info_msgs", []))
510
+ )
511
+ else:
512
+ embeddings_dict = {}
513
+ info_msgs_dict = {}
514
+
515
+ # Apply embeddings or None to all rows
516
+ df_content = df_content.apply(_add_embeddings, embeddings=embeddings_dict, info_msgs=info_msgs_dict, axis=1)
498
517
 
499
518
  embedding_dataframes.append(df_content)
500
- content_masks.append(final_mask)
519
+ content_masks.append(content_mask)
501
520
 
502
521
  combined_df = _concatenate_extractions_pandas(df_transform_ledger, embedding_dataframes, content_masks)
503
522
  return combined_df, {"trace_info": execution_trace_log}